batchalign 0.8.2__tar.gz → 0.8.2.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.8.2/batchalign.egg-info → batchalign-0.8.2.post2}/PKG-INFO +1 -1
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/analysis/compare.py +179 -63
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/version +1 -1
- {batchalign-0.8.2 → batchalign-0.8.2.post2/batchalign.egg-info}/PKG-INFO +1 -1
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/LICENSE +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/MANIFEST.in +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/README.md +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/__main__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/bench.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/cache.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/cli.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/constants.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/document.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/errors.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/base.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/audio_io.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/resolve.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/training/run.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/cantonese_infer.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/wave2vec/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/wave2vec/infer_fa.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/por.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/oai_whisper.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/avqi/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/avqi/engine.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cache.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/diarization/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/diarization/pyannote.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/ud.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/opensmile/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/opensmile/engine.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/gtrans.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/seamless.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/cli/test_dispatch_memory.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/models/test_audio_io.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/models/test_audio_lazy.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cache/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cache/test_cache.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fa/test_fa_short_segments.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/abbrev.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/compounds.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/config.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/device.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/dp.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/names.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/utils.py +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/setup.cfg +0 -0
- {batchalign-0.8.2 → batchalign-0.8.2.post2}/setup.py +0 -0
|
@@ -12,6 +12,7 @@ computes error-rate metrics for CSV output.
|
|
|
12
12
|
|
|
13
13
|
import re
|
|
14
14
|
import logging
|
|
15
|
+
from collections import Counter
|
|
15
16
|
from batchalign.document import *
|
|
16
17
|
from batchalign.pipelines.base import *
|
|
17
18
|
from batchalign.utils.dp import align, ExtraType, Extra, Match
|
|
@@ -151,6 +152,65 @@ def match_fn(x, y):
|
|
|
151
152
|
# --- End of eval.py duplicates ---
|
|
152
153
|
|
|
153
154
|
|
|
155
|
+
def _find_best_segment(gold_tokens, main_tokens, mfn):
|
|
156
|
+
"""Find a rough window using bag-of-words overlap.
|
|
157
|
+
|
|
158
|
+
The rough pass is order-invariant: it scores contiguous windows by token
|
|
159
|
+
multiset overlap with the gold utterance, ignoring order. To keep common
|
|
160
|
+
words from swallowing later transcript material, it only considers windows
|
|
161
|
+
near the gold utterance length. Among equally good windows it prefers the
|
|
162
|
+
latest one, not the earliest. The caller then runs the full Levenshtein
|
|
163
|
+
aligner inside that window to produce token annotations.
|
|
164
|
+
"""
|
|
165
|
+
if not gold_tokens or not main_tokens:
|
|
166
|
+
return 0, 0
|
|
167
|
+
|
|
168
|
+
gold_counts = Counter(gold_tokens)
|
|
169
|
+
gold_len = len(gold_tokens)
|
|
170
|
+
main_len = len(main_tokens)
|
|
171
|
+
|
|
172
|
+
min_window = max(1, gold_len - 2)
|
|
173
|
+
max_window = min(main_len, gold_len + 2)
|
|
174
|
+
|
|
175
|
+
best = (0, min(main_len, gold_len))
|
|
176
|
+
best_score = -1.0
|
|
177
|
+
best_len_delta = None
|
|
178
|
+
|
|
179
|
+
for span in range(min_window, max_window + 1):
|
|
180
|
+
window_counts = Counter(main_tokens[:span])
|
|
181
|
+
overlap = sum(min(window_counts[token], gold_counts[token]) for token in window_counts)
|
|
182
|
+
|
|
183
|
+
for start in range(0, main_len - span + 1):
|
|
184
|
+
if start > 0:
|
|
185
|
+
left = main_tokens[start - 1]
|
|
186
|
+
right = main_tokens[start + span - 1]
|
|
187
|
+
|
|
188
|
+
overlap -= min(window_counts[left], gold_counts[left])
|
|
189
|
+
window_counts[left] -= 1
|
|
190
|
+
overlap += min(window_counts[left], gold_counts[left])
|
|
191
|
+
|
|
192
|
+
overlap -= min(window_counts[right], gold_counts[right])
|
|
193
|
+
window_counts[right] += 1
|
|
194
|
+
overlap += min(window_counts[right], gold_counts[right])
|
|
195
|
+
|
|
196
|
+
score = overlap / gold_len
|
|
197
|
+
len_delta = abs(span - gold_len)
|
|
198
|
+
end = start + span
|
|
199
|
+
|
|
200
|
+
if score > best_score:
|
|
201
|
+
best = (start, end)
|
|
202
|
+
best_score = score
|
|
203
|
+
best_len_delta = len_delta
|
|
204
|
+
elif score == best_score:
|
|
205
|
+
if best_len_delta is None or len_delta < best_len_delta:
|
|
206
|
+
best = (start, end)
|
|
207
|
+
best_len_delta = len_delta
|
|
208
|
+
elif len_delta == best_len_delta and end > best[1]:
|
|
209
|
+
best = (start, end)
|
|
210
|
+
|
|
211
|
+
return best
|
|
212
|
+
|
|
213
|
+
|
|
154
214
|
def _get_pos(form):
|
|
155
215
|
"""Extract uppercased POS from a Form's morphology, or '?' if absent."""
|
|
156
216
|
if form is not None and form.morphology:
|
|
@@ -202,13 +262,10 @@ class CompareEngine(BatchalignEngine):
|
|
|
202
262
|
]
|
|
203
263
|
main_info = [] # (utt_idx, form_idx, Form)
|
|
204
264
|
main_words = []
|
|
205
|
-
main_punct = {} # utt_idx -> list of (form_idx, Form)
|
|
206
265
|
|
|
207
266
|
for utt_idx, utt in enumerate(main_utterances):
|
|
208
|
-
main_punct[utt_idx] = []
|
|
209
267
|
for form_idx, form in enumerate(utt.content):
|
|
210
268
|
if form.text.strip() in MOR_PUNCT + ENDING_PUNCT:
|
|
211
|
-
main_punct[utt_idx].append((form_idx, form))
|
|
212
269
|
continue
|
|
213
270
|
if form.text.strip().lower() in fillers:
|
|
214
271
|
continue
|
|
@@ -221,10 +278,13 @@ class CompareEngine(BatchalignEngine):
|
|
|
221
278
|
]
|
|
222
279
|
gold_info = [] # (utt_idx, form_idx, Form)
|
|
223
280
|
gold_words = []
|
|
281
|
+
gold_punct = {} # utt_idx -> list of (form_idx, Form)
|
|
224
282
|
|
|
225
283
|
for utt_idx, utt in enumerate(gold_utterances):
|
|
284
|
+
gold_punct[utt_idx] = []
|
|
226
285
|
for form_idx, form in enumerate(utt.content):
|
|
227
286
|
if form.text.strip() in MOR_PUNCT + ENDING_PUNCT:
|
|
287
|
+
gold_punct[utt_idx].append((form_idx, form))
|
|
228
288
|
continue
|
|
229
289
|
if form.text.strip().lower() in fillers:
|
|
230
290
|
continue
|
|
@@ -235,102 +295,149 @@ class CompareEngine(BatchalignEngine):
|
|
|
235
295
|
conformed_main, main_map = conform_with_mapping(main_words, conform)
|
|
236
296
|
conformed_gold, gold_map = conform_with_mapping(gold_words, conform)
|
|
237
297
|
|
|
238
|
-
# --- 4.
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
status="match"
|
|
262
|
-
)))
|
|
263
|
-
main_cursor += 1
|
|
264
|
-
gold_cursor += 1
|
|
265
|
-
|
|
266
|
-
elif isinstance(item, Extra):
|
|
267
|
-
if item.extra_type == ExtraType.PAYLOAD:
|
|
268
|
-
# Word in main but not in gold -> extra_main (+)
|
|
269
|
-
orig_main_idx = main_map[main_cursor]
|
|
270
|
-
main_utt_idx = main_info[orig_main_idx][0]
|
|
271
|
-
main_form_idx = main_info[orig_main_idx][1]
|
|
272
|
-
main_form = main_info[orig_main_idx][2]
|
|
273
|
-
current_main_utt = main_utt_idx
|
|
274
|
-
last_main_form_idx = main_form_idx
|
|
298
|
+
# --- 4. Partition conformed gold tokens by utterance ---
|
|
299
|
+
gold_utt_tokens = {i: [] for i in range(len(gold_utterances))}
|
|
300
|
+
gold_utt_maps = {i: [] for i in range(len(gold_utterances))}
|
|
301
|
+
for j in range(len(conformed_gold)):
|
|
302
|
+
orig_idx = gold_map[j]
|
|
303
|
+
utt_idx = gold_info[orig_idx][0]
|
|
304
|
+
gold_utt_tokens[utt_idx].append(conformed_gold[j])
|
|
305
|
+
gold_utt_maps[utt_idx].append(orig_idx)
|
|
306
|
+
|
|
307
|
+
# --- 5. Per-utterance alignment ---
|
|
308
|
+
# For each gold utterance, find a rough last-possible bag-of-words
|
|
309
|
+
# window in the remaining main tokens, then run Levenshtein inside
|
|
310
|
+
# that window to produce the annotations.
|
|
311
|
+
utt_positioned = {i: [] for i in range(len(gold_utterances))}
|
|
312
|
+
search_start = 0
|
|
313
|
+
|
|
314
|
+
for utt_idx in range(len(gold_utterances)):
|
|
315
|
+
g_tokens = gold_utt_tokens[utt_idx]
|
|
316
|
+
g_maps = gold_utt_maps[utt_idx]
|
|
317
|
+
G = len(g_tokens)
|
|
318
|
+
|
|
319
|
+
if G == 0:
|
|
320
|
+
continue
|
|
275
321
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
322
|
+
remaining_main = conformed_main[search_start:]
|
|
323
|
+
win_start, win_end = _find_best_segment(g_tokens, remaining_main, match_fn)
|
|
324
|
+
|
|
325
|
+
abs_start = search_start + win_start
|
|
326
|
+
abs_end = search_start + win_end
|
|
327
|
+
|
|
328
|
+
# Align the chosen window against this gold utterance
|
|
329
|
+
window_main = conformed_main[abs_start:abs_end]
|
|
330
|
+
utt_alignment = align(window_main, g_tokens, False, match_fn)
|
|
282
331
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
332
|
+
local_main_cursor = 0
|
|
333
|
+
local_gold_cursor = 0
|
|
334
|
+
last_gold_form_idx = -1
|
|
335
|
+
|
|
336
|
+
for item in utt_alignment:
|
|
337
|
+
if isinstance(item, Match):
|
|
338
|
+
global_main_idx = abs_start + local_main_cursor
|
|
339
|
+
orig_main_idx = main_map[global_main_idx]
|
|
340
|
+
main_form = main_info[orig_main_idx][2]
|
|
341
|
+
orig_gold_idx = g_maps[local_gold_cursor]
|
|
342
|
+
gold_form_idx = gold_info[orig_gold_idx][1]
|
|
286
343
|
gold_form = gold_info[orig_gold_idx][2]
|
|
344
|
+
last_gold_form_idx = gold_form_idx
|
|
287
345
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
346
|
+
if main_form.time is not None:
|
|
347
|
+
gold_form.time = main_form.time
|
|
348
|
+
if main_form.morphology is not None:
|
|
349
|
+
gold_form.morphology = main_form.morphology
|
|
350
|
+
if main_form.dependency is not None:
|
|
351
|
+
gold_form.dependency = main_form.dependency
|
|
352
|
+
|
|
353
|
+
utt_positioned[utt_idx].append((gold_form_idx, CompareToken(
|
|
291
354
|
text=item.key,
|
|
292
|
-
pos=_get_pos(
|
|
293
|
-
status="
|
|
355
|
+
pos=_get_pos(main_form),
|
|
356
|
+
status="match"
|
|
294
357
|
)))
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
358
|
+
local_main_cursor += 1
|
|
359
|
+
local_gold_cursor += 1
|
|
360
|
+
|
|
361
|
+
elif isinstance(item, Extra):
|
|
362
|
+
if item.extra_type == ExtraType.REFERENCE:
|
|
363
|
+
orig_gold_idx = g_maps[local_gold_cursor]
|
|
364
|
+
gold_form_idx = gold_info[orig_gold_idx][1]
|
|
365
|
+
gold_form = gold_info[orig_gold_idx][2]
|
|
366
|
+
last_gold_form_idx = gold_form_idx
|
|
367
|
+
|
|
368
|
+
utt_positioned[utt_idx].append((gold_form_idx, CompareToken(
|
|
369
|
+
text=item.key,
|
|
370
|
+
pos=_get_pos(gold_form),
|
|
371
|
+
status="extra_gold"
|
|
372
|
+
)))
|
|
373
|
+
local_gold_cursor += 1
|
|
374
|
+
|
|
375
|
+
else:
|
|
376
|
+
global_main_idx = abs_start + local_main_cursor
|
|
377
|
+
orig_main_idx = main_map[global_main_idx]
|
|
378
|
+
main_form = main_info[orig_main_idx][2]
|
|
379
|
+
|
|
380
|
+
pos = last_gold_form_idx + 0.5
|
|
381
|
+
utt_positioned[utt_idx].append((pos, CompareToken(
|
|
382
|
+
text=item.key,
|
|
383
|
+
pos=_get_pos(main_form),
|
|
384
|
+
status="extra_main"
|
|
385
|
+
)))
|
|
386
|
+
local_main_cursor += 1
|
|
387
|
+
|
|
388
|
+
search_start = abs_end
|
|
389
|
+
|
|
390
|
+
# --- 6. Merge punctuation from gold at original positions ---
|
|
391
|
+
for utt_idx in range(len(gold_utterances)):
|
|
392
|
+
for form_idx, form in gold_punct[utt_idx]:
|
|
300
393
|
utt_positioned[utt_idx].append((form_idx, CompareToken(
|
|
301
394
|
text=form.text,
|
|
302
395
|
pos="PUNCT",
|
|
303
396
|
status="match"
|
|
304
397
|
)))
|
|
305
|
-
# Stable sort by position preserves order within same form_idx
|
|
306
398
|
utt_positioned[utt_idx].sort(key=lambda x: x[0])
|
|
307
399
|
|
|
308
|
-
# --- 7. Set comparison on each utterance ---
|
|
309
|
-
for utt_idx, utt in enumerate(
|
|
400
|
+
# --- 7. Set comparison on each gold utterance ---
|
|
401
|
+
for utt_idx, utt in enumerate(gold_utterances):
|
|
310
402
|
tokens = [tok for _, tok in utt_positioned[utt_idx]]
|
|
311
403
|
utt.comparison = tokens if tokens else None
|
|
312
404
|
|
|
313
|
-
|
|
405
|
+
timed_forms = [form for form in utt.content if form.time is not None]
|
|
406
|
+
if timed_forms:
|
|
407
|
+
utt.time = (timed_forms[0].time[0], timed_forms[-1].time[1])
|
|
408
|
+
utt.text = None
|
|
409
|
+
|
|
410
|
+
return gold
|
|
314
411
|
|
|
315
412
|
|
|
316
413
|
class CompareAnalysisEngine(BatchalignEngine):
|
|
317
414
|
tasks = [Task.COMPARE_ANALYSIS]
|
|
318
415
|
|
|
319
416
|
def analyze(self, doc, **kwargs):
|
|
417
|
+
from collections import defaultdict
|
|
418
|
+
|
|
320
419
|
matches = 0
|
|
321
420
|
extra_main = 0
|
|
322
421
|
extra_gold = 0
|
|
323
422
|
|
|
423
|
+
# Per-POS counters: pos -> {matches, insertions, deletions}
|
|
424
|
+
pos_counts = defaultdict(lambda: {"matches": 0, "insertions": 0, "deletions": 0})
|
|
425
|
+
|
|
324
426
|
for utt in doc.content:
|
|
325
427
|
if not isinstance(utt, Utterance) or utt.comparison is None:
|
|
326
428
|
continue
|
|
327
429
|
for tok in utt.comparison:
|
|
430
|
+
if tok.pos == "PUNCT":
|
|
431
|
+
continue
|
|
328
432
|
if tok.status == "match":
|
|
329
433
|
matches += 1
|
|
434
|
+
pos_counts[tok.pos]["matches"] += 1
|
|
330
435
|
elif tok.status == "extra_main":
|
|
331
436
|
extra_main += 1
|
|
437
|
+
pos_counts[tok.pos]["insertions"] += 1
|
|
332
438
|
elif tok.status == "extra_gold":
|
|
333
439
|
extra_gold += 1
|
|
440
|
+
pos_counts[tok.pos]["deletions"] += 1
|
|
334
441
|
|
|
335
442
|
total_gold = matches + extra_gold
|
|
336
443
|
total_main = matches + extra_main
|
|
@@ -347,6 +454,15 @@ class CompareAnalysisEngine(BatchalignEngine):
|
|
|
347
454
|
"total_main_words": total_main,
|
|
348
455
|
}
|
|
349
456
|
|
|
457
|
+
# Add per-POS breakdown
|
|
458
|
+
for pos in sorted(pos_counts.keys()):
|
|
459
|
+
counts = pos_counts[pos]
|
|
460
|
+
total = counts["matches"] + counts["deletions"]
|
|
461
|
+
metrics[f"{pos}:matches"] = counts["matches"]
|
|
462
|
+
metrics[f"{pos}:insertions"] = counts["insertions"]
|
|
463
|
+
metrics[f"{pos}:deletions"] = counts["deletions"]
|
|
464
|
+
metrics[f"{pos}:total"] = total
|
|
465
|
+
|
|
350
466
|
return {
|
|
351
467
|
"doc": doc,
|
|
352
468
|
"metrics": metrics,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/ja/verbforms.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_lexer.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_utils.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/analysis/test_eval.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/asr/test_asr_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fa/test_fa_short_segments.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|