batchalign 0.7.21.post9__tar.gz → 0.7.21.post11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of batchalign might be problematic. Click here for more details.
- {batchalign-0.7.21.post9/batchalign.egg-info → batchalign-0.7.21.post11}/PKG-INFO +1 -1
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/analysis/eval.py +48 -46
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/rev.py +1 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/utils.py +16 -15
- batchalign-0.7.21.post11/batchalign/version +3 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11/batchalign.egg-info}/PKG-INFO +1 -1
- batchalign-0.7.21.post9/batchalign/version +0 -3
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/LICENSE +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/MANIFEST.in +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/README.md +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/__main__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/constants.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/document.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/errors.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/cantonese_infer.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/wave2vec/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/wave2vec/infer_fa.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/oai_whisper.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/avqi/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/avqi/engine.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/diarization/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/diarization/pyannote.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/ud.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/gtrans.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/seamless.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/abbrev.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/compounds.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/names.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/setup.cfg +0 -0
- {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/setup.py +0 -0
|
@@ -18,134 +18,136 @@ import logging
|
|
|
18
18
|
L = logging.getLogger("batchalign")
|
|
19
19
|
|
|
20
20
|
joined_compounds = ["".join(k) for k in compounds]
|
|
21
|
-
lowered_abbrev = [k
|
|
21
|
+
lowered_abbrev = [k for k in abbrev]
|
|
22
22
|
|
|
23
23
|
fillers = ["um", "uhm", "em", "mhm", "uhhm", "eh", "uh", "hm"]
|
|
24
24
|
def conform(x):
|
|
25
25
|
result = []
|
|
26
26
|
for i in x:
|
|
27
|
-
if i.strip() in joined_compounds:
|
|
27
|
+
if i.strip().lower() in joined_compounds:
|
|
28
28
|
for k in compounds[joined_compounds.index(i.strip())]:
|
|
29
29
|
result.append(k)
|
|
30
30
|
elif i.strip() in lowered_abbrev:
|
|
31
31
|
for j in i.strip():
|
|
32
|
-
result.append(j)
|
|
33
|
-
elif "'s" in i.strip():
|
|
32
|
+
result.append(j.strip())
|
|
33
|
+
elif "'s" in i.strip().lower():
|
|
34
34
|
result.append(i.split("'")[0])
|
|
35
35
|
result.append("is")
|
|
36
|
-
elif "'ve" in i.strip():
|
|
36
|
+
elif "'ve" in i.strip().lower():
|
|
37
37
|
result.append(i.split("'")[0])
|
|
38
38
|
result.append("have")
|
|
39
|
-
elif "'s" in i.strip():
|
|
39
|
+
elif "'s" in i.strip().lower():
|
|
40
40
|
result.append(i.split("'")[0])
|
|
41
41
|
result.append("is")
|
|
42
|
-
elif "'d" in i.strip():
|
|
42
|
+
elif "'d" in i.strip().lower():
|
|
43
43
|
result.append(i.split("'")[0])
|
|
44
44
|
result.append("had")
|
|
45
|
-
elif "'m" in i.strip():
|
|
45
|
+
elif "'m" in i.strip().lower():
|
|
46
46
|
result.append(i.split("'")[0])
|
|
47
47
|
result.append("am")
|
|
48
|
-
elif i.strip() in fillers:
|
|
48
|
+
elif i.strip().lower() in fillers:
|
|
49
49
|
result.append("um")
|
|
50
|
-
elif "-" in i.strip():
|
|
51
|
-
result += [k.strip() for k in i.split("-")]
|
|
52
|
-
elif "ok" == i.strip():
|
|
50
|
+
elif "-" in i.strip().lower():
|
|
51
|
+
result += [k.strip() for k in i.split("-").lower()]
|
|
52
|
+
elif "ok" == i.strip().lower():
|
|
53
53
|
result.append("okay")
|
|
54
|
-
elif "gimme" == i.strip():
|
|
54
|
+
elif "gimme" == i.strip().lower():
|
|
55
55
|
result.append("give")
|
|
56
56
|
result.append("me")
|
|
57
|
-
elif "hafta" == i.strip() or "havta" == i.strip():
|
|
57
|
+
elif "hafta" == i.strip().lower() or "havta" == i.strip().lower():
|
|
58
58
|
result.append("have")
|
|
59
59
|
result.append("to")
|
|
60
|
-
elif i.strip() in names:
|
|
60
|
+
elif i.strip().lower() in names:
|
|
61
61
|
result.append("name")
|
|
62
|
-
elif "dunno" == i.strip():
|
|
62
|
+
elif "dunno" == i.strip().lower():
|
|
63
63
|
result.append("don't")
|
|
64
64
|
result.append("know")
|
|
65
|
-
elif "wanna" == i.strip():
|
|
65
|
+
elif "wanna" == i.strip().lower():
|
|
66
66
|
result.append("want")
|
|
67
67
|
result.append("to")
|
|
68
|
-
elif "mba" == i.strip():
|
|
68
|
+
elif "mba" == i.strip().lower():
|
|
69
69
|
result.append("m")
|
|
70
70
|
result.append("b")
|
|
71
71
|
result.append("a")
|
|
72
|
-
elif "tli" == i.strip():
|
|
72
|
+
elif "tli" == i.strip().lower():
|
|
73
73
|
result.append("t")
|
|
74
74
|
result.append("l")
|
|
75
75
|
result.append("i")
|
|
76
|
-
elif "bbc" == i.strip():
|
|
76
|
+
elif "bbc" == i.strip().lower():
|
|
77
77
|
result.append("b")
|
|
78
78
|
result.append("b")
|
|
79
79
|
result.append("c")
|
|
80
|
-
elif "ii" == i.strip():
|
|
80
|
+
elif "ii" == i.strip().lower():
|
|
81
81
|
result.append("i")
|
|
82
82
|
result.append("i")
|
|
83
|
-
elif "i'd" == i.strip():
|
|
83
|
+
elif "i'd" == i.strip().lower():
|
|
84
84
|
result.append("i")
|
|
85
85
|
result.append("had")
|
|
86
|
-
elif "alright" == i.strip():
|
|
86
|
+
elif "alright" == i.strip().lower():
|
|
87
87
|
result.append("all")
|
|
88
88
|
result.append("right")
|
|
89
|
-
elif "sorta" == i.strip():
|
|
89
|
+
elif "sorta" == i.strip().lower():
|
|
90
90
|
result.append("sort")
|
|
91
91
|
result.append("of")
|
|
92
|
-
elif "alrightie" == i.strip():
|
|
92
|
+
elif "alrightie" == i.strip().lower():
|
|
93
93
|
result.append("all")
|
|
94
94
|
result.append("right")
|
|
95
|
-
elif "mm" == i.strip():
|
|
95
|
+
elif "mm" == i.strip().lower():
|
|
96
96
|
result.append("hm")
|
|
97
|
-
elif "ai" == i.strip():
|
|
97
|
+
elif "ai" == i.strip().lower():
|
|
98
98
|
result.append("a")
|
|
99
99
|
result.append("i")
|
|
100
|
-
elif "this'll" == i.strip():
|
|
100
|
+
elif "this'll" == i.strip().lower():
|
|
101
101
|
result.append("this")
|
|
102
102
|
result.append("will")
|
|
103
|
-
elif "gotta" == i.strip():
|
|
103
|
+
elif "gotta" == i.strip().lower():
|
|
104
104
|
result.append("got")
|
|
105
105
|
result.append("to")
|
|
106
|
-
elif "hadta" == i.strip():
|
|
106
|
+
elif "hadta" == i.strip().lower():
|
|
107
107
|
result.append("had")
|
|
108
108
|
result.append("to")
|
|
109
|
-
elif "eh" == i.strip():
|
|
109
|
+
elif "eh" == i.strip().lower():
|
|
110
110
|
result.append("uh")
|
|
111
|
-
elif "kinda" == i.strip():
|
|
111
|
+
elif "kinda" == i.strip().lower():
|
|
112
112
|
result.append("kind")
|
|
113
113
|
result.append("of")
|
|
114
|
-
elif "ed" == i.strip():
|
|
114
|
+
elif "ed" == i.strip().lower():
|
|
115
115
|
result.append("education")
|
|
116
|
-
elif "til" == i.strip():
|
|
116
|
+
elif "til" == i.strip().lower():
|
|
117
117
|
result.append("until")
|
|
118
|
-
elif "gonna" == i.strip():
|
|
118
|
+
elif "gonna" == i.strip().lower():
|
|
119
119
|
result.append("going")
|
|
120
120
|
result.append("to")
|
|
121
|
-
elif "shoulda" == i.strip():
|
|
121
|
+
elif "shoulda" == i.strip().lower():
|
|
122
122
|
result.append("should")
|
|
123
123
|
result.append("have")
|
|
124
|
-
elif "sposta" == i.strip():
|
|
124
|
+
elif "sposta" == i.strip().lower():
|
|
125
125
|
result.append("supposed")
|
|
126
126
|
result.append("to")
|
|
127
|
-
elif "farmhouse" == i.strip():
|
|
127
|
+
elif "farmhouse" == i.strip().lower():
|
|
128
128
|
result.append("farm")
|
|
129
129
|
result.append("house")
|
|
130
|
-
elif "aa" == i.strip():
|
|
130
|
+
elif "aa" == i.strip().lower():
|
|
131
131
|
result.append("a")
|
|
132
132
|
result.append("a")
|
|
133
|
-
elif "aa" == i.strip():
|
|
133
|
+
elif "aa" == i.strip().lower():
|
|
134
134
|
result.append("a")
|
|
135
135
|
result.append("a")
|
|
136
|
-
elif "em" == i.strip():
|
|
136
|
+
elif "em" == i.strip().lower():
|
|
137
137
|
result.append("them")
|
|
138
|
-
elif "hmm" == i.strip():
|
|
138
|
+
elif "hmm" == i.strip().lower():
|
|
139
139
|
result.append("hm")
|
|
140
|
-
elif "_" in i.strip():
|
|
140
|
+
elif "_" in i.strip().lower():
|
|
141
141
|
for j in i.strip().split("_"):
|
|
142
142
|
result.append(j)
|
|
143
143
|
else:
|
|
144
|
-
result.append(i)
|
|
144
|
+
result.append(i.lower())
|
|
145
145
|
|
|
146
146
|
return result
|
|
147
147
|
|
|
148
148
|
def match_fn(x,y):
|
|
149
|
+
x = x.lower()
|
|
150
|
+
y = y.lower()
|
|
149
151
|
return (y == x or
|
|
150
152
|
y.replace("(", "").replace(")", "") == x.replace("(", "").replace(")", "") or
|
|
151
153
|
re.sub(r"\((.*)\)",r"", y) == x or re.sub(r"\((.*)\)",r"", x) == y)
|
|
@@ -156,8 +158,8 @@ class EvaluationEngine(BatchalignEngine):
|
|
|
156
158
|
@staticmethod
|
|
157
159
|
def __compute_wer(doc, gold):
|
|
158
160
|
# get the text of the document and get the text of the gold
|
|
159
|
-
forms = [ j.text
|
|
160
|
-
gold_forms = [ j.text
|
|
161
|
+
forms = [ j.text for i in doc.content for j in i.content if isinstance(i, Utterance)]
|
|
162
|
+
gold_forms = [ j.text for i in gold.content for j in i.content if isinstance(i, Utterance)]
|
|
161
163
|
|
|
162
164
|
forms = [i.replace("-", "") for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
|
|
163
165
|
gold_forms = [i.replace("-", "") for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
|
|
@@ -86,6 +86,7 @@ class RevEngine(BatchalignEngine):
|
|
|
86
86
|
while status == JobStatus.IN_PROGRESS:
|
|
87
87
|
time.sleep(15)
|
|
88
88
|
status = client.get_job_details(job.id).status
|
|
89
|
+
L.debug(f"Rev.AI got '{status}'...")
|
|
89
90
|
|
|
90
91
|
# if we failed, report failure and give up
|
|
91
92
|
if status == JobStatus.FAILED:
|
|
@@ -60,10 +60,11 @@ def retokenize(intermediate_output):
|
|
|
60
60
|
word = word.replace("。", ".")
|
|
61
61
|
word = word.replace("¿", " ").replace("¡", " ")
|
|
62
62
|
tmp.append((word, bullet))
|
|
63
|
-
if len(word) > 0 and (word in ENDING_PUNCT
|
|
64
|
-
|
|
63
|
+
if len(word) > 0 and (word in ENDING_PUNCT+["؟", "۔", "،", "؛"]
|
|
64
|
+
or word[-1] in ENDING_PUNCT+["؟", "۔", "،", "؛"]):
|
|
65
|
+
if word in ENDING_PUNCT+["؟", "۔", "،", "؛"]:
|
|
65
66
|
final_outputs.append((speaker, tmp))
|
|
66
|
-
elif word[-1] in ENDING_PUNCT:
|
|
67
|
+
elif word[-1] in ENDING_PUNCT+["؟", "۔", "،", "؛"]:
|
|
67
68
|
# we want to seperate the ending punct out
|
|
68
69
|
final, time = tmp.pop(-1)
|
|
69
70
|
tmp.append((final[:-1], time))
|
|
@@ -102,7 +103,7 @@ def retokenize_with_engine(intermediate_output, engine):
|
|
|
102
103
|
# because we are using an utterance engine, we need
|
|
103
104
|
# to get rid of all the preexisting punctuation
|
|
104
105
|
for i in utterance:
|
|
105
|
-
for j in MOR_PUNCT+ENDING_PUNCT:
|
|
106
|
+
for j in MOR_PUNCT+ENDING_PUNCT+["؟", "۔", "،", "؛"]:
|
|
106
107
|
i[0] = i[0].strip(j).lower()
|
|
107
108
|
|
|
108
109
|
# remove everything that's now blank
|
|
@@ -118,7 +119,7 @@ def retokenize_with_engine(intermediate_output, engine):
|
|
|
118
119
|
# align the utterance against original splits and generate final outputs
|
|
119
120
|
for i in split:
|
|
120
121
|
# Check if the split has ending punctuation
|
|
121
|
-
if i[-1] in ENDING_PUNCT:
|
|
122
|
+
if i[-1] in ENDING_PUNCT+["؟", "۔", "،", "؛"]:
|
|
122
123
|
new_ut, delim = (i[:-1].split(" "), i[-1])
|
|
123
124
|
else:
|
|
124
125
|
new_ut, delim = (i.split(" "), ".")
|
|
@@ -264,16 +265,8 @@ def process_generation(output, lang="eng", utterance_engine=None):
|
|
|
264
265
|
seen_word = False
|
|
265
266
|
if word.strip() == "":
|
|
266
267
|
continue
|
|
267
|
-
if word not in ENDING_PUNCT+MOR_PUNCT:
|
|
268
|
+
if word not in ENDING_PUNCT+MOR_PUNCT+["؟", "۔", "،", "؛"]:
|
|
268
269
|
word_replaced = word
|
|
269
|
-
if word_replaced.strip() == "؟":
|
|
270
|
-
word_replaced = "?"
|
|
271
|
-
elif word_replaced.strip() == "۔":
|
|
272
|
-
word_replaced = "."
|
|
273
|
-
elif word_replaced.strip() == "،":
|
|
274
|
-
word_replaced = ","
|
|
275
|
-
elif word_replaced.strip() == "؛":
|
|
276
|
-
word_replaced = ";"
|
|
277
270
|
|
|
278
271
|
if start == None or end == None:
|
|
279
272
|
words.append(Form(text=word_replaced, time=None))
|
|
@@ -281,7 +274,15 @@ def process_generation(output, lang="eng", utterance_engine=None):
|
|
|
281
274
|
seen_word = True
|
|
282
275
|
words.append(Form(text=word_replaced, time=(int(start), int(end))))
|
|
283
276
|
else:
|
|
284
|
-
|
|
277
|
+
if word.strip() == "؟":
|
|
278
|
+
word = "?"
|
|
279
|
+
elif word.strip() == "۔":
|
|
280
|
+
word = "."
|
|
281
|
+
elif word.strip() == "،":
|
|
282
|
+
word = ","
|
|
283
|
+
elif word.strip() == "؛":
|
|
284
|
+
word = ";"
|
|
285
|
+
words.append(Form(text=word, time=None))
|
|
285
286
|
|
|
286
287
|
final_utterances.append(Utterance(
|
|
287
288
|
tier=participant,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/generator.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/__init__.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/cantonese_infer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/analysis/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/num2chinese.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/oai_whisper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/__init__.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/cleanup.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/disfluencies.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/parse_support.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/retrace.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/support/test.test
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/diarization/__init__.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/diarization/pyannote.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/__init__.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/coref.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/en/irr.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/apm.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/apmn.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/case.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/ud.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/speaker/__init__.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/speaker/nemo_speaker.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/__init__.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/gtrans.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/seamless.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/whisper_utr.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utterance/__init__.py
RENAMED
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utterance/ud_utterance.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_file.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/test_pipeline.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|