batchalign 0.7.21.post9__tar.gz → 0.7.21.post11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of batchalign might be problematic. Click here for more details.

Files changed (129)
  1. {batchalign-0.7.21.post9/batchalign.egg-info → batchalign-0.7.21.post11}/PKG-INFO +1 -1
  2. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/analysis/eval.py +48 -46
  3. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/rev.py +1 -0
  4. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/utils.py +16 -15
  5. batchalign-0.7.21.post11/batchalign/version +3 -0
  6. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11/batchalign.egg-info}/PKG-INFO +1 -1
  7. batchalign-0.7.21.post9/batchalign/version +0 -3
  8. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/LICENSE +0 -0
  9. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/MANIFEST.in +0 -0
  10. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/README.md +0 -0
  11. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/__init__.py +0 -0
  12. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/__main__.py +0 -0
  13. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/cli/__init__.py +0 -0
  14. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/cli/cli.py +0 -0
  15. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/cli/dispatch.py +0 -0
  16. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/constants.py +0 -0
  17. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/document.py +0 -0
  18. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/errors.py +0 -0
  19. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/__init__.py +0 -0
  20. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/base.py +0 -0
  21. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/__init__.py +0 -0
  22. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/file.py +0 -0
  23. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/generator.py +0 -0
  24. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/lexer.py +0 -0
  25. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/parser.py +0 -0
  26. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/chat/utils.py +0 -0
  27. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/__init__.py +0 -0
  28. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/file.py +0 -0
  29. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/generator.py +0 -0
  30. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/formats/textgrid/parser.py +0 -0
  31. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/__init__.py +0 -0
  32. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/resolve.py +0 -0
  33. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/__init__.py +0 -0
  34. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/config.yaml +0 -0
  35. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/infer.py +0 -0
  36. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/speaker/utils.py +0 -0
  37. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/training/__init__.py +0 -0
  38. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/training/run.py +0 -0
  39. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/training/utils.py +0 -0
  40. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utils.py +0 -0
  41. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/__init__.py +0 -0
  42. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/cantonese_infer.py +0 -0
  43. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/dataset.py +0 -0
  44. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/execute.py +0 -0
  45. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/infer.py +0 -0
  46. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/prep.py +0 -0
  47. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/utterance/train.py +0 -0
  48. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/wave2vec/__init__.py +0 -0
  49. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/wave2vec/infer_fa.py +0 -0
  50. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/whisper/__init__.py +0 -0
  51. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/whisper/infer_asr.py +0 -0
  52. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/models/whisper/infer_fa.py +0 -0
  53. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/__init__.py +0 -0
  54. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/analysis/__init__.py +0 -0
  55. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/__init__.py +0 -0
  56. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/num2chinese.py +0 -0
  57. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  58. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/whisper.py +0 -0
  59. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/asr/whisperx.py +0 -0
  60. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/avqi/__init__.py +0 -0
  61. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/avqi/engine.py +0 -0
  62. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/base.py +0 -0
  63. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/__init__.py +0 -0
  64. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  65. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  66. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  67. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/retrace.py +0 -0
  68. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  69. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  70. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/cleanup/support/test.test +0 -0
  71. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/diarization/__init__.py +0 -0
  72. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/diarization/pyannote.py +0 -0
  73. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/dispatch.py +0 -0
  74. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/fa/__init__.py +0 -0
  75. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  76. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  77. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  78. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  79. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  80. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  81. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  82. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  83. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  84. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  85. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/pipeline.py +0 -0
  86. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/speaker/__init__.py +0 -0
  87. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  88. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/__init__.py +0 -0
  89. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/gtrans.py +0 -0
  90. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/seamless.py +0 -0
  91. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/translate/utils.py +0 -0
  92. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/__init__.py +0 -0
  93. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/rev_utr.py +0 -0
  94. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/utils.py +0 -0
  95. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  96. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utterance/__init__.py +0 -0
  97. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  98. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/__init__.py +0 -0
  99. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/conftest.py +0 -0
  100. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  101. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  102. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  103. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  104. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  105. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  106. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  107. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  108. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  109. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  110. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  111. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  112. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/fixures.py +0 -0
  113. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  114. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  115. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/tests/test_document.py +0 -0
  116. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/__init__.py +0 -0
  117. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/abbrev.py +0 -0
  118. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/compounds.py +0 -0
  119. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/config.py +0 -0
  120. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/dp.py +0 -0
  121. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/names.py +0 -0
  122. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign/utils/utils.py +0 -0
  123. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/SOURCES.txt +0 -0
  124. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/dependency_links.txt +0 -0
  125. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/entry_points.txt +0 -0
  126. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/requires.txt +0 -0
  127. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/batchalign.egg-info/top_level.txt +0 -0
  128. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/setup.cfg +0 -0
  129. {batchalign-0.7.21.post9 → batchalign-0.7.21.post11}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batchalign
3
- Version: 0.7.21.post9
3
+ Version: 0.7.21.post11
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -18,134 +18,136 @@ import logging
18
18
  L = logging.getLogger("batchalign")
19
19
 
20
20
  joined_compounds = ["".join(k) for k in compounds]
21
- lowered_abbrev = [k.lower() for k in abbrev]
21
+ lowered_abbrev = [k for k in abbrev]
22
22
 
23
23
  fillers = ["um", "uhm", "em", "mhm", "uhhm", "eh", "uh", "hm"]
24
24
  def conform(x):
25
25
  result = []
26
26
  for i in x:
27
- if i.strip() in joined_compounds:
27
+ if i.strip().lower() in joined_compounds:
28
28
  for k in compounds[joined_compounds.index(i.strip())]:
29
29
  result.append(k)
30
30
  elif i.strip() in lowered_abbrev:
31
31
  for j in i.strip():
32
- result.append(j)
33
- elif "'s" in i.strip():
32
+ result.append(j.strip())
33
+ elif "'s" in i.strip().lower():
34
34
  result.append(i.split("'")[0])
35
35
  result.append("is")
36
- elif "'ve" in i.strip():
36
+ elif "'ve" in i.strip().lower():
37
37
  result.append(i.split("'")[0])
38
38
  result.append("have")
39
- elif "'s" in i.strip():
39
+ elif "'s" in i.strip().lower():
40
40
  result.append(i.split("'")[0])
41
41
  result.append("is")
42
- elif "'d" in i.strip():
42
+ elif "'d" in i.strip().lower():
43
43
  result.append(i.split("'")[0])
44
44
  result.append("had")
45
- elif "'m" in i.strip():
45
+ elif "'m" in i.strip().lower():
46
46
  result.append(i.split("'")[0])
47
47
  result.append("am")
48
- elif i.strip() in fillers:
48
+ elif i.strip().lower() in fillers:
49
49
  result.append("um")
50
- elif "-" in i.strip():
51
- result += [k.strip() for k in i.split("-")]
52
- elif "ok" == i.strip():
50
+ elif "-" in i.strip().lower():
51
+ result += [k.strip() for k in i.split("-").lower()]
52
+ elif "ok" == i.strip().lower():
53
53
  result.append("okay")
54
- elif "gimme" == i.strip():
54
+ elif "gimme" == i.strip().lower():
55
55
  result.append("give")
56
56
  result.append("me")
57
- elif "hafta" == i.strip() or "havta" == i.strip():
57
+ elif "hafta" == i.strip().lower() or "havta" == i.strip().lower():
58
58
  result.append("have")
59
59
  result.append("to")
60
- elif i.strip() in names:
60
+ elif i.strip().lower() in names:
61
61
  result.append("name")
62
- elif "dunno" == i.strip():
62
+ elif "dunno" == i.strip().lower():
63
63
  result.append("don't")
64
64
  result.append("know")
65
- elif "wanna" == i.strip():
65
+ elif "wanna" == i.strip().lower():
66
66
  result.append("want")
67
67
  result.append("to")
68
- elif "mba" == i.strip():
68
+ elif "mba" == i.strip().lower():
69
69
  result.append("m")
70
70
  result.append("b")
71
71
  result.append("a")
72
- elif "tli" == i.strip():
72
+ elif "tli" == i.strip().lower():
73
73
  result.append("t")
74
74
  result.append("l")
75
75
  result.append("i")
76
- elif "bbc" == i.strip():
76
+ elif "bbc" == i.strip().lower():
77
77
  result.append("b")
78
78
  result.append("b")
79
79
  result.append("c")
80
- elif "ii" == i.strip():
80
+ elif "ii" == i.strip().lower():
81
81
  result.append("i")
82
82
  result.append("i")
83
- elif "i'd" == i.strip():
83
+ elif "i'd" == i.strip().lower():
84
84
  result.append("i")
85
85
  result.append("had")
86
- elif "alright" == i.strip():
86
+ elif "alright" == i.strip().lower():
87
87
  result.append("all")
88
88
  result.append("right")
89
- elif "sorta" == i.strip():
89
+ elif "sorta" == i.strip().lower():
90
90
  result.append("sort")
91
91
  result.append("of")
92
- elif "alrightie" == i.strip():
92
+ elif "alrightie" == i.strip().lower():
93
93
  result.append("all")
94
94
  result.append("right")
95
- elif "mm" == i.strip():
95
+ elif "mm" == i.strip().lower():
96
96
  result.append("hm")
97
- elif "ai" == i.strip():
97
+ elif "ai" == i.strip().lower():
98
98
  result.append("a")
99
99
  result.append("i")
100
- elif "this'll" == i.strip():
100
+ elif "this'll" == i.strip().lower():
101
101
  result.append("this")
102
102
  result.append("will")
103
- elif "gotta" == i.strip():
103
+ elif "gotta" == i.strip().lower():
104
104
  result.append("got")
105
105
  result.append("to")
106
- elif "hadta" == i.strip():
106
+ elif "hadta" == i.strip().lower():
107
107
  result.append("had")
108
108
  result.append("to")
109
- elif "eh" == i.strip():
109
+ elif "eh" == i.strip().lower():
110
110
  result.append("uh")
111
- elif "kinda" == i.strip():
111
+ elif "kinda" == i.strip().lower():
112
112
  result.append("kind")
113
113
  result.append("of")
114
- elif "ed" == i.strip():
114
+ elif "ed" == i.strip().lower():
115
115
  result.append("education")
116
- elif "til" == i.strip():
116
+ elif "til" == i.strip().lower():
117
117
  result.append("until")
118
- elif "gonna" == i.strip():
118
+ elif "gonna" == i.strip().lower():
119
119
  result.append("going")
120
120
  result.append("to")
121
- elif "shoulda" == i.strip():
121
+ elif "shoulda" == i.strip().lower():
122
122
  result.append("should")
123
123
  result.append("have")
124
- elif "sposta" == i.strip():
124
+ elif "sposta" == i.strip().lower():
125
125
  result.append("supposed")
126
126
  result.append("to")
127
- elif "farmhouse" == i.strip():
127
+ elif "farmhouse" == i.strip().lower():
128
128
  result.append("farm")
129
129
  result.append("house")
130
- elif "aa" == i.strip():
130
+ elif "aa" == i.strip().lower():
131
131
  result.append("a")
132
132
  result.append("a")
133
- elif "aa" == i.strip():
133
+ elif "aa" == i.strip().lower():
134
134
  result.append("a")
135
135
  result.append("a")
136
- elif "em" == i.strip():
136
+ elif "em" == i.strip().lower():
137
137
  result.append("them")
138
- elif "hmm" == i.strip():
138
+ elif "hmm" == i.strip().lower():
139
139
  result.append("hm")
140
- elif "_" in i.strip():
140
+ elif "_" in i.strip().lower():
141
141
  for j in i.strip().split("_"):
142
142
  result.append(j)
143
143
  else:
144
- result.append(i)
144
+ result.append(i.lower())
145
145
 
146
146
  return result
147
147
 
148
148
  def match_fn(x,y):
149
+ x = x.lower()
150
+ y = y.lower()
149
151
  return (y == x or
150
152
  y.replace("(", "").replace(")", "") == x.replace("(", "").replace(")", "") or
151
153
  re.sub(r"\((.*)\)",r"", y) == x or re.sub(r"\((.*)\)",r"", x) == y)
@@ -156,8 +158,8 @@ class EvaluationEngine(BatchalignEngine):
156
158
  @staticmethod
157
159
  def __compute_wer(doc, gold):
158
160
  # get the text of the document and get the text of the gold
159
- forms = [ j.text.lower() for i in doc.content for j in i.content if isinstance(i, Utterance)]
160
- gold_forms = [ j.text.lower() for i in gold.content for j in i.content if isinstance(i, Utterance)]
161
+ forms = [ j.text for i in doc.content for j in i.content if isinstance(i, Utterance)]
162
+ gold_forms = [ j.text for i in gold.content for j in i.content if isinstance(i, Utterance)]
161
163
 
162
164
  forms = [i.replace("-", "") for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
163
165
  gold_forms = [i.replace("-", "") for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
@@ -86,6 +86,7 @@ class RevEngine(BatchalignEngine):
86
86
  while status == JobStatus.IN_PROGRESS:
87
87
  time.sleep(15)
88
88
  status = client.get_job_details(job.id).status
89
+ L.debug(f"Rev.AI got '{status}'...")
89
90
 
90
91
  # if we failed, report failure and give up
91
92
  if status == JobStatus.FAILED:
@@ -60,10 +60,11 @@ def retokenize(intermediate_output):
60
60
  word = word.replace("。", ".")
61
61
  word = word.replace("¿", " ").replace("¡", " ")
62
62
  tmp.append((word, bullet))
63
- if len(word) > 0 and (word in ENDING_PUNCT or word[-1] in ENDING_PUNCT):
64
- if word in ENDING_PUNCT:
63
+ if len(word) > 0 and (word in ENDING_PUNCT+["؟", "۔", "،", "؛"]
64
+ or word[-1] in ENDING_PUNCT+["؟", "۔", "،", "؛"]):
65
+ if word in ENDING_PUNCT+["؟", "۔", "،", "؛"]:
65
66
  final_outputs.append((speaker, tmp))
66
- elif word[-1] in ENDING_PUNCT:
67
+ elif word[-1] in ENDING_PUNCT+["؟", "۔", "،", "؛"]:
67
68
  # we want to separate the ending punct out
68
69
  final, time = tmp.pop(-1)
69
70
  tmp.append((final[:-1], time))
@@ -102,7 +103,7 @@ def retokenize_with_engine(intermediate_output, engine):
102
103
  # because we are using an utterance engine, we need
103
104
  # to get rid of all the preexisting punctuation
104
105
  for i in utterance:
105
- for j in MOR_PUNCT+ENDING_PUNCT:
106
+ for j in MOR_PUNCT+ENDING_PUNCT+["؟", "۔", "،", "؛"]:
106
107
  i[0] = i[0].strip(j).lower()
107
108
 
108
109
  # remove everything that's now blank
@@ -118,7 +119,7 @@ def retokenize_with_engine(intermediate_output, engine):
118
119
  # align the utterance against original splits and generate final outputs
119
120
  for i in split:
120
121
  # Check if the split has ending punctuation
121
- if i[-1] in ENDING_PUNCT:
122
+ if i[-1] in ENDING_PUNCT+["؟", "۔", "،", "؛"]:
122
123
  new_ut, delim = (i[:-1].split(" "), i[-1])
123
124
  else:
124
125
  new_ut, delim = (i.split(" "), ".")
@@ -264,16 +265,8 @@ def process_generation(output, lang="eng", utterance_engine=None):
264
265
  seen_word = False
265
266
  if word.strip() == "":
266
267
  continue
267
- if word not in ENDING_PUNCT+MOR_PUNCT:
268
+ if word not in ENDING_PUNCT+MOR_PUNCT+["؟", "۔", "،", "؛"]:
268
269
  word_replaced = word
269
- if word_replaced.strip() == "؟":
270
- word_replaced = "?"
271
- elif word_replaced.strip() == "۔":
272
- word_replaced = "."
273
- elif word_replaced.strip() == "،":
274
- word_replaced = ","
275
- elif word_replaced.strip() == "؛":
276
- word_replaced = ";"
277
270
 
278
271
  if start == None or end == None:
279
272
  words.append(Form(text=word_replaced, time=None))
@@ -281,7 +274,15 @@ def process_generation(output, lang="eng", utterance_engine=None):
281
274
  seen_word = True
282
275
  words.append(Form(text=word_replaced, time=(int(start), int(end))))
283
276
  else:
284
- words.append(Form(text=word, time=None))
277
+ if word.strip() == "؟":
278
+ word = "?"
279
+ elif word.strip() == "۔":
280
+ word = "."
281
+ elif word.strip() == "،":
282
+ word = ","
283
+ elif word.strip() == "؛":
284
+ word = ";"
285
+ words.append(Form(text=word, time=None))
285
286
 
286
287
  final_utterances.append(Utterance(
287
288
  tier=participant,
@@ -0,0 +1,3 @@
1
+ 0.7.21-post.11
2
+ October 2nd, 2025
3
+ Fix arabic punctuation
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batchalign
3
- Version: 0.7.21.post9
3
+ Version: 0.7.21.post11
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.21-post.9
2
- September 29th, 2025
3
- Arabic Commas