batchalign 0.8.2__tar.gz → 0.8.2.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. {batchalign-0.8.2/batchalign.egg-info → batchalign-0.8.2.post2}/PKG-INFO +1 -1
  2. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/analysis/compare.py +179 -63
  3. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/version +1 -1
  4. {batchalign-0.8.2 → batchalign-0.8.2.post2/batchalign.egg-info}/PKG-INFO +1 -1
  5. {batchalign-0.8.2 → batchalign-0.8.2.post2}/LICENSE +0 -0
  6. {batchalign-0.8.2 → batchalign-0.8.2.post2}/MANIFEST.in +0 -0
  7. {batchalign-0.8.2 → batchalign-0.8.2.post2}/README.md +0 -0
  8. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/__init__.py +0 -0
  9. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/__main__.py +0 -0
  10. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/__init__.py +0 -0
  11. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/bench.py +0 -0
  12. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/cache.py +0 -0
  13. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/cli.py +0 -0
  14. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/cli/dispatch.py +0 -0
  15. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/constants.py +0 -0
  16. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/document.py +0 -0
  17. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/errors.py +0 -0
  18. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/__init__.py +0 -0
  19. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/base.py +0 -0
  20. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/__init__.py +0 -0
  21. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/file.py +0 -0
  22. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/generator.py +0 -0
  23. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/lexer.py +0 -0
  24. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/parser.py +0 -0
  25. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/chat/utils.py +0 -0
  26. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/__init__.py +0 -0
  27. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/file.py +0 -0
  28. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/generator.py +0 -0
  29. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/formats/textgrid/parser.py +0 -0
  30. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/__init__.py +0 -0
  31. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/audio_io.py +0 -0
  32. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/resolve.py +0 -0
  33. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/__init__.py +0 -0
  34. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/config.yaml +0 -0
  35. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/infer.py +0 -0
  36. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/speaker/utils.py +0 -0
  37. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/training/__init__.py +0 -0
  38. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/training/run.py +0 -0
  39. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/training/utils.py +0 -0
  40. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utils.py +0 -0
  41. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/__init__.py +0 -0
  42. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/cantonese_infer.py +0 -0
  43. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/dataset.py +0 -0
  44. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/execute.py +0 -0
  45. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/infer.py +0 -0
  46. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/prep.py +0 -0
  47. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/utterance/train.py +0 -0
  48. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/wave2vec/__init__.py +0 -0
  49. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/wave2vec/infer_fa.py +0 -0
  50. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/whisper/__init__.py +0 -0
  51. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/whisper/infer_asr.py +0 -0
  52. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/models/whisper/infer_fa.py +0 -0
  53. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/__init__.py +0 -0
  54. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/analysis/__init__.py +0 -0
  55. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/analysis/eval.py +0 -0
  56. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/__init__.py +0 -0
  57. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2chinese.py +0 -0
  58. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
  59. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
  60. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
  61. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
  62. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
  63. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
  64. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
  65. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
  66. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
  67. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
  68. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/por.py +0 -0
  69. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
  70. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
  71. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  72. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/rev.py +0 -0
  73. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/utils.py +0 -0
  74. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/whisper.py +0 -0
  75. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/asr/whisperx.py +0 -0
  76. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/avqi/__init__.py +0 -0
  77. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/avqi/engine.py +0 -0
  78. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/base.py +0 -0
  79. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cache.py +0 -0
  80. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/__init__.py +0 -0
  81. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  82. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  83. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  84. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/retrace.py +0 -0
  85. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  86. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  87. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/cleanup/support/test.test +0 -0
  88. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/diarization/__init__.py +0 -0
  89. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/diarization/pyannote.py +0 -0
  90. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/dispatch.py +0 -0
  91. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/fa/__init__.py +0 -0
  92. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  93. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  94. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  95. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  96. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  97. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  98. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  99. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  100. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  101. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  102. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/opensmile/__init__.py +0 -0
  103. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/opensmile/engine.py +0 -0
  104. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/pipeline.py +0 -0
  105. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/speaker/__init__.py +0 -0
  106. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  107. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/__init__.py +0 -0
  108. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/gtrans.py +0 -0
  109. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/seamless.py +0 -0
  110. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/translate/utils.py +0 -0
  111. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/__init__.py +0 -0
  112. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/rev_utr.py +0 -0
  113. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/utils.py +0 -0
  114. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  115. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utterance/__init__.py +0 -0
  116. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  117. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/__init__.py +0 -0
  118. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/cli/test_dispatch_memory.py +0 -0
  119. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/conftest.py +0 -0
  120. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  121. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  122. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  123. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  124. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  125. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  126. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/models/test_audio_io.py +0 -0
  127. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/models/test_audio_lazy.py +0 -0
  128. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  129. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  130. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  131. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cache/__init__.py +0 -0
  132. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cache/test_cache.py +0 -0
  133. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  134. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  135. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  136. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fa/test_fa_short_segments.py +0 -0
  137. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/fixures.py +0 -0
  138. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  139. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  140. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/tests/test_document.py +0 -0
  141. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/__init__.py +0 -0
  142. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/abbrev.py +0 -0
  143. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/compounds.py +0 -0
  144. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/config.py +0 -0
  145. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/device.py +0 -0
  146. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/dp.py +0 -0
  147. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/names.py +0 -0
  148. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign/utils/utils.py +0 -0
  149. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/SOURCES.txt +0 -0
  150. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/dependency_links.txt +0 -0
  151. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/entry_points.txt +0 -0
  152. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/requires.txt +0 -0
  153. {batchalign-0.8.2 → batchalign-0.8.2.post2}/batchalign.egg-info/top_level.txt +0 -0
  154. {batchalign-0.8.2 → batchalign-0.8.2.post2}/setup.cfg +0 -0
  155. {batchalign-0.8.2 → batchalign-0.8.2.post2}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.8.2
3
+ Version: 0.8.2.post2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -12,6 +12,7 @@ computes error-rate metrics for CSV output.
12
12
 
13
13
  import re
14
14
  import logging
15
+ from collections import Counter
15
16
  from batchalign.document import *
16
17
  from batchalign.pipelines.base import *
17
18
  from batchalign.utils.dp import align, ExtraType, Extra, Match
@@ -151,6 +152,65 @@ def match_fn(x, y):
151
152
  # --- End of eval.py duplicates ---
152
153
 
153
154
 
155
+ def _find_best_segment(gold_tokens, main_tokens, mfn):
156
+ """Find a rough window using bag-of-words overlap.
157
+
158
+ The rough pass is order-invariant: it scores contiguous windows by token
159
+ multiset overlap with the gold utterance, ignoring order. To keep common
160
+ words from swallowing later transcript material, it only considers windows
161
+ near the gold utterance length. Among equally good windows it prefers the
162
+ latest one, not the earliest. The caller then runs the full Levenshtein
163
+ aligner inside that window to produce token annotations.
164
+ """
165
+ if not gold_tokens or not main_tokens:
166
+ return 0, 0
167
+
168
+ gold_counts = Counter(gold_tokens)
169
+ gold_len = len(gold_tokens)
170
+ main_len = len(main_tokens)
171
+
172
+ min_window = max(1, gold_len - 2)
173
+ max_window = min(main_len, gold_len + 2)
174
+
175
+ best = (0, min(main_len, gold_len))
176
+ best_score = -1.0
177
+ best_len_delta = None
178
+
179
+ for span in range(min_window, max_window + 1):
180
+ window_counts = Counter(main_tokens[:span])
181
+ overlap = sum(min(window_counts[token], gold_counts[token]) for token in window_counts)
182
+
183
+ for start in range(0, main_len - span + 1):
184
+ if start > 0:
185
+ left = main_tokens[start - 1]
186
+ right = main_tokens[start + span - 1]
187
+
188
+ overlap -= min(window_counts[left], gold_counts[left])
189
+ window_counts[left] -= 1
190
+ overlap += min(window_counts[left], gold_counts[left])
191
+
192
+ overlap -= min(window_counts[right], gold_counts[right])
193
+ window_counts[right] += 1
194
+ overlap += min(window_counts[right], gold_counts[right])
195
+
196
+ score = overlap / gold_len
197
+ len_delta = abs(span - gold_len)
198
+ end = start + span
199
+
200
+ if score > best_score:
201
+ best = (start, end)
202
+ best_score = score
203
+ best_len_delta = len_delta
204
+ elif score == best_score:
205
+ if best_len_delta is None or len_delta < best_len_delta:
206
+ best = (start, end)
207
+ best_len_delta = len_delta
208
+ elif len_delta == best_len_delta and end > best[1]:
209
+ best = (start, end)
210
+
211
+ return best
212
+
213
+
154
214
  def _get_pos(form):
155
215
  """Extract uppercased POS from a Form's morphology, or '?' if absent."""
156
216
  if form is not None and form.morphology:
@@ -202,13 +262,10 @@ class CompareEngine(BatchalignEngine):
202
262
  ]
203
263
  main_info = [] # (utt_idx, form_idx, Form)
204
264
  main_words = []
205
- main_punct = {} # utt_idx -> list of (form_idx, Form)
206
265
 
207
266
  for utt_idx, utt in enumerate(main_utterances):
208
- main_punct[utt_idx] = []
209
267
  for form_idx, form in enumerate(utt.content):
210
268
  if form.text.strip() in MOR_PUNCT + ENDING_PUNCT:
211
- main_punct[utt_idx].append((form_idx, form))
212
269
  continue
213
270
  if form.text.strip().lower() in fillers:
214
271
  continue
@@ -221,10 +278,13 @@ class CompareEngine(BatchalignEngine):
221
278
  ]
222
279
  gold_info = [] # (utt_idx, form_idx, Form)
223
280
  gold_words = []
281
+ gold_punct = {} # utt_idx -> list of (form_idx, Form)
224
282
 
225
283
  for utt_idx, utt in enumerate(gold_utterances):
284
+ gold_punct[utt_idx] = []
226
285
  for form_idx, form in enumerate(utt.content):
227
286
  if form.text.strip() in MOR_PUNCT + ENDING_PUNCT:
287
+ gold_punct[utt_idx].append((form_idx, form))
228
288
  continue
229
289
  if form.text.strip().lower() in fillers:
230
290
  continue
@@ -235,102 +295,149 @@ class CompareEngine(BatchalignEngine):
235
295
  conformed_main, main_map = conform_with_mapping(main_words, conform)
236
296
  conformed_gold, gold_map = conform_with_mapping(gold_words, conform)
237
297
 
238
- # --- 4. Align ---
239
- alignment = align(conformed_main, conformed_gold, False, match_fn)
240
-
241
- # --- 5. Redistribute alignment results per main utterance ---
242
- # Store (position, CompareToken) pairs so we can interleave punct
243
- utt_positioned = {i: [] for i in range(len(main_utterances))}
244
- current_main_utt = 0
245
- last_main_form_idx = -1
246
- main_cursor = 0
247
- gold_cursor = 0
248
-
249
- for item in alignment:
250
- if isinstance(item, Match):
251
- orig_main_idx = main_map[main_cursor]
252
- main_utt_idx = main_info[orig_main_idx][0]
253
- main_form_idx = main_info[orig_main_idx][1]
254
- main_form = main_info[orig_main_idx][2]
255
- current_main_utt = main_utt_idx
256
- last_main_form_idx = main_form_idx
257
-
258
- utt_positioned[main_utt_idx].append((main_form_idx, CompareToken(
259
- text=item.key,
260
- pos=_get_pos(main_form),
261
- status="match"
262
- )))
263
- main_cursor += 1
264
- gold_cursor += 1
265
-
266
- elif isinstance(item, Extra):
267
- if item.extra_type == ExtraType.PAYLOAD:
268
- # Word in main but not in gold -> extra_main (+)
269
- orig_main_idx = main_map[main_cursor]
270
- main_utt_idx = main_info[orig_main_idx][0]
271
- main_form_idx = main_info[orig_main_idx][1]
272
- main_form = main_info[orig_main_idx][2]
273
- current_main_utt = main_utt_idx
274
- last_main_form_idx = main_form_idx
298
+ # --- 4. Partition conformed gold tokens by utterance ---
299
+ gold_utt_tokens = {i: [] for i in range(len(gold_utterances))}
300
+ gold_utt_maps = {i: [] for i in range(len(gold_utterances))}
301
+ for j in range(len(conformed_gold)):
302
+ orig_idx = gold_map[j]
303
+ utt_idx = gold_info[orig_idx][0]
304
+ gold_utt_tokens[utt_idx].append(conformed_gold[j])
305
+ gold_utt_maps[utt_idx].append(orig_idx)
306
+
307
+ # --- 5. Per-utterance alignment ---
308
+ # For each gold utterance, find a rough last-possible bag-of-words
309
+ # window in the remaining main tokens, then run Levenshtein inside
310
+ # that window to produce the annotations.
311
+ utt_positioned = {i: [] for i in range(len(gold_utterances))}
312
+ search_start = 0
313
+
314
+ for utt_idx in range(len(gold_utterances)):
315
+ g_tokens = gold_utt_tokens[utt_idx]
316
+ g_maps = gold_utt_maps[utt_idx]
317
+ G = len(g_tokens)
318
+
319
+ if G == 0:
320
+ continue
275
321
 
276
- utt_positioned[main_utt_idx].append((main_form_idx, CompareToken(
277
- text=item.key,
278
- pos=_get_pos(main_form),
279
- status="extra_main"
280
- )))
281
- main_cursor += 1
322
+ remaining_main = conformed_main[search_start:]
323
+ win_start, win_end = _find_best_segment(g_tokens, remaining_main, match_fn)
324
+
325
+ abs_start = search_start + win_start
326
+ abs_end = search_start + win_end
327
+
328
+ # Align the chosen window against this gold utterance
329
+ window_main = conformed_main[abs_start:abs_end]
330
+ utt_alignment = align(window_main, g_tokens, False, match_fn)
282
331
 
283
- else:
284
- # Word in gold but not in main -> extra_gold (-)
285
- orig_gold_idx = gold_map[gold_cursor]
332
+ local_main_cursor = 0
333
+ local_gold_cursor = 0
334
+ last_gold_form_idx = -1
335
+
336
+ for item in utt_alignment:
337
+ if isinstance(item, Match):
338
+ global_main_idx = abs_start + local_main_cursor
339
+ orig_main_idx = main_map[global_main_idx]
340
+ main_form = main_info[orig_main_idx][2]
341
+ orig_gold_idx = g_maps[local_gold_cursor]
342
+ gold_form_idx = gold_info[orig_gold_idx][1]
286
343
  gold_form = gold_info[orig_gold_idx][2]
344
+ last_gold_form_idx = gold_form_idx
287
345
 
288
- # Position just after last main form for correct ordering
289
- pos = last_main_form_idx + 0.5
290
- utt_positioned[current_main_utt].append((pos, CompareToken(
346
+ if main_form.time is not None:
347
+ gold_form.time = main_form.time
348
+ if main_form.morphology is not None:
349
+ gold_form.morphology = main_form.morphology
350
+ if main_form.dependency is not None:
351
+ gold_form.dependency = main_form.dependency
352
+
353
+ utt_positioned[utt_idx].append((gold_form_idx, CompareToken(
291
354
  text=item.key,
292
- pos=_get_pos(gold_form),
293
- status="extra_gold"
355
+ pos=_get_pos(main_form),
356
+ status="match"
294
357
  )))
295
- gold_cursor += 1
296
-
297
- # --- 6. Merge punctuation at original positions ---
298
- for utt_idx in range(len(main_utterances)):
299
- for form_idx, form in main_punct[utt_idx]:
358
+ local_main_cursor += 1
359
+ local_gold_cursor += 1
360
+
361
+ elif isinstance(item, Extra):
362
+ if item.extra_type == ExtraType.REFERENCE:
363
+ orig_gold_idx = g_maps[local_gold_cursor]
364
+ gold_form_idx = gold_info[orig_gold_idx][1]
365
+ gold_form = gold_info[orig_gold_idx][2]
366
+ last_gold_form_idx = gold_form_idx
367
+
368
+ utt_positioned[utt_idx].append((gold_form_idx, CompareToken(
369
+ text=item.key,
370
+ pos=_get_pos(gold_form),
371
+ status="extra_gold"
372
+ )))
373
+ local_gold_cursor += 1
374
+
375
+ else:
376
+ global_main_idx = abs_start + local_main_cursor
377
+ orig_main_idx = main_map[global_main_idx]
378
+ main_form = main_info[orig_main_idx][2]
379
+
380
+ pos = last_gold_form_idx + 0.5
381
+ utt_positioned[utt_idx].append((pos, CompareToken(
382
+ text=item.key,
383
+ pos=_get_pos(main_form),
384
+ status="extra_main"
385
+ )))
386
+ local_main_cursor += 1
387
+
388
+ search_start = abs_end
389
+
390
+ # --- 6. Merge punctuation from gold at original positions ---
391
+ for utt_idx in range(len(gold_utterances)):
392
+ for form_idx, form in gold_punct[utt_idx]:
300
393
  utt_positioned[utt_idx].append((form_idx, CompareToken(
301
394
  text=form.text,
302
395
  pos="PUNCT",
303
396
  status="match"
304
397
  )))
305
- # Stable sort by position preserves order within same form_idx
306
398
  utt_positioned[utt_idx].sort(key=lambda x: x[0])
307
399
 
308
- # --- 7. Set comparison on each utterance ---
309
- for utt_idx, utt in enumerate(main_utterances):
400
+ # --- 7. Set comparison on each gold utterance ---
401
+ for utt_idx, utt in enumerate(gold_utterances):
310
402
  tokens = [tok for _, tok in utt_positioned[utt_idx]]
311
403
  utt.comparison = tokens if tokens else None
312
404
 
313
- return doc
405
+ timed_forms = [form for form in utt.content if form.time is not None]
406
+ if timed_forms:
407
+ utt.time = (timed_forms[0].time[0], timed_forms[-1].time[1])
408
+ utt.text = None
409
+
410
+ return gold
314
411
 
315
412
 
316
413
  class CompareAnalysisEngine(BatchalignEngine):
317
414
  tasks = [Task.COMPARE_ANALYSIS]
318
415
 
319
416
  def analyze(self, doc, **kwargs):
417
+ from collections import defaultdict
418
+
320
419
  matches = 0
321
420
  extra_main = 0
322
421
  extra_gold = 0
323
422
 
423
+ # Per-POS counters: pos -> {matches, insertions, deletions}
424
+ pos_counts = defaultdict(lambda: {"matches": 0, "insertions": 0, "deletions": 0})
425
+
324
426
  for utt in doc.content:
325
427
  if not isinstance(utt, Utterance) or utt.comparison is None:
326
428
  continue
327
429
  for tok in utt.comparison:
430
+ if tok.pos == "PUNCT":
431
+ continue
328
432
  if tok.status == "match":
329
433
  matches += 1
434
+ pos_counts[tok.pos]["matches"] += 1
330
435
  elif tok.status == "extra_main":
331
436
  extra_main += 1
437
+ pos_counts[tok.pos]["insertions"] += 1
332
438
  elif tok.status == "extra_gold":
333
439
  extra_gold += 1
440
+ pos_counts[tok.pos]["deletions"] += 1
334
441
 
335
442
  total_gold = matches + extra_gold
336
443
  total_main = matches + extra_main
@@ -347,6 +454,15 @@ class CompareAnalysisEngine(BatchalignEngine):
347
454
  "total_main_words": total_main,
348
455
  }
349
456
 
457
+ # Add per-POS breakdown
458
+ for pos in sorted(pos_counts.keys()):
459
+ counts = pos_counts[pos]
460
+ total = counts["matches"] + counts["deletions"]
461
+ metrics[f"{pos}:matches"] = counts["matches"]
462
+ metrics[f"{pos}:insertions"] = counts["insertions"]
463
+ metrics[f"{pos}:deletions"] = counts["deletions"]
464
+ metrics[f"{pos}:total"] = total
465
+
350
466
  return {
351
467
  "doc": doc,
352
468
  "metrics": metrics,
@@ -1,3 +1,3 @@
1
- 0.8.2
1
+ 0.8.2-post.2
2
2
  Feb 27th 2026
3
3
  adds new feature regarding compare
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.8.2
3
+ Version: 0.8.2.post2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
File without changes
File without changes
File without changes
File without changes
File without changes