batchalign 0.8.0.post4__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of batchalign might be problematic; see the registry's advisory page for more details.

Files changed (168)
  1. {batchalign-0.8.0.post4/batchalign.egg-info → batchalign-0.8.1}/PKG-INFO +3 -1
  2. batchalign-0.8.1/batchalign/__init__.py +48 -0
  3. batchalign-0.8.1/batchalign/cli/cache.py +263 -0
  4. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/cli.py +5 -0
  5. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/dispatch.py +6 -3
  6. batchalign-0.8.1/batchalign/formats/__init__.py +11 -0
  7. batchalign-0.8.1/batchalign/models/__init__.py +33 -0
  8. batchalign-0.8.1/batchalign/models/speaker/__init__.py +7 -0
  9. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utils.py +31 -0
  10. batchalign-0.8.1/batchalign/models/utterance/__init__.py +13 -0
  11. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/cantonese_infer.py +28 -40
  12. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/infer.py +13 -23
  13. batchalign-0.8.1/batchalign/models/wave2vec/__init__.py +7 -0
  14. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/wave2vec/infer_fa.py +16 -31
  15. batchalign-0.8.1/batchalign/models/whisper/__init__.py +11 -0
  16. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/whisper/infer_asr.py +16 -30
  17. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/whisper/infer_fa.py +21 -17
  18. batchalign-0.8.1/batchalign/pipelines/__init__.py +37 -0
  19. batchalign-0.8.1/batchalign/pipelines/analysis/__init__.py +15 -0
  20. batchalign-0.8.1/batchalign/pipelines/asr/__init__.py +24 -0
  21. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/rev.py +6 -1
  22. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/whisperx.py +9 -17
  23. batchalign-0.8.1/batchalign/pipelines/avqi/__init__.py +15 -0
  24. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/avqi/engine.py +6 -5
  25. batchalign-0.8.1/batchalign/pipelines/cache.py +735 -0
  26. batchalign-0.8.1/batchalign/pipelines/cleanup/__init__.py +18 -0
  27. batchalign-0.8.1/batchalign/pipelines/diarization/__init__.py +15 -0
  28. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/diarization/pyannote.py +5 -17
  29. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/dispatch.py +19 -6
  30. batchalign-0.8.1/batchalign/pipelines/fa/__init__.py +18 -0
  31. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/fa/wave2vec_fa.py +49 -10
  32. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/fa/whisper_fa.py +52 -10
  33. batchalign-0.8.1/batchalign/pipelines/morphosyntax/__init__.py +18 -0
  34. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/coref.py +1 -1
  35. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/ud.py +147 -21
  36. batchalign-0.8.1/batchalign/pipelines/opensmile/__init__.py +15 -0
  37. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/opensmile/engine.py +22 -12
  38. batchalign-0.8.1/batchalign/pipelines/speaker/__init__.py +15 -0
  39. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/speaker/nemo_speaker.py +4 -2
  40. batchalign-0.8.1/batchalign/pipelines/translate/__init__.py +18 -0
  41. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/gtrans.py +2 -1
  42. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/seamless.py +2 -1
  43. batchalign-0.8.1/batchalign/pipelines/utr/__init__.py +18 -0
  44. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/rev_utr.py +8 -2
  45. batchalign-0.8.1/batchalign/pipelines/utterance/__init__.py +15 -0
  46. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utterance/ud_utterance.py +95 -41
  47. batchalign-0.8.1/batchalign/tests/pipelines/cache/__init__.py +1 -0
  48. batchalign-0.8.1/batchalign/tests/pipelines/cache/test_cache.py +407 -0
  49. batchalign-0.8.1/batchalign/version +3 -0
  50. {batchalign-0.8.0.post4 → batchalign-0.8.1/batchalign.egg-info}/PKG-INFO +3 -1
  51. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/SOURCES.txt +4 -0
  52. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/requires.txt +2 -0
  53. {batchalign-0.8.0.post4 → batchalign-0.8.1}/setup.py +2 -0
  54. batchalign-0.8.0.post4/batchalign/__init__.py +0 -19
  55. batchalign-0.8.0.post4/batchalign/formats/__init__.py +0 -2
  56. batchalign-0.8.0.post4/batchalign/models/__init__.py +0 -6
  57. batchalign-0.8.0.post4/batchalign/models/speaker/__init__.py +0 -1
  58. batchalign-0.8.0.post4/batchalign/models/utterance/__init__.py +0 -4
  59. batchalign-0.8.0.post4/batchalign/models/wave2vec/__init__.py +0 -1
  60. batchalign-0.8.0.post4/batchalign/models/whisper/__init__.py +0 -2
  61. batchalign-0.8.0.post4/batchalign/pipelines/__init__.py +0 -19
  62. batchalign-0.8.0.post4/batchalign/pipelines/analysis/__init__.py +0 -1
  63. batchalign-0.8.0.post4/batchalign/pipelines/asr/__init__.py +0 -4
  64. batchalign-0.8.0.post4/batchalign/pipelines/avqi/__init__.py +0 -8
  65. batchalign-0.8.0.post4/batchalign/pipelines/cleanup/__init__.py +0 -3
  66. batchalign-0.8.0.post4/batchalign/pipelines/diarization/__init__.py +0 -1
  67. batchalign-0.8.0.post4/batchalign/pipelines/fa/__init__.py +0 -2
  68. batchalign-0.8.0.post4/batchalign/pipelines/morphosyntax/__init__.py +0 -3
  69. batchalign-0.8.0.post4/batchalign/pipelines/opensmile/__init__.py +0 -7
  70. batchalign-0.8.0.post4/batchalign/pipelines/speaker/__init__.py +0 -1
  71. batchalign-0.8.0.post4/batchalign/pipelines/translate/__init__.py +0 -2
  72. batchalign-0.8.0.post4/batchalign/pipelines/utr/__init__.py +0 -2
  73. batchalign-0.8.0.post4/batchalign/pipelines/utterance/__init__.py +0 -1
  74. batchalign-0.8.0.post4/batchalign/version +0 -3
  75. {batchalign-0.8.0.post4 → batchalign-0.8.1}/LICENSE +0 -0
  76. {batchalign-0.8.0.post4 → batchalign-0.8.1}/MANIFEST.in +0 -0
  77. {batchalign-0.8.0.post4 → batchalign-0.8.1}/README.md +0 -0
  78. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/__main__.py +0 -0
  79. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/__init__.py +0 -0
  80. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/constants.py +0 -0
  81. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/document.py +0 -0
  82. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/errors.py +0 -0
  83. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/base.py +0 -0
  84. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/__init__.py +0 -0
  85. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/file.py +0 -0
  86. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/generator.py +0 -0
  87. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/lexer.py +0 -0
  88. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/parser.py +0 -0
  89. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/utils.py +0 -0
  90. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/__init__.py +0 -0
  91. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/file.py +0 -0
  92. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/generator.py +0 -0
  93. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/parser.py +0 -0
  94. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/resolve.py +0 -0
  95. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/config.yaml +0 -0
  96. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/infer.py +0 -0
  97. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/utils.py +0 -0
  98. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/__init__.py +0 -0
  99. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/run.py +0 -0
  100. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/utils.py +0 -0
  101. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/dataset.py +0 -0
  102. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/execute.py +0 -0
  103. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/prep.py +0 -0
  104. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/train.py +0 -0
  105. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/analysis/eval.py +0 -0
  106. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2chinese.py +0 -0
  107. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
  108. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
  109. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
  110. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
  111. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
  112. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
  113. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
  114. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
  115. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
  116. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
  117. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/por.py +0 -0
  118. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
  119. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
  120. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  121. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/utils.py +0 -0
  122. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/whisper.py +0 -0
  123. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/base.py +0 -0
  124. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  125. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  126. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  127. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/retrace.py +0 -0
  128. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  129. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  130. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/test.test +0 -0
  131. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  132. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  133. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  134. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  135. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  136. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/pipeline.py +0 -0
  137. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/utils.py +0 -0
  138. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/utils.py +0 -0
  139. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  140. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/__init__.py +0 -0
  141. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/conftest.py +0 -0
  142. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  143. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  144. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  145. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  146. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  147. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  148. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  149. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  150. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  151. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  152. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  153. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  154. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/fixures.py +0 -0
  155. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  156. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  157. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/test_document.py +0 -0
  158. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/__init__.py +0 -0
  159. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/abbrev.py +0 -0
  160. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/compounds.py +0 -0
  161. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/config.py +0 -0
  162. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/dp.py +0 -0
  163. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/names.py +0 -0
  164. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/utils.py +0 -0
  165. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/dependency_links.txt +0 -0
  166. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/entry_points.txt +0 -0
  167. {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/top_level.txt +0 -0
  168. {batchalign-0.8.0.post4 → batchalign-0.8.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.8.0.post4
3
+ Version: 0.8.1
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -9,6 +9,8 @@ Classifier: Topic :: Utilities
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
11
  Requires-Dist: pydantic>=2.4
12
+ Requires-Dist: platformdirs>=4.3.0
13
+ Requires-Dist: filelock>=3.0.0
12
14
  Requires-Dist: nltk>=3.8
13
15
  Requires-Dist: praatio<6.1.0,>=6.0.0
14
16
  Requires-Dist: torch>=2.6.0
@@ -0,0 +1,48 @@
1
+ import os
2
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = str(1)
3
+
4
+ import logging
5
+
6
+ # clear all of nemo's loggers
7
+ logging.getLogger().handlers.clear()
8
+ logging.getLogger('nemo_logger').handlers.clear()
9
+ logging.getLogger().setLevel(logging.CRITICAL)
10
+ logging.getLogger('nemo_logger').disabled = True
11
+
12
+ from .document import *
13
+ from .constants import *
14
+ from .errors import *
15
+
16
+ # Defer slow imports
17
+ # from .formats import *
18
+ # from .pipelines import *
19
+ # from .models import *
20
+ # from .cli import batchalign as cli
21
+
22
+ def __getattr__(name):
23
+ if name == 'cli':
24
+ from .cli import batchalign
25
+ return batchalign
26
+ if name == 'BatchalignPipeline':
27
+ from .pipelines import BatchalignPipeline
28
+ return BatchalignPipeline
29
+ if name == 'CHATFile':
30
+ from .formats.chat import CHATFile
31
+ return CHATFile
32
+ # Add other common engines if needed for dispatch.py
33
+ if name in ['WhisperEngine', 'WhisperFAEngine', 'StanzaEngine', 'RevEngine',
34
+ 'NgramRetraceEngine', 'DisfluencyReplacementEngine', 'WhisperUTREngine',
35
+ 'RevUTREngine', 'EvaluationEngine', 'WhisperXEngine', 'NemoSpeakerEngine',
36
+ 'StanzaUtteranceEngine', 'CorefEngine', 'Wave2VecFAEngine', 'SeamlessTranslationModel',
37
+ 'GoogleTranslateEngine', 'OAIWhisperEngine', 'PyannoteEngine']:
38
+ from .pipelines import dispatch
39
+ # This is a bit recursive, let's just let dispatch import them locally
40
+ # which it already does now.
41
+ import importlib
42
+ # We need to find which subpackage it's in.
43
+ # Actually, if we use local imports in dispatch.py, we don't need these here.
44
+ pass
45
+
46
+ raise AttributeError(f"module {__name__} has no attribute {name}")
47
+
48
+ logging.getLogger('nemo_logger').disabled = False
@@ -0,0 +1,263 @@
1
+ """
2
+ cache.py
3
+ CLI subcommand for managing the Batchalign cache.
4
+
5
+ Provides commands to:
6
+ - Show cache statistics (--stats)
7
+ - Clear all cached data (--clear)
8
+ - Prepopulate cache from existing CHAT files (--warm)
9
+ """
10
+
11
+ import os
12
+ from pathlib import Path
13
+
14
+ import rich_click as click
15
+ from rich.console import Console
16
+
17
+ C = Console()
18
+
19
+
20
+ def _format_bytes(count: int | None, precision: int = 2) -> str:
21
+ """Format byte count as human-readable string."""
22
+ if count is None:
23
+ return "unknown"
24
+ units = ["B", "KB", "MB", "GB", "TB"]
25
+ idx = 0
26
+ size = float(count)
27
+ while size >= 1024 and idx < len(units) - 1:
28
+ size /= 1024
29
+ idx += 1
30
+ if idx == 0:
31
+ return f"{int(size)} {units[idx]}"
32
+ return f"{size:.{precision}f} {units[idx]}"
33
+
34
+
35
+ @click.group(invoke_without_command=True)
36
+ @click.option("--stats", is_flag=True, help="Show cache statistics.")
37
+ @click.option(
38
+ "--clear",
39
+ is_flag=True,
40
+ help="Clear all cached data (requires confirmation)."
41
+ )
42
+ @click.pass_context
43
+ def cache(ctx, stats, clear):
44
+ """Manage the Batchalign cache.
45
+
46
+ The cache stores per-utterance analysis results to avoid redundant
47
+ computation when re-processing unchanged content.
48
+
49
+ Examples:
50
+ batchalign cache --stats
51
+ batchalign cache --clear
52
+ batchalign cache warm INPUT_DIR --lang eng
53
+ """
54
+ # Handle --stats flag
55
+ if stats:
56
+ ctx.invoke(show_stats)
57
+ return
58
+
59
+ # Handle --clear flag
60
+ if clear:
61
+ ctx.invoke(clear_cache)
62
+ return
63
+
64
+ # If no flags and no subcommand, show help
65
+ if ctx.invoked_subcommand is None:
66
+ click.echo(ctx.get_help())
67
+
68
+
69
+ @cache.command("stats")
70
+ def show_stats():
71
+ """Show cache statistics."""
72
+ from batchalign.pipelines.cache import CacheManager
73
+
74
+ manager = CacheManager()
75
+ stats = manager.stats()
76
+
77
+ C.print()
78
+ C.print("[bold]Batchalign Cache Statistics[/bold]")
79
+ C.print("-" * 35)
80
+ C.print(f"[cyan]Location:[/cyan] {stats['location']}")
81
+ C.print(f"[cyan]Size:[/cyan] {_format_bytes(stats['size_bytes'])}")
82
+ C.print(f"[cyan]Entries:[/cyan] {stats['total_entries']:,}")
83
+ C.print()
84
+
85
+ # Show breakdown by task
86
+ if stats["by_task"]:
87
+ C.print("[bold]By task:[/bold]")
88
+ for task, count in sorted(stats["by_task"].items()):
89
+ C.print(f" {task}: {count:,} entries")
90
+ C.print()
91
+
92
+ # Show breakdown by engine version
93
+ if stats["by_engine_version"]:
94
+ # Get current stanza version to mark outdated entries
95
+ try:
96
+ import stanza
97
+ current_stanza = stanza.__version__
98
+ except ImportError:
99
+ current_stanza = None
100
+
101
+ C.print("[bold]Engine versions:[/bold]")
102
+ for key, count in sorted(stats["by_engine_version"].items()):
103
+ # Check if this version is outdated
104
+ outdated = ""
105
+ if current_stanza and "morphosyntax" in key:
106
+ version_part = key.split()[-1] if " " in key else ""
107
+ if version_part and version_part != current_stanza:
108
+ outdated = " [dim](outdated)[/dim]"
109
+ C.print(f" {key}: {count:,} entries{outdated}")
110
+ C.print()
111
+
112
+
113
+ @cache.command("clear")
114
+ @click.confirmation_option(
115
+ prompt="Are you sure you want to clear all cached data?"
116
+ )
117
+ def clear_cache():
118
+ """Clear all cached data."""
119
+ from batchalign.pipelines.cache import CacheManager
120
+
121
+ manager = CacheManager()
122
+ stats = manager.stats()
123
+ entries_before = stats["total_entries"]
124
+
125
+ bytes_freed = manager.clear()
126
+
127
+ C.print()
128
+ C.print(f"[bold green]Cache cleared.[/bold green]")
129
+ C.print(f" Entries removed: {entries_before:,}")
130
+ C.print(f" Space freed: {_format_bytes(bytes_freed)}")
131
+ C.print()
132
+
133
+
134
+ @cache.command("warm")
135
+ @click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
136
+ @click.option(
137
+ "--lang",
138
+ default="eng",
139
+ help="Language code (3-letter ISO). Default: eng"
140
+ )
141
+ @click.option(
142
+ "--retokenize/--keeptokens",
143
+ default=False,
144
+ help="Whether files were processed with retokenization."
145
+ )
146
+ def warm_cache(input_dir, lang, retokenize):
147
+ """Prepopulate cache from existing CHAT files with %mor/%gra tiers.
148
+
149
+ Reads CHAT files that already have morphosyntactic analysis (%mor and %gra
150
+ tiers) and populates the cache with their content. This allows subsequent
151
+ processing of identical utterances to use cached results.
152
+
153
+ IMPORTANT: The command trusts the input files. It does not validate that
154
+ the %mor/%gra content is correct.
155
+ """
156
+ from batchalign.pipelines.cache import (
157
+ CacheManager, MorphotagCacheKey, _get_batchalign_version
158
+ )
159
+ from batchalign.formats.chat import CHATFile
160
+ from batchalign.document import Utterance
161
+
162
+ # Get engine version
163
+ try:
164
+ import stanza
165
+ engine_version = stanza.__version__
166
+ except ImportError:
167
+ C.print("[bold red]Error:[/bold red] stanza is not installed. Cannot warm cache.")
168
+ return
169
+
170
+ manager = CacheManager()
171
+ key_gen = MorphotagCacheKey()
172
+ ba_version = _get_batchalign_version()
173
+
174
+ # Collect all .cha files
175
+ cha_files = []
176
+ for root, dirs, files in os.walk(input_dir):
177
+ for f in files:
178
+ if f.lower().endswith(".cha"):
179
+ cha_files.append(os.path.join(root, f))
180
+
181
+ if not cha_files:
182
+ C.print(f"[bold yellow]No .cha files found in {input_dir}[/bold yellow]")
183
+ return
184
+
185
+ C.print(f"\nWarming cache from {len(cha_files)} CHAT file(s)...")
186
+ C.print(f" Language: {lang}")
187
+ C.print(f" Retokenize: {retokenize}")
188
+ C.print(f" Stanza version: {engine_version}")
189
+ C.print()
190
+
191
+ entries_added = 0
192
+ entries_skipped = 0
193
+ files_processed = 0
194
+
195
+ for cha_path in cha_files:
196
+ try:
197
+ cf = CHATFile(path=cha_path, special_mor_=True)
198
+ doc = cf.doc
199
+
200
+ # Map for batching within a file
201
+ utterances_to_check = []
202
+ idx_to_key = {}
203
+
204
+ for idx, item in enumerate(doc.content):
205
+ if not isinstance(item, Utterance):
206
+ continue
207
+
208
+ # Check if utterance has morphology/dependency
209
+ has_morphology = any(
210
+ form.morphology and len(form.morphology) > 0
211
+ for form in item.content
212
+ )
213
+ has_dependency = any(
214
+ form.dependency and len(form.dependency) > 0
215
+ for form in item.content
216
+ )
217
+
218
+ if not (has_morphology or has_dependency):
219
+ continue
220
+
221
+ # Generate cache key
222
+ key = key_gen.generate_key(
223
+ item,
224
+ lang=lang,
225
+ retokenize=retokenize,
226
+ mwt={}
227
+ )
228
+ utterances_to_check.append((idx, key))
229
+ idx_to_key[idx] = key
230
+
231
+ if not utterances_to_check:
232
+ files_processed += 1
233
+ continue
234
+
235
+ # Batch check
236
+ keys = [k for _, k in utterances_to_check]
237
+ cached_results = manager.get_batch(keys, "morphosyntax", engine_version)
238
+
239
+ entries_skipped += len(cached_results)
240
+
241
+ # Filter out already cached ones and prepare for batch put
242
+ to_put = []
243
+ for idx, key in utterances_to_check:
244
+ if key not in cached_results:
245
+ item = doc.content[idx]
246
+ data = key_gen.serialize_output(item)
247
+ to_put.append((key, data))
248
+
249
+ if to_put:
250
+ manager.put_batch(to_put, "morphosyntax", engine_version, ba_version)
251
+ entries_added += len(to_put)
252
+
253
+ files_processed += 1
254
+
255
+ except Exception as e:
256
+ C.print(f"[yellow]Warning:[/yellow] Could not process {cha_path}: {e}")
257
+ continue
258
+
259
+ C.print(f"[bold green]Cache warming complete.[/bold green]")
260
+ C.print(f" Files processed: {files_processed}")
261
+ C.print(f" Entries added: {entries_added}")
262
+ C.print(f" Entries skipped (already cached): {entries_skipped}")
263
+ C.print()
@@ -92,6 +92,9 @@ def batchalign(ctx, verbose, workers):
92
92
 
93
93
  batchalign.add_command(train, "models")
94
94
 
95
+ from batchalign.cli.cache import cache
96
+ batchalign.add_command(cache, "cache")
97
+
95
98
  #################### ALIGN ################################
96
99
 
97
100
  @batchalign.command()
@@ -230,6 +233,8 @@ def translate(ctx, in_dir, out_dir, **kwargs):
230
233
  type=click.Path(exists=True,
231
234
  file_okay=True, dir_okay=False),
232
235
  help="Comma seperated manual lexicon override")
236
+ @click.option("--override-cache/--use-cache",
237
+ default=False, help="Bypass cache and recompute all utterances.")
233
238
  @click.pass_context
234
239
  def morphotag(ctx, in_dir, out_dir, **kwargs):
235
240
  """Perform morphosyntactic analysis on transcripts."""
@@ -94,8 +94,9 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
94
94
  else:
95
95
  baL.setLevel(logging.DEBUG)
96
96
 
97
- # Always capture output to avoid interleaving with progress rendering.
98
- should_capture = True
97
+ # Always capture output to avoid interleaving with progress rendering,
98
+ # unless high verbosity is requested for debugging.
99
+ should_capture = verbose < 2
99
100
 
100
101
  if should_capture:
101
102
  # Use a temporary file to capture ALL output at the FD level
@@ -128,6 +129,7 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
128
129
  mwt = kwargs.pop("mwt", {})
129
130
  retokenize = kwargs.pop("retokenize", False)
130
131
  skipmultilang = kwargs.pop("skipmultilang", False)
132
+ override_cache = kwargs.pop("override_cache", False)
131
133
 
132
134
  cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
133
135
  doc = cf.doc
@@ -138,7 +140,8 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
138
140
  pipeline_kwargs = {
139
141
  "retokenize": retokenize,
140
142
  "skipmultilang": skipmultilang,
141
- "mwt": mwt
143
+ "mwt": mwt,
144
+ "override_cache": override_cache
142
145
  }
143
146
  # Add any remaining kwargs
144
147
  pipeline_kwargs.update(kwargs)
@@ -0,0 +1,11 @@
1
+ # from .chat import CHATFile
2
+ # from .textgrid import TextGridFile
3
+
4
+ def __getattr__(name):
5
+ if name == 'CHATFile':
6
+ from .chat import CHATFile
7
+ return CHATFile
8
+ if name == 'TextGridFile':
9
+ from .textgrid import TextGridFile
10
+ return TextGridFile
11
+ raise AttributeError(f"module {__name__} has no attribute {name}")
@@ -0,0 +1,33 @@
1
+ # from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
2
+ # from .whisper import WhisperASRModel, WhisperFAModel
3
+ # from .speaker import NemoSpeakerModel
4
+ # from .utils import ASRAudioFile
5
+ # from .resolve import resolve
6
+ # from .wave2vec import Wave2VecFAModel
7
+
8
+ def __getattr__(name):
9
+ if name == 'BertUtteranceModel':
10
+ from .utterance import BertUtteranceModel
11
+ return BertUtteranceModel
12
+ if name == 'BertCantoneseUtteranceModel':
13
+ from .utterance import BertCantoneseUtteranceModel
14
+ return BertCantoneseUtteranceModel
15
+ if name == 'WhisperASRModel':
16
+ from .whisper import WhisperASRModel
17
+ return WhisperASRModel
18
+ if name == 'WhisperFAModel':
19
+ from .whisper import WhisperFAModel
20
+ return WhisperFAModel
21
+ if name == 'NemoSpeakerModel':
22
+ from .speaker import NemoSpeakerModel
23
+ return NemoSpeakerModel
24
+ if name == 'ASRAudioFile':
25
+ from .utils import ASRAudioFile
26
+ return ASRAudioFile
27
+ if name == 'resolve':
28
+ from .resolve import resolve
29
+ return resolve
30
+ if name == 'Wave2VecFAModel':
31
+ from .wave2vec import Wave2VecFAModel
32
+ return Wave2VecFAModel
33
+ raise AttributeError(f"module {__name__} has no attribute {name}")
@@ -0,0 +1,7 @@
1
+ # from .infer import NemoSpeakerModel
2
+
3
+ def __getattr__(name):
4
+ if name == 'NemoSpeakerModel':
5
+ from .infer import NemoSpeakerModel
6
+ return NemoSpeakerModel
7
+ raise AttributeError(f"module {__name__} has no attribute {name}")
@@ -187,6 +187,37 @@ class ASRAudioFile:
187
187
 
188
188
  return data
189
189
 
190
+ def hash_chunk(self, begin_ms, end_ms):
191
+ """Generate a tiny SHA256 hash of a chunk of audio for caching."""
192
+ import hashlib
193
+ data = self.chunk(begin_ms, end_ms)
194
+ num_samples = data.numel()
195
+
196
+ # Tiny fingerprint: 100 samples from the middle + total length
197
+ if num_samples > 100:
198
+ mid = num_samples // 2
199
+ samples = data[mid-50:mid+50]
200
+ else:
201
+ samples = data
202
+
203
+ # Include length to catch simple duration changes
204
+ header = f"{num_samples}|".encode()
205
+ return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
206
+
207
+ def hash_all(self):
208
+ """Generate a tiny SHA256 hash of the entire audio file."""
209
+ import hashlib
210
+ num_samples = self.tensor.numel()
211
+
212
+ if num_samples > 100:
213
+ mid = num_samples // 2
214
+ samples = self.tensor[mid-50:mid+50]
215
+ else:
216
+ samples = self.tensor
217
+
218
+ header = f"{num_samples}|".encode()
219
+ return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
220
+
190
221
  def all(self):
191
222
  """Get the audio in its entirety
192
223
 
@@ -0,0 +1,13 @@
1
+ # from .infer import BertUtteranceModel
2
+ # from .cantonese_infer import BertCantoneseUtteranceModel
3
+
4
+ def __getattr__(name):
5
+ if name == 'BertUtteranceModel':
6
+ from .infer import BertUtteranceModel
7
+ return BertUtteranceModel
8
+ if name == 'BertCantoneseUtteranceModel':
9
+ from .cantonese_infer import BertCantoneseUtteranceModel
10
+ return BertCantoneseUtteranceModel
11
+ raise AttributeError(f"module {__name__} has no attribute {name}")
12
+
13
+
@@ -1,43 +1,35 @@
1
1
  import re
2
2
  import string
3
3
  import random
4
+ import logging
4
5
 
5
- # tokenization utilities
6
- import nltk
7
- from nltk import word_tokenize, sent_tokenize
6
+ L = logging.getLogger("batchalign")
8
7
 
9
- # torch
10
- import torch
11
- from torch.utils.data import dataset
12
- from torch.utils.data.dataloader import DataLoader
13
- from torch.optim import AdamW
14
-
15
- # import huggingface utils
16
- from transformers import AutoTokenizer, BertForTokenClassification
17
- from transformers import DataCollatorForTokenClassification
18
-
19
- # tqdm
20
- from tqdm import tqdm
21
-
22
- # seed device and tokens
23
- DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
8
+ # heavy imports moved to local scope
24
9
 
25
10
  # seed model
26
11
  class BertCantoneseUtteranceModel(object):
27
12
 
28
13
def __init__(self, model):
    """Load the tokenizer and token-classification model named by *model*.

    torch/transformers are imported locally so the module stays cheap to
    import when the model is never instantiated.
    """
    import torch
    from transformers import AutoTokenizer, BertForTokenClassification

    # Prefer CUDA when available; otherwise fall back to CPU.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # seed tokenizers and model
    self.tokenizer = AutoTokenizer.from_pretrained(model)
    self.model = BertForTokenClassification.from_pretrained(model).to(device)
    self.device = device

    # sliding-window parameters for processing long passages
    self.max_length = 512
    self.overlap = 20

    # inference only — disable dropout etc.
    self.model.eval()
    L.debug(f"Model and tokenizer initialized on device: {device}")
    L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
39
30
 
40
31
  def __call__(self, passage):
32
+ import torch
41
33
  # Step 1: Clean up passage
42
34
  passage = passage.lower()
43
35
  passage = passage.replace('.','')
@@ -78,15 +70,14 @@ class BertCantoneseUtteranceModel(object):
78
70
  chunks.append(passage[start:])
79
71
  break
80
72
 
81
- # Debugging: Print number of chunks and their content
82
- print(f"Created {len(chunks)} chunks based on keywords.")
73
+ L.debug(f"Created {len(chunks)} chunks based on keywords.")
83
74
  for i, chunk in enumerate(chunks):
84
- print(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
85
-
75
+ L.debug(f"Chunk {i + 1}: {chunk[:100]}...")
76
+
86
77
  # Step 3: Process each chunk and restore punctuation
87
78
  final_passage = []
88
79
  for chunk_index, chunk in enumerate(chunks):
89
- print(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
80
+ L.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
90
81
 
91
82
  # Step 3.1: Split chunk by characters (Chinese tokenization)
92
83
  tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
@@ -97,13 +88,13 @@ class BertCantoneseUtteranceModel(object):
97
88
  truncation=True,
98
89
  padding=True,
99
90
  max_length=self.max_length,
100
- is_split_into_words=True).to(DEVICE)
91
+ is_split_into_words=True).to(self.device)
101
92
 
102
93
  try:
103
94
  # Pass it through the model
104
95
  res = self.model(**tokd).logits
105
96
  except Exception as e:
106
- print(f"Error during model inference: {e}")
97
+ L.error(f"Error during model inference: {e}")
107
98
  return []
108
99
 
109
100
  # Argmax for classification
@@ -152,7 +143,7 @@ class BertCantoneseUtteranceModel(object):
152
143
  # Step 4: Join processed chunks together into the final passage
153
144
  final_passage = ' '.join(final_passage)
154
145
 
155
- print("Text processing completed. Generating final output...")
146
+ L.debug("Text processing completed. Generating final output...")
156
147
 
157
148
  # Optionally, tokenize the final text into sentences based on punctuation
158
149
  def custom_sent_tokenize(text):
@@ -163,32 +154,29 @@ class BertCantoneseUtteranceModel(object):
163
154
  # Split the passage based on punctuation marks and keep them
164
155
  parts = re.split(sentence_endings, text)
165
156
 
166
- # Debug: Output the parts after splitting
167
- print(f"Parts after splitting: {parts}")
157
+ L.debug(f"Parts after splitting: {parts}")
168
158
 
169
159
  # Combine parts and punctuation together
170
160
  for i in range(0, len(parts) - 1, 2):
171
161
  sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
172
- print(f"Sentence formed: {sentence}") # Debug: Output the current sentence
173
-
162
+ L.debug(f"Sentence formed: {sentence}")
163
+
174
164
  if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
175
165
  split_passage.append(sentence)
176
166
 
177
167
  # If the last part doesn't have punctuation, we handle it here
178
168
  if len(parts) % 2 != 0: # If there's no punctuation at the end
179
169
  last_part = parts[-1].strip()
180
- print(f"Last part without punctuation: {last_part}") # Debug: Output the last part
181
-
170
+ L.debug(f"Last part without punctuation: {last_part}")
171
+
182
172
  if last_part: # Only add non-empty sentences
183
173
  split_passage.append(last_part)
184
-
185
- # Final output
186
- print(f"Final split passage: {split_passage}")
174
+
175
+ L.debug(f"Final split passage: {split_passage}")
187
176
  return split_passage
188
177
 
189
178
  split_passage = custom_sent_tokenize(final_passage)
190
179
 
191
- # Debugging: Output the sentences after splitting
192
- print(f"Final sentences: {split_passage}")
180
+ L.debug(f"Final sentences: {split_passage}")
193
181
 
194
182
  return split_passage