BatchalignHK 0.8.0.post6.tar.gz → 0.8.1.post1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/BatchalignHK.egg-info/PKG-INFO +3 -1
  2. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/BatchalignHK.egg-info/SOURCES.txt +4 -0
  3. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/BatchalignHK.egg-info/requires.txt +2 -0
  4. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/PKG-INFO +3 -1
  5. batchalignhk-0.8.1.post1/batchalign/__init__.py +48 -0
  6. batchalignhk-0.8.1.post1/batchalign/cli/cache.py +263 -0
  7. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/cli/cli.py +5 -0
  8. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/cli/dispatch.py +6 -3
  9. batchalignhk-0.8.1.post1/batchalign/formats/__init__.py +11 -0
  10. batchalignhk-0.8.1.post1/batchalign/models/__init__.py +33 -0
  11. batchalignhk-0.8.1.post1/batchalign/models/speaker/__init__.py +7 -0
  12. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/utils.py +31 -0
  13. batchalignhk-0.8.1.post1/batchalign/models/utterance/__init__.py +13 -0
  14. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/utterance/cantonese_infer.py +17 -31
  15. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/utterance/infer.py +13 -23
  16. batchalignhk-0.8.1.post1/batchalign/models/wave2vec/__init__.py +7 -0
  17. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/wave2vec/infer_fa.py +16 -31
  18. batchalignhk-0.8.1.post1/batchalign/models/whisper/__init__.py +11 -0
  19. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/whisper/infer_asr.py +16 -30
  20. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/whisper/infer_fa.py +21 -17
  21. batchalignhk-0.8.1.post1/batchalign/pipelines/__init__.py +49 -0
  22. batchalignhk-0.8.1.post1/batchalign/pipelines/analysis/__init__.py +15 -0
  23. batchalignhk-0.8.1.post1/batchalign/pipelines/asr/__init__.py +24 -0
  24. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/rev.py +6 -1
  25. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/whisperx.py +9 -17
  26. batchalignhk-0.8.1.post1/batchalign/pipelines/avqi/__init__.py +15 -0
  27. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/avqi/engine.py +6 -5
  28. batchalignhk-0.8.1.post1/batchalign/pipelines/cache.py +735 -0
  29. batchalignhk-0.8.1.post1/batchalign/pipelines/cleanup/__init__.py +18 -0
  30. batchalignhk-0.8.1.post1/batchalign/pipelines/diarization/__init__.py +15 -0
  31. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/diarization/pyannote.py +5 -17
  32. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/dispatch.py +26 -11
  33. batchalignhk-0.8.1.post1/batchalign/pipelines/fa/__init__.py +18 -0
  34. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/fa/wave2vec_fa.py +49 -10
  35. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/fa/whisper_fa.py +52 -10
  36. batchalignhk-0.8.1.post1/batchalign/pipelines/morphosyntax/__init__.py +18 -0
  37. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/morphosyntax/coref.py +1 -1
  38. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/morphosyntax/ud.py +147 -21
  39. batchalignhk-0.8.1.post1/batchalign/pipelines/opensmile/__init__.py +15 -0
  40. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/opensmile/engine.py +22 -12
  41. batchalignhk-0.8.1.post1/batchalign/pipelines/speaker/__init__.py +15 -0
  42. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/speaker/nemo_speaker.py +4 -2
  43. batchalignhk-0.8.1.post1/batchalign/pipelines/translate/__init__.py +18 -0
  44. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/translate/gtrans.py +2 -1
  45. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/translate/seamless.py +2 -1
  46. batchalignhk-0.8.1.post1/batchalign/pipelines/utr/__init__.py +18 -0
  47. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/utr/rev_utr.py +8 -2
  48. batchalignhk-0.8.1.post1/batchalign/pipelines/utterance/__init__.py +15 -0
  49. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/utterance/ud_utterance.py +95 -41
  50. batchalignhk-0.8.1.post1/batchalign/tests/pipelines/cache/__init__.py +1 -0
  51. batchalignhk-0.8.1.post1/batchalign/tests/pipelines/cache/test_cache.py +407 -0
  52. batchalignhk-0.8.1.post1/batchalign/version +3 -0
  53. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/setup.py +2 -0
  54. batchalignhk-0.8.0.post6/batchalign/__init__.py +0 -19
  55. batchalignhk-0.8.0.post6/batchalign/formats/__init__.py +0 -2
  56. batchalignhk-0.8.0.post6/batchalign/models/__init__.py +0 -6
  57. batchalignhk-0.8.0.post6/batchalign/models/speaker/__init__.py +0 -1
  58. batchalignhk-0.8.0.post6/batchalign/models/utterance/__init__.py +0 -4
  59. batchalignhk-0.8.0.post6/batchalign/models/wave2vec/__init__.py +0 -1
  60. batchalignhk-0.8.0.post6/batchalign/models/whisper/__init__.py +0 -2
  61. batchalignhk-0.8.0.post6/batchalign/pipelines/__init__.py +0 -20
  62. batchalignhk-0.8.0.post6/batchalign/pipelines/analysis/__init__.py +0 -1
  63. batchalignhk-0.8.0.post6/batchalign/pipelines/asr/__init__.py +0 -7
  64. batchalignhk-0.8.0.post6/batchalign/pipelines/avqi/__init__.py +0 -8
  65. batchalignhk-0.8.0.post6/batchalign/pipelines/cleanup/__init__.py +0 -3
  66. batchalignhk-0.8.0.post6/batchalign/pipelines/diarization/__init__.py +0 -1
  67. batchalignhk-0.8.0.post6/batchalign/pipelines/fa/__init__.py +0 -4
  68. batchalignhk-0.8.0.post6/batchalign/pipelines/morphosyntax/__init__.py +0 -3
  69. batchalignhk-0.8.0.post6/batchalign/pipelines/opensmile/__init__.py +0 -7
  70. batchalignhk-0.8.0.post6/batchalign/pipelines/speaker/__init__.py +0 -1
  71. batchalignhk-0.8.0.post6/batchalign/pipelines/translate/__init__.py +0 -2
  72. batchalignhk-0.8.0.post6/batchalign/pipelines/utr/__init__.py +0 -4
  73. batchalignhk-0.8.0.post6/batchalign/pipelines/utterance/__init__.py +0 -1
  74. batchalignhk-0.8.0.post6/batchalign/version +0 -3
  75. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  76. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/BatchalignHK.egg-info/entry_points.txt +0 -0
  77. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/BatchalignHK.egg-info/top_level.txt +0 -0
  78. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/LICENSE +0 -0
  79. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/MANIFEST.in +0 -0
  80. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/README.md +0 -0
  81. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/__main__.py +0 -0
  82. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/cli/__init__.py +0 -0
  83. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/constants.py +0 -0
  84. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/document.py +0 -0
  85. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/errors.py +0 -0
  86. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/__init__.py +0 -0
  87. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/core.py +0 -0
  88. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/exception.py +0 -0
  89. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/logging.py +0 -0
  90. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/realtime_meeting.py +0 -0
  91. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/speech_recognizer.py +0 -0
  92. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/speech_synthesizer.py +0 -0
  93. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/speech_transcriber.py +0 -0
  94. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/stream_input_tts.py +0 -0
  95. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/token.py +0 -0
  96. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/util.py +0 -0
  97. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/version.py +0 -0
  98. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/__init__.py +0 -0
  99. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_abnf.py +0 -0
  100. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_app.py +0 -0
  101. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_cookiejar.py +0 -0
  102. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_core.py +0 -0
  103. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_exceptions.py +0 -0
  104. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_handshake.py +0 -0
  105. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_http.py +0 -0
  106. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_logging.py +0 -0
  107. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_socket.py +0 -0
  108. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_ssl_compat.py +0 -0
  109. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_url.py +0 -0
  110. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/_utils.py +0 -0
  111. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/__init__.py +0 -0
  112. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/echo-server.py +0 -0
  113. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/test_abnf.py +0 -0
  114. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/test_app.py +0 -0
  115. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/test_cookiejar.py +0 -0
  116. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/test_http.py +0 -0
  117. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/test_url.py +0 -0
  118. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/extern/nls/websocket/tests/test_websocket.py +0 -0
  119. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/base.py +0 -0
  120. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/chat/__init__.py +0 -0
  121. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/chat/file.py +0 -0
  122. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/chat/generator.py +0 -0
  123. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/chat/lexer.py +0 -0
  124. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/chat/parser.py +0 -0
  125. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/chat/utils.py +0 -0
  126. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/textgrid/__init__.py +0 -0
  127. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/textgrid/file.py +0 -0
  128. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/textgrid/generator.py +0 -0
  129. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/formats/textgrid/parser.py +0 -0
  130. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/resolve.py +0 -0
  131. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/speaker/config.yaml +0 -0
  132. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/speaker/infer.py +0 -0
  133. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/speaker/utils.py +0 -0
  134. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/training/__init__.py +0 -0
  135. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/training/run.py +0 -0
  136. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/training/utils.py +0 -0
  137. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/utterance/dataset.py +0 -0
  138. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/utterance/execute.py +0 -0
  139. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/utterance/prep.py +0 -0
  140. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/models/utterance/train.py +0 -0
  141. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/analysis/eval.py +0 -0
  142. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/aliyun.py +0 -0
  143. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/funaudio.py +0 -0
  144. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2chinese.py +0 -0
  145. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
  146. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
  147. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
  148. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
  149. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
  150. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
  151. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
  152. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
  153. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
  154. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
  155. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/por.py +0 -0
  156. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
  157. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
  158. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  159. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/tencent.py +0 -0
  160. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/utils.py +0 -0
  161. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/asr/whisper.py +0 -0
  162. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/base.py +0 -0
  163. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  164. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  165. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  166. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/cleanup/retrace.py +0 -0
  167. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  168. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  169. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/cleanup/support/test.test +0 -0
  170. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/fa/iic_fa.py +0 -0
  171. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/fa/wave2vec_fa_canto.py +0 -0
  172. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  173. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  174. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  175. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  176. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  177. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/pipeline.py +0 -0
  178. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/translate/utils.py +0 -0
  179. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/utr/funaudio_utr.py +0 -0
  180. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/utr/tencent_utr.py +0 -0
  181. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/utr/utils.py +0 -0
  182. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  183. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/__init__.py +0 -0
  184. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/conftest.py +0 -0
  185. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  186. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  187. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  188. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  189. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  190. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  191. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  192. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  193. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  194. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  195. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  196. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  197. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/fixures.py +0 -0
  198. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  199. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  200. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/tests/test_document.py +0 -0
  201. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/utils/__init__.py +0 -0
  202. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/utils/abbrev.py +0 -0
  203. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/utils/compounds.py +0 -0
  204. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/utils/config.py +0 -0
  205. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/utils/dp.py +0 -0
  206. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/utils/names.py +0 -0
  207. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/batchalign/utils/utils.py +0 -0
  208. {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1.post1}/setup.cfg +0 -0
BatchalignHK.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: BatchalignHK
- Version: 0.8.0.post6
+ Version: 0.8.1.post1
  Summary: Python Speech Language Sample Analysis
  Author: Brian MacWhinney, Houjun Liu
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -9,6 +9,8 @@ Classifier: Topic :: Utilities
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: pydantic>=2.4
+ Requires-Dist: platformdirs>=4.3.0
+ Requires-Dist: filelock>=3.0.0
  Requires-Dist: nltk>=3.8
  Requires-Dist: praatio<6.1.0,>=6.0.0
  Requires-Dist: torch>=2.6.0
BatchalignHK.egg-info/SOURCES.txt

@@ -15,6 +15,7 @@ batchalign/document.py
  batchalign/errors.py
  batchalign/version
  batchalign/cli/__init__.py
+ batchalign/cli/cache.py
  batchalign/cli/cli.py
  batchalign/cli/dispatch.py
  batchalign/extern/nls/__init__.py
@@ -86,6 +87,7 @@ batchalign/models/whisper/infer_asr.py
  batchalign/models/whisper/infer_fa.py
  batchalign/pipelines/__init__.py
  batchalign/pipelines/base.py
+ batchalign/pipelines/cache.py
  batchalign/pipelines/dispatch.py
  batchalign/pipelines/pipeline.py
  batchalign/pipelines/analysis/__init__.py
@@ -169,6 +171,8 @@ batchalign/tests/pipelines/test_pipeline_models.py
  batchalign/tests/pipelines/analysis/test_eval.py
  batchalign/tests/pipelines/asr/test_asr_pipeline.py
  batchalign/tests/pipelines/asr/test_asr_utils.py
+ batchalign/tests/pipelines/cache/__init__.py
+ batchalign/tests/pipelines/cache/test_cache.py
  batchalign/tests/pipelines/cleanup/test_disfluency.py
  batchalign/tests/pipelines/cleanup/test_parse_support.py
  batchalign/tests/pipelines/fa/test_fa_pipeline.py
BatchalignHK.egg-info/requires.txt

@@ -1,4 +1,6 @@
  pydantic>=2.4
+ platformdirs>=4.3.0
+ filelock>=3.0.0
  nltk>=3.8
  praatio<6.1.0,>=6.0.0
  torch>=2.6.0
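The two new requirements line up with the cache added in this release: platformdirs for a per-user cache directory and filelock for guarding concurrent access to it. How batchalign wires them together lives in batchalign/pipelines/cache.py (not shown in this view), so the following is only a sketch of the usual pattern; the directory name and lock filename are assumptions.

    # Sketch only: the common platformdirs + filelock pattern for an on-disk cache.
    # The "batchalign" app name and "cache.lock" filename are illustrative guesses.
    from pathlib import Path

    from filelock import FileLock            # filelock>=3.0.0
    from platformdirs import user_cache_dir  # platformdirs>=4.3.0

    cache_root = Path(user_cache_dir("batchalign"))   # e.g. ~/.cache/batchalign on Linux
    cache_root.mkdir(parents=True, exist_ok=True)

    with FileLock(str(cache_root / "cache.lock")):    # serialize concurrent writers
        (cache_root / "example.entry").write_text("cached payload")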
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: BatchalignHK
- Version: 0.8.0.post6
+ Version: 0.8.1.post1
  Summary: Python Speech Language Sample Analysis
  Author: Brian MacWhinney, Houjun Liu
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -9,6 +9,8 @@ Classifier: Topic :: Utilities
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: pydantic>=2.4
+ Requires-Dist: platformdirs>=4.3.0
+ Requires-Dist: filelock>=3.0.0
  Requires-Dist: nltk>=3.8
  Requires-Dist: praatio<6.1.0,>=6.0.0
  Requires-Dist: torch>=2.6.0
batchalign/__init__.py (new file)

@@ -0,0 +1,48 @@
+ import os
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = str(1)
+
+ import logging
+
+ # clear all of nemo's loggers
+ logging.getLogger().handlers.clear()
+ logging.getLogger('nemo_logger').handlers.clear()
+ logging.getLogger().setLevel(logging.CRITICAL)
+ logging.getLogger('nemo_logger').disabled = True
+
+ from .document import *
+ from .constants import *
+ from .errors import *
+
+ # Defer slow imports
+ # from .formats import *
+ # from .pipelines import *
+ # from .models import *
+ # from .cli import batchalign as cli
+
+ def __getattr__(name):
+     if name == 'cli':
+         from .cli import batchalign
+         return batchalign
+     if name == 'BatchalignPipeline':
+         from .pipelines import BatchalignPipeline
+         return BatchalignPipeline
+     if name == 'CHATFile':
+         from .formats.chat import CHATFile
+         return CHATFile
+     # Add other common engines if needed for dispatch.py
+     if name in ['WhisperEngine', 'WhisperFAEngine', 'StanzaEngine', 'RevEngine',
+                 'NgramRetraceEngine', 'DisfluencyReplacementEngine', 'WhisperUTREngine',
+                 'RevUTREngine', 'EvaluationEngine', 'WhisperXEngine', 'NemoSpeakerEngine',
+                 'StanzaUtteranceEngine', 'CorefEngine', 'Wave2VecFAEngine', 'SeamlessTranslationModel',
+                 'GoogleTranslateEngine', 'OAIWhisperEngine', 'PyannoteEngine']:
+         from .pipelines import dispatch
+         # This is a bit recursive, let's just let dispatch import them locally
+         # which it already does now.
+         import importlib
+         # We need to find which subpackage it's in.
+         # Actually, if we use local imports in dispatch.py, we don't need these here.
+         pass
+
+     raise AttributeError(f"module {__name__} has no attribute {name}")
+
+ logging.getLogger('nemo_logger').disabled = False
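The __getattr__ hook above uses PEP 562 (module-level __getattr__): heavy submodules are imported only when the corresponding attribute is first accessed, which keeps a bare `import batchalign` fast. A minimal self-contained illustration of the same mechanism, with names that are purely illustrative rather than taken from the package:

    # lazy_pkg/__init__.py -- illustrative PEP 562 lazy loading, not batchalign's layout
    def __getattr__(name):
        # Called only when normal attribute lookup on the module fails.
        if name == "HeavyThing":
            from .heavy import HeavyThing   # deferred until first access
            return HeavyThing
        raise AttributeError(f"module {__name__} has no attribute {name}")

With this in place, `import lazy_pkg` never touches lazy_pkg.heavy, while `lazy_pkg.HeavyThing` still resolves on demand.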
batchalign/cli/cache.py (new file)

@@ -0,0 +1,263 @@
+ """
+ cache.py
+ CLI subcommand for managing the Batchalign cache.
+
+ Provides commands to:
+ - Show cache statistics (--stats)
+ - Clear all cached data (--clear)
+ - Prepopulate cache from existing CHAT files (--warm)
+ """
+
+ import os
+ from pathlib import Path
+
+ import rich_click as click
+ from rich.console import Console
+
+ C = Console()
+
+
+ def _format_bytes(count: int | None, precision: int = 2) -> str:
+     """Format byte count as human-readable string."""
+     if count is None:
+         return "unknown"
+     units = ["B", "KB", "MB", "GB", "TB"]
+     idx = 0
+     size = float(count)
+     while size >= 1024 and idx < len(units) - 1:
+         size /= 1024
+         idx += 1
+     if idx == 0:
+         return f"{int(size)} {units[idx]}"
+     return f"{size:.{precision}f} {units[idx]}"
+
+
+ @click.group(invoke_without_command=True)
+ @click.option("--stats", is_flag=True, help="Show cache statistics.")
+ @click.option(
+     "--clear",
+     is_flag=True,
+     help="Clear all cached data (requires confirmation)."
+ )
+ @click.pass_context
+ def cache(ctx, stats, clear):
+     """Manage the Batchalign cache.
+
+     The cache stores per-utterance analysis results to avoid redundant
+     computation when re-processing unchanged content.
+
+     Examples:
+         batchalign cache --stats
+         batchalign cache --clear
+         batchalign cache warm INPUT_DIR --lang eng
+     """
+     # Handle --stats flag
+     if stats:
+         ctx.invoke(show_stats)
+         return
+
+     # Handle --clear flag
+     if clear:
+         ctx.invoke(clear_cache)
+         return
+
+     # If no flags and no subcommand, show help
+     if ctx.invoked_subcommand is None:
+         click.echo(ctx.get_help())
+
+
+ @cache.command("stats")
+ def show_stats():
+     """Show cache statistics."""
+     from batchalign.pipelines.cache import CacheManager
+
+     manager = CacheManager()
+     stats = manager.stats()
+
+     C.print()
+     C.print("[bold]Batchalign Cache Statistics[/bold]")
+     C.print("-" * 35)
+     C.print(f"[cyan]Location:[/cyan] {stats['location']}")
+     C.print(f"[cyan]Size:[/cyan] {_format_bytes(stats['size_bytes'])}")
+     C.print(f"[cyan]Entries:[/cyan] {stats['total_entries']:,}")
+     C.print()
+
+     # Show breakdown by task
+     if stats["by_task"]:
+         C.print("[bold]By task:[/bold]")
+         for task, count in sorted(stats["by_task"].items()):
+             C.print(f" {task}: {count:,} entries")
+         C.print()
+
+     # Show breakdown by engine version
+     if stats["by_engine_version"]:
+         # Get current stanza version to mark outdated entries
+         try:
+             import stanza
+             current_stanza = stanza.__version__
+         except ImportError:
+             current_stanza = None
+
+         C.print("[bold]Engine versions:[/bold]")
+         for key, count in sorted(stats["by_engine_version"].items()):
+             # Check if this version is outdated
+             outdated = ""
+             if current_stanza and "morphosyntax" in key:
+                 version_part = key.split()[-1] if " " in key else ""
+                 if version_part and version_part != current_stanza:
+                     outdated = " [dim](outdated)[/dim]"
+             C.print(f" {key}: {count:,} entries{outdated}")
+         C.print()
+
+
+ @cache.command("clear")
+ @click.confirmation_option(
+     prompt="Are you sure you want to clear all cached data?"
+ )
+ def clear_cache():
+     """Clear all cached data."""
+     from batchalign.pipelines.cache import CacheManager
+
+     manager = CacheManager()
+     stats = manager.stats()
+     entries_before = stats["total_entries"]
+
+     bytes_freed = manager.clear()
+
+     C.print()
+     C.print(f"[bold green]Cache cleared.[/bold green]")
+     C.print(f" Entries removed: {entries_before:,}")
+     C.print(f" Space freed: {_format_bytes(bytes_freed)}")
+     C.print()
+
+
+ @cache.command("warm")
+ @click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
+ @click.option(
+     "--lang",
+     default="eng",
+     help="Language code (3-letter ISO). Default: eng"
+ )
+ @click.option(
+     "--retokenize/--keeptokens",
+     default=False,
+     help="Whether files were processed with retokenization."
+ )
+ def warm_cache(input_dir, lang, retokenize):
+     """Prepopulate cache from existing CHAT files with %mor/%gra tiers.
+
+     Reads CHAT files that already have morphosyntactic analysis (%mor and %gra
+     tiers) and populates the cache with their content. This allows subsequent
+     processing of identical utterances to use cached results.
+
+     IMPORTANT: The command trusts the input files. It does not validate that
+     the %mor/%gra content is correct.
+     """
+     from batchalign.pipelines.cache import (
+         CacheManager, MorphotagCacheKey, _get_batchalign_version
+     )
+     from batchalign.formats.chat import CHATFile
+     from batchalign.document import Utterance
+
+     # Get engine version
+     try:
+         import stanza
+         engine_version = stanza.__version__
+     except ImportError:
+         C.print("[bold red]Error:[/bold red] stanza is not installed. Cannot warm cache.")
+         return
+
+     manager = CacheManager()
+     key_gen = MorphotagCacheKey()
+     ba_version = _get_batchalign_version()
+
+     # Collect all .cha files
+     cha_files = []
+     for root, dirs, files in os.walk(input_dir):
+         for f in files:
+             if f.lower().endswith(".cha"):
+                 cha_files.append(os.path.join(root, f))
+
+     if not cha_files:
+         C.print(f"[bold yellow]No .cha files found in {input_dir}[/bold yellow]")
+         return
+
+     C.print(f"\nWarming cache from {len(cha_files)} CHAT file(s)...")
+     C.print(f" Language: {lang}")
+     C.print(f" Retokenize: {retokenize}")
+     C.print(f" Stanza version: {engine_version}")
+     C.print()
+
+     entries_added = 0
+     entries_skipped = 0
+     files_processed = 0
+
+     for cha_path in cha_files:
+         try:
+             cf = CHATFile(path=cha_path, special_mor_=True)
+             doc = cf.doc
+
+             # Map for batching within a file
+             utterances_to_check = []
+             idx_to_key = {}
+
+             for idx, item in enumerate(doc.content):
+                 if not isinstance(item, Utterance):
+                     continue
+
+                 # Check if utterance has morphology/dependency
+                 has_morphology = any(
+                     form.morphology and len(form.morphology) > 0
+                     for form in item.content
+                 )
+                 has_dependency = any(
+                     form.dependency and len(form.dependency) > 0
+                     for form in item.content
+                 )
+
+                 if not (has_morphology or has_dependency):
+                     continue
+
+                 # Generate cache key
+                 key = key_gen.generate_key(
+                     item,
+                     lang=lang,
+                     retokenize=retokenize,
+                     mwt={}
+                 )
+                 utterances_to_check.append((idx, key))
+                 idx_to_key[idx] = key
+
+             if not utterances_to_check:
+                 files_processed += 1
+                 continue
+
+             # Batch check
+             keys = [k for _, k in utterances_to_check]
+             cached_results = manager.get_batch(keys, "morphosyntax", engine_version)
+
+             entries_skipped += len(cached_results)
+
+             # Filter out already cached ones and prepare for batch put
+             to_put = []
+             for idx, key in utterances_to_check:
+                 if key not in cached_results:
+                     item = doc.content[idx]
+                     data = key_gen.serialize_output(item)
+                     to_put.append((key, data))
+
+             if to_put:
+                 manager.put_batch(to_put, "morphosyntax", engine_version, ba_version)
+                 entries_added += len(to_put)
+
+             files_processed += 1
+
+         except Exception as e:
+             C.print(f"[yellow]Warning:[/yellow] Could not process {cha_path}: {e}")
+             continue
+
+     C.print(f"[bold green]Cache warming complete.[/bold green]")
+     C.print(f" Files processed: {files_processed}")
+     C.print(f" Entries added: {entries_added}")
+     C.print(f" Entries skipped (already cached): {entries_skipped}")
+     C.print()
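batchalign/pipelines/cache.py itself (735 new lines) does not appear in this view, so the snippet below reuses only the calls visible in warm_cache above. It is a hedged guess at how a single utterance round-trips through the cache, not the module's documented API; the example.cha path is hypothetical.

    # Hedged sketch: one-utterance cache round trip using only calls shown in warm_cache.
    import stanza

    from batchalign.pipelines.cache import (
        CacheManager, MorphotagCacheKey, _get_batchalign_version
    )
    from batchalign.formats.chat import CHATFile
    from batchalign.document import Utterance

    doc = CHATFile(path="example.cha", special_mor_=True).doc   # hypothetical input
    utt = next(i for i in doc.content if isinstance(i, Utterance))

    key_gen = MorphotagCacheKey()
    key = key_gen.generate_key(utt, lang="eng", retokenize=False, mwt={})

    manager = CacheManager()
    engine_version = stanza.__version__
    cached = manager.get_batch([key], "morphosyntax", engine_version)

    if key not in cached:
        # Store this utterance's %mor/%gra content for future runs.
        manager.put_batch([(key, key_gen.serialize_output(utt))],
                          "morphosyntax", engine_version, _get_batchalign_version())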
batchalign/cli/cli.py

@@ -92,6 +92,9 @@ def batchalign(ctx, verbose, workers):

  batchalign.add_command(train, "models")

+ from batchalign.cli.cache import cache
+ batchalign.add_command(cache, "cache")
+
  #################### ALIGN ################################

  @batchalign.command()
@@ -254,6 +257,8 @@ def translate(ctx, in_dir, out_dir, **kwargs):
                type=click.Path(exists=True,
                                file_okay=True, dir_okay=False),
                help="Comma seperated manual lexicon override")
+ @click.option("--override-cache/--use-cache",
+               default=False, help="Bypass cache and recompute all utterances.")
  @click.pass_context
  def morphotag(ctx, in_dir, out_dir, **kwargs):
      """Perform morphosyntactic analysis on transcripts."""
batchalign/cli/dispatch.py

@@ -95,8 +95,9 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
    else:
        baL.setLevel(logging.DEBUG)

-   # Always capture output to avoid interleaving with progress rendering.
-   should_capture = True
+   # Always capture output to avoid interleaving with progress rendering,
+   # unless high verbosity is requested for debugging.
+   should_capture = verbose < 2

    if should_capture:
        # Use a temporary file to capture ALL output at the FD level
@@ -129,6 +130,7 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
    mwt = kwargs.pop("mwt", {})
    retokenize = kwargs.pop("retokenize", False)
    skipmultilang = kwargs.pop("skipmultilang", False)
+   override_cache = kwargs.pop("override_cache", False)

    cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
    doc = cf.doc
@@ -139,7 +141,8 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
    pipeline_kwargs = {
        "retokenize": retokenize,
        "skipmultilang": skipmultilang,
-       "mwt": mwt
+       "mwt": mwt,
+       "override_cache": override_cache
    }
    # Add any remaining kwargs
    pipeline_kwargs.update(kwargs)
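The comment in the hunk above refers to capturing worker output at the file-descriptor level, so that child-process and C-library writes cannot interleave with the progress display. dispatch.py's actual capture code is not part of this diff; a minimal sketch of FD-level stdout capture of the kind the comment describes:

    # Sketch of FD-level stdout capture; dispatch.py's real implementation is not shown here.
    import os
    import sys
    import tempfile

    def run_captured(fn):
        """Run fn() while process-level stdout (FD 1) is redirected into a temp file."""
        sys.stdout.flush()
        saved_fd = os.dup(1)                      # remember the original stdout FD
        with tempfile.TemporaryFile(mode="w+b") as tmp:
            os.dup2(tmp.fileno(), 1)              # point FD 1 at the temp file
            try:
                fn()
            finally:
                sys.stdout.flush()
                os.dup2(saved_fd, 1)              # restore stdout
                os.close(saved_fd)
            tmp.seek(0)
            return tmp.read().decode(errors="replace")

    print(run_captured(lambda: print("captured, not interleaved")))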
batchalign/formats/__init__.py (new file)

@@ -0,0 +1,11 @@
+ # from .chat import CHATFile
+ # from .textgrid import TextGridFile
+
+ def __getattr__(name):
+     if name == 'CHATFile':
+         from .chat import CHATFile
+         return CHATFile
+     if name == 'TextGridFile':
+         from .textgrid import TextGridFile
+         return TextGridFile
+     raise AttributeError(f"module {__name__} has no attribute {name}")
batchalign/models/__init__.py (new file)

@@ -0,0 +1,33 @@
+ # from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
+ # from .whisper import WhisperASRModel, WhisperFAModel
+ # from .speaker import NemoSpeakerModel
+ # from .utils import ASRAudioFile
+ # from .resolve import resolve
+ # from .wave2vec import Wave2VecFAModel
+
+ def __getattr__(name):
+     if name == 'BertUtteranceModel':
+         from .utterance import BertUtteranceModel
+         return BertUtteranceModel
+     if name == 'BertCantoneseUtteranceModel':
+         from .utterance import BertCantoneseUtteranceModel
+         return BertCantoneseUtteranceModel
+     if name == 'WhisperASRModel':
+         from .whisper import WhisperASRModel
+         return WhisperASRModel
+     if name == 'WhisperFAModel':
+         from .whisper import WhisperFAModel
+         return WhisperFAModel
+     if name == 'NemoSpeakerModel':
+         from .speaker import NemoSpeakerModel
+         return NemoSpeakerModel
+     if name == 'ASRAudioFile':
+         from .utils import ASRAudioFile
+         return ASRAudioFile
+     if name == 'resolve':
+         from .resolve import resolve
+         return resolve
+     if name == 'Wave2VecFAModel':
+         from .wave2vec import Wave2VecFAModel
+         return Wave2VecFAModel
+     raise AttributeError(f"module {__name__} has no attribute {name}")
batchalign/models/speaker/__init__.py (new file)

@@ -0,0 +1,7 @@
+ # from .infer import NemoSpeakerModel
+
+ def __getattr__(name):
+     if name == 'NemoSpeakerModel':
+         from .infer import NemoSpeakerModel
+         return NemoSpeakerModel
+     raise AttributeError(f"module {__name__} has no attribute {name}")
batchalign/models/utils.py

@@ -187,6 +187,37 @@ class ASRAudioFile:

        return data

+    def hash_chunk(self, begin_ms, end_ms):
+        """Generate a tiny SHA256 hash of a chunk of audio for caching."""
+        import hashlib
+        data = self.chunk(begin_ms, end_ms)
+        num_samples = data.numel()
+
+        # Tiny fingerprint: 100 samples from the middle + total length
+        if num_samples > 100:
+            mid = num_samples // 2
+            samples = data[mid-50:mid+50]
+        else:
+            samples = data
+
+        # Include length to catch simple duration changes
+        header = f"{num_samples}|".encode()
+        return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
+
+    def hash_all(self):
+        """Generate a tiny SHA256 hash of the entire audio file."""
+        import hashlib
+        num_samples = self.tensor.numel()
+
+        if num_samples > 100:
+            mid = num_samples // 2
+            samples = self.tensor[mid-50:mid+50]
+        else:
+            samples = self.tensor
+
+        header = f"{num_samples}|".encode()
+        return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
+
    def all(self):
        """Get the audio in its entirety

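hash_chunk and hash_all fingerprint only 100 samples from the middle of the audio plus the total sample count, trading collision resistance for speed on long recordings. The same idea on a plain NumPy array, for illustration only (ASRAudioFile holds a torch tensor, hence the .cpu().numpy() calls above):

    # Illustrative re-statement of the fingerprint scheme on a NumPy array.
    import hashlib
    import numpy as np

    def tiny_fingerprint(samples: np.ndarray) -> str:
        """Hash 100 middle samples plus the length, mirroring hash_chunk/hash_all."""
        n = samples.size
        window = samples[n // 2 - 50 : n // 2 + 50] if n > 100 else samples
        return hashlib.sha256(f"{n}|".encode() + window.tobytes()).hexdigest()

    audio = np.zeros(16000, dtype=np.float32)   # one second of silence at 16 kHz
    print(tiny_fingerprint(audio))              # stable across runs for identical audio

Two clips that differ only outside the sampled window hash identically, so this is a fast change detector rather than a strict integrity check.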
batchalign/models/utterance/__init__.py (new file)

@@ -0,0 +1,13 @@
+ # from .infer import BertUtteranceModel
+ # from .cantonese_infer import BertCantoneseUtteranceModel
+
+ def __getattr__(name):
+     if name == 'BertUtteranceModel':
+         from .infer import BertUtteranceModel
+         return BertUtteranceModel
+     if name == 'BertCantoneseUtteranceModel':
+         from .cantonese_infer import BertCantoneseUtteranceModel
+         return BertCantoneseUtteranceModel
+     raise AttributeError(f"module {__name__} has no attribute {name}")
+
+
batchalign/models/utterance/cantonese_infer.py

@@ -1,46 +1,38 @@
  import re
  import string
  import random
+ import logging

- # tokenization utilities
- import nltk
- from nltk import word_tokenize, sent_tokenize
-
- # torch
- import torch
- from torch.utils.data import dataset
- from torch.utils.data.dataloader import DataLoader
- from torch.optim import AdamW
-
- # import huggingface utils
- from transformers import AutoTokenizer, BertForTokenClassification
- from transformers import DataCollatorForTokenClassification
-
- # tqdm
- from tqdm import tqdm
+ L = logging.getLogger("batchalign")

  import logging
  L = logging.getLogger("batchalign")

  # seed device and tokens
- DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  # seed model
  class BertCantoneseUtteranceModel(object):

      def __init__(self, model):
+         import torch
+         from transformers import AutoTokenizer, BertForTokenClassification
+
+         device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
          # seed tokenizers and model
          self.tokenizer = AutoTokenizer.from_pretrained(model)
-         self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
+         self.model = BertForTokenClassification.from_pretrained(model).to(device)
+         self.device = device
          self.max_length = 512
          self.overlap = 20

          # eval mode
          self.model.eval()
-         L.debug(f"Model and tokenizer initialized on device: {DEVICE}")
+         L.debug(f"Model and tokenizer initialized on device: {device}")
          L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")

      def __call__(self, passage):
+         import torch
          # Step 1: Clean up passage
          passage = passage.lower()
          passage = passage.replace('.','')
@@ -81,11 +73,9 @@ class BertCantoneseUtteranceModel(object):
                  chunks.append(passage[start:])
                  break

-         # Debugging: Print number of chunks and their content
          L.debug(f"Created {len(chunks)} chunks based on keywords.")
          for i, chunk in enumerate(chunks):
-             L.debug(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
-
+             L.debug(f"Chunk {i + 1}: {chunk[:100]}...")
          # Step 3: Process each chunk and restore punctuation
          final_passage = []
          for chunk_index, chunk in enumerate(chunks):
@@ -100,7 +90,7 @@ class BertCantoneseUtteranceModel(object):
                                      truncation=True,
                                      padding=True,
                                      max_length=self.max_length,
-                                     is_split_into_words=True).to(DEVICE)
+                                     is_split_into_words=True).to(self.device)

              try:
                  # Pass it through the model
@@ -155,7 +145,7 @@ class BertCantoneseUtteranceModel(object):
          # Step 4: Join processed chunks together into the final passage
          final_passage = ' '.join(final_passage)

-         L.info("Text processing completed. Generating final output...")
+         L.debug("Text processing completed. Generating final output...")

          # Optionally, tokenize the final text into sentences based on punctuation
          def custom_sent_tokenize(text):
@@ -166,32 +156,28 @@ class BertCantoneseUtteranceModel(object):
              # Split the passage based on punctuation marks and keep them
              parts = re.split(sentence_endings, text)

-             # Debug: Output the parts after splitting
              L.debug(f"Parts after splitting: {parts}")

              # Combine parts and punctuation together
              for i in range(0, len(parts) - 1, 2):
                  sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
-                 L.debug(f"Sentence formed: {sentence}") # Debug: Output the current sentence

+                 L.debug(f"Sentence formed: {sentence}")
                  if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
                      split_passage.append(sentence)

              # If the last part doesn't have punctuation, we handle it here
              if len(parts) % 2 != 0: # If there's no punctuation at the end
                  last_part = parts[-1].strip()
-                 L.debug(f"Last part without punctuation: {last_part}") # Debug: Output the last part
-
+                 L.debug(f"Last part without punctuation: {last_part}")
+
                  if last_part: # Only add non-empty sentences
                      split_passage.append(last_part)
-
-             # Final output
              L.debug(f"Final split passage: {split_passage}")
              return split_passage

          split_passage = custom_sent_tokenize(final_passage)

-         # Debugging: Output the sentences after splitting
          L.debug(f"Final sentences: {split_passage}")

          return split_passage