BatchalignHK 0.7.23.post1__tar.gz → 0.8.0.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/BatchalignHK.egg-info/PKG-INFO +1 -1
  2. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/PKG-INFO +1 -1
  3. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/cli/cli.py +22 -21
  4. batchalignhk-0.8.0.post1/batchalign/cli/dispatch.py +419 -0
  5. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/chat/generator.py +2 -1
  6. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/ud.py +115 -81
  7. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/translate/gtrans.py +1 -0
  8. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utterance/ud_utterance.py +1 -1
  9. batchalignhk-0.8.0.post1/batchalign/utils/dp.py +225 -0
  10. batchalignhk-0.8.0.post1/batchalign/version +3 -0
  11. batchalignhk-0.7.23.post1/batchalign/cli/dispatch.py +0 -223
  12. batchalignhk-0.7.23.post1/batchalign/utils/dp.py +0 -225
  13. batchalignhk-0.7.23.post1/batchalign/version +0 -3
  14. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/BatchalignHK.egg-info/SOURCES.txt +0 -0
  15. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  16. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/BatchalignHK.egg-info/entry_points.txt +0 -0
  17. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/BatchalignHK.egg-info/requires.txt +0 -0
  18. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/BatchalignHK.egg-info/top_level.txt +0 -0
  19. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/LICENSE +0 -0
  20. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/MANIFEST.in +0 -0
  21. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/README.md +0 -0
  22. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/__init__.py +0 -0
  23. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/__main__.py +0 -0
  24. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/cli/__init__.py +0 -0
  25. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/constants.py +0 -0
  26. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/document.py +0 -0
  27. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/errors.py +0 -0
  28. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/__init__.py +0 -0
  29. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/core.py +0 -0
  30. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/exception.py +0 -0
  31. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/logging.py +0 -0
  32. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/realtime_meeting.py +0 -0
  33. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/speech_recognizer.py +0 -0
  34. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/speech_synthesizer.py +0 -0
  35. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/speech_transcriber.py +0 -0
  36. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/stream_input_tts.py +0 -0
  37. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/token.py +0 -0
  38. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/util.py +0 -0
  39. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/version.py +0 -0
  40. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/__init__.py +0 -0
  41. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_abnf.py +0 -0
  42. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_app.py +0 -0
  43. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_cookiejar.py +0 -0
  44. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_core.py +0 -0
  45. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_exceptions.py +0 -0
  46. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_handshake.py +0 -0
  47. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_http.py +0 -0
  48. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_logging.py +0 -0
  49. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_socket.py +0 -0
  50. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_ssl_compat.py +0 -0
  51. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_url.py +0 -0
  52. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/_utils.py +0 -0
  53. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/__init__.py +0 -0
  54. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/echo-server.py +0 -0
  55. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/test_abnf.py +0 -0
  56. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/test_app.py +0 -0
  57. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/test_cookiejar.py +0 -0
  58. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/test_http.py +0 -0
  59. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/test_url.py +0 -0
  60. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/extern/nls/websocket/tests/test_websocket.py +0 -0
  61. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/__init__.py +0 -0
  62. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/base.py +0 -0
  63. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/chat/__init__.py +0 -0
  64. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/chat/file.py +0 -0
  65. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/chat/lexer.py +0 -0
  66. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/chat/parser.py +0 -0
  67. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/chat/utils.py +0 -0
  68. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/textgrid/__init__.py +0 -0
  69. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/textgrid/file.py +0 -0
  70. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/textgrid/generator.py +0 -0
  71. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/formats/textgrid/parser.py +0 -0
  72. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/__init__.py +0 -0
  73. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/resolve.py +0 -0
  74. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/speaker/__init__.py +0 -0
  75. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/speaker/config.yaml +0 -0
  76. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/speaker/infer.py +0 -0
  77. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/speaker/utils.py +0 -0
  78. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/training/__init__.py +0 -0
  79. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/training/run.py +0 -0
  80. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/training/utils.py +0 -0
  81. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utils.py +0 -0
  82. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utterance/__init__.py +0 -0
  83. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utterance/cantonese_infer.py +0 -0
  84. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utterance/dataset.py +0 -0
  85. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utterance/execute.py +0 -0
  86. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utterance/infer.py +0 -0
  87. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utterance/prep.py +0 -0
  88. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/utterance/train.py +0 -0
  89. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/wave2vec/__init__.py +0 -0
  90. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/wave2vec/infer_fa.py +0 -0
  91. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/whisper/__init__.py +0 -0
  92. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/whisper/infer_asr.py +0 -0
  93. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/models/whisper/infer_fa.py +0 -0
  94. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/__init__.py +0 -0
  95. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/analysis/__init__.py +0 -0
  96. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/analysis/eval.py +0 -0
  97. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/__init__.py +0 -0
  98. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/aliyun.py +0 -0
  99. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/funaudio.py +0 -0
  100. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2chinese.py +0 -0
  101. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
  102. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
  103. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
  104. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
  105. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
  106. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
  107. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
  108. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
  109. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
  110. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
  111. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/por.py +0 -0
  112. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
  113. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
  114. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  115. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/rev.py +0 -0
  116. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/tencent.py +0 -0
  117. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/utils.py +0 -0
  118. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/whisper.py +0 -0
  119. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/asr/whisperx.py +0 -0
  120. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/avqi/__init__.py +0 -0
  121. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/avqi/engine.py +0 -0
  122. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/base.py +0 -0
  123. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/__init__.py +0 -0
  124. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  125. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  126. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  127. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/retrace.py +0 -0
  128. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  129. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  130. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/cleanup/support/test.test +0 -0
  131. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/diarization/__init__.py +0 -0
  132. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/diarization/pyannote.py +0 -0
  133. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/dispatch.py +0 -0
  134. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/fa/__init__.py +0 -0
  135. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/fa/iic_fa.py +0 -0
  136. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  137. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/fa/wave2vec_fa_canto.py +0 -0
  138. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  139. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  140. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  141. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  142. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  143. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  144. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  145. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  146. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/opensmile/__init__.py +0 -0
  147. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/opensmile/engine.py +0 -0
  148. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/pipeline.py +0 -0
  149. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/speaker/__init__.py +0 -0
  150. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  151. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/translate/__init__.py +0 -0
  152. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/translate/seamless.py +0 -0
  153. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/translate/utils.py +0 -0
  154. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utr/__init__.py +0 -0
  155. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utr/funaudio_utr.py +0 -0
  156. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utr/rev_utr.py +0 -0
  157. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utr/tencent_utr.py +0 -0
  158. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utr/utils.py +0 -0
  159. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  160. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/pipelines/utterance/__init__.py +0 -0
  161. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/__init__.py +0 -0
  162. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/conftest.py +0 -0
  163. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  164. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  165. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  166. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  167. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  168. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  169. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  170. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  171. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  172. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  173. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  174. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  175. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/fixures.py +0 -0
  176. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  177. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  178. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/tests/test_document.py +0 -0
  179. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/utils/__init__.py +0 -0
  180. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/utils/abbrev.py +0 -0
  181. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/utils/compounds.py +0 -0
  182. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/utils/config.py +0 -0
  183. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/utils/names.py +0 -0
  184. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/batchalign/utils/utils.py +0 -0
  185. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/setup.cfg +0 -0
  186. {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0.post1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: BatchalignHK
3
- Version: 0.7.23.post1
3
+ Version: 0.8.0.post1
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: BatchalignHK
3
- Version: 0.7.23.post1
3
+ Version: 0.8.0.post1
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -3,37 +3,23 @@ cli.py
3
3
  The Batchalign command-line interface
4
4
  """
5
5
 
6
- import multiprocessing
7
6
  import rich_click as click
8
7
  import functools
9
8
 
10
9
  import os
11
- from glob import glob
12
10
 
13
- from multiprocessing import Process, freeze_support
14
-
15
- from batchalign.pipelines import BatchalignPipeline
11
+ from multiprocessing import freeze_support
16
12
 
13
+ from pathlib import Path
17
14
  from rich.traceback import install
18
15
  from rich.console import Console
19
- from rich.panel import Panel
20
- from pathlib import Path
21
- from batchalign.document import *
22
- from batchalign.formats.chat import CHATFile
23
- from batchalign.utils import config
24
16
  from rich.logging import RichHandler
25
17
 
26
18
  from batchalign.cli.dispatch import _dispatch
27
19
  from batchalign.models.training.run import cli as train
28
20
 
29
- from enum import Enum
30
-
31
- import traceback
32
-
33
21
  import pyfiglet
34
- from rich import pretty
35
- import logging as L
36
- baL = L.getLogger('batchalign')
22
+ import logging as L
37
23
 
38
24
  C = Console()
39
25
 
@@ -62,7 +48,7 @@ def handle_verbosity(verbosity):
62
48
  L.getLogger('stanza').handlers.clear()
63
49
  L.getLogger('transformers').handlers.clear()
64
50
  L.getLogger('nemo_logger').handlers.clear()
65
- L.getLogger("stanza").setLevel(L.INFO)
51
+ L.getLogger("stanza").setLevel(L.WARN)
66
52
  L.getLogger('nemo_logger').setLevel(L.CRITICAL)
67
53
  L.getLogger('batchalign').setLevel(L.WARN)
68
54
  L.getLogger('lightning.pytorch.utilities.migration.utils').setLevel(L.ERROR)
@@ -73,6 +59,7 @@ def handle_verbosity(verbosity):
73
59
  L.getLogger('batchalign').setLevel(L.INFO)
74
60
  if verbosity >= 3:
75
61
  L.getLogger('batchalign').setLevel(L.DEBUG)
62
+ L.getLogger("stanza").setLevel(L.INFO)
76
63
  if verbosity >= 4:
77
64
  L.getLogger('batchalign').setLevel(L.DEBUG)
78
65
  L.getLogger('transformers').setLevel(L.INFO)
@@ -81,7 +68,8 @@ def handle_verbosity(verbosity):
81
68
  @click.pass_context
82
69
  @click.version_option(VERSION_NUMBER)
83
70
  @click.option("-v", "--verbose", type=int, count=True, default=0, help="How loquacious Batchalign should be.")
84
- def batchalign(ctx, verbose):
71
+ @click.option("--workers", type=int, default=os.cpu_count(), help="Number of worker processes to use.")
72
+ def batchalign(ctx, verbose, workers):
85
73
  """process .cha and/or audio files in IN_DIR and dumps them to OUT_DIR using recipe COMMAND"""
86
74
 
87
75
  ## setup commands ##
@@ -93,7 +81,9 @@ def batchalign(ctx, verbose):
93
81
  handle_verbosity(verbose)
94
82
  # add to arguments
95
83
  ctx.obj["verbose"] = verbose
84
+ ctx.obj["workers"] = workers
96
85
  # setup config
86
+ from batchalign.utils import config
97
87
  ctx.obj["config"] = config.config_read(True)
98
88
  # make everything look better
99
89
  # pretty.install()
@@ -122,6 +112,7 @@ batchalign.add_command(train, "models")
122
112
  @click.pass_context
123
113
  def align(ctx, in_dir, out_dir, whisper, wav2vec, iic, wav2vec_yue, tencent, funaudio, **kwargs):
124
114
  """Align transcripts against corresponding media files."""
115
+ from batchalign.formats.chat import CHATFile
125
116
  def loader(file):
126
117
  return (
127
118
  CHATFile(path=os.path.abspath(file)).doc,
@@ -180,6 +171,8 @@ def align(ctx, in_dir, out_dir, whisper, wav2vec, iic, wav2vec_yue, tencent, fun
180
171
  @click.pass_context
181
172
  def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
182
173
  """Create a transcript from audio files."""
174
+ from batchalign.document import CustomLine, CustomLineType
175
+ from batchalign.formats.chat import CHATFile
183
176
  def loader(file):
184
177
  return file
185
178
 
@@ -229,6 +222,7 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
229
222
  @click.pass_context
230
223
  def translate(ctx, in_dir, out_dir, **kwargs):
231
224
  """Translate the transcript to English."""
225
+ from batchalign.formats.chat import CHATFile
232
226
 
233
227
  def loader(file):
234
228
  cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
@@ -259,6 +253,7 @@ def translate(ctx, in_dir, out_dir, **kwargs):
259
253
  @click.pass_context
260
254
  def morphotag(ctx, in_dir, out_dir, **kwargs):
261
255
  """Perform morphosyntactic analysis on transcripts."""
256
+ from batchalign.formats.chat import CHATFile
262
257
 
263
258
  def loader(file):
264
259
  mwt = {}
@@ -285,7 +280,7 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
285
280
 
286
281
  _dispatch("morphotag", "eng", 1, ["cha"], ctx,
287
282
  in_dir, out_dir,
288
- loader, writer, C)
283
+ loader, writer, C, **kwargs)
289
284
 
290
285
 
291
286
  #################### MORPHOTAG ################################
@@ -295,6 +290,7 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
295
290
  @click.pass_context
296
291
  def coref(ctx, in_dir, out_dir, **kwargs):
297
292
  """Perform coreference analysis on transcripts."""
293
+ from batchalign.formats.chat import CHATFile
298
294
 
299
295
  def loader(file):
300
296
  cf = CHATFile(path=os.path.abspath(file))
@@ -322,6 +318,7 @@ def coref(ctx, in_dir, out_dir, **kwargs):
322
318
  @click.pass_context
323
319
  def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
324
320
  """Perform morphosyntactic analysis on transcripts."""
321
+ from batchalign.formats.chat import CHATFile
325
322
 
326
323
  def loader(file):
327
324
  return CHATFile(path=os.path.abspath(file)).doc
@@ -354,6 +351,7 @@ def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
354
351
  @click.pass_context
355
352
  def benchmark(ctx, in_dir, out_dir, lang, num_speakers, whisper, tencent, funaudio, whisper_oai, **kwargs):
356
353
  """Benchmark ASR utilities for their word accuracy"""
354
+ from batchalign.formats.chat import CHATFile
357
355
  def loader(file):
358
356
  # try to find a .cha in the same directory
359
357
  p = Path(file)
@@ -397,6 +395,7 @@ def avqi(ctx, input_dir, output_dir, lang, **kwargs):
397
395
  """Calculate AVQI from paired .cs and .sv audio files in input directory."""
398
396
 
399
397
  from batchalign.pipelines.avqi import AVQIEngine
398
+ from batchalign.document import Document
400
399
  from pathlib import Path
401
400
  import os
402
401
 
@@ -464,6 +463,7 @@ def avqi(ctx, input_dir, output_dir, lang, **kwargs):
464
463
  @click.pass_context
465
464
  def opensmile(ctx, input_dir, output_dir, feature_set, lang, **kwargs):
466
465
  """Extract openSMILE audio features from speech samples."""
466
+ from batchalign.document import Document
467
467
 
468
468
  def loader(file):
469
469
  doc = Document.new(media_path=file, lang=lang)
@@ -491,6 +491,7 @@ def opensmile(ctx, input_dir, output_dir, feature_set, lang, **kwargs):
491
491
  def setup(ctx):
492
492
  """Reconfigure Batchalign settings, such as Rev.AI key."""
493
493
 
494
+ from batchalign.utils import config
494
495
  config.interactive_setup()
495
496
 
496
497
  #################### VERSION ################################
@@ -503,5 +504,5 @@ def version(ctx, **kwargs):
503
504
  ptr = (pyfiglet.figlet_format("Batchalign2")+"\n" +
504
505
  f"Version: [bold]{VERSION_NUMBER.strip()}[/bold], released {RELEASE_DATE.strip()}\n" +
505
506
  f"[italic]{RELEASE_NOTES.strip()}[/italic]"+"\n" +
506
- "\nDeveloped by Brian MacWhinney and Houjun Liu")
507
+ "\nDeveloped by Brian MacWhinney and Houjun Liu\ncontributions from Sebastian Song and Franklin Chen")
507
508
  C.print("\n\n"+ptr+"\n\n")
@@ -0,0 +1,419 @@
1
+ """
2
+ dispatch.py
3
+ CLI runner dispatch. Essentially the translation layer between `command` in CLI
4
+ and actual BatchalignPipeline.
5
+ """
6
+
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn, BarColumn
8
+ from urllib.parse import urlparse
9
+
10
+ import warnings
11
+
12
+ import shutil
13
+ import os
14
+ import glob
15
+ import queue
16
+
17
+ from rich.console import Console
18
+ from rich.markup import escape
19
+
20
+ from pathlib import Path
21
+
22
+ import concurrent.futures
23
+ import multiprocessing
24
+ from functools import partial
25
+
26
+ # Oneliner of directory-based glob and replace
27
+ globase = lambda path, statement: glob(os.path.join(path, statement))
28
+ repath_file = lambda file_path, new_dir: os.path.join(new_dir, Path(file_path).name)
29
+
30
+ import tempfile
31
+ import time
32
+
33
+ import traceback
34
+ import logging as L
35
+ baL = L.getLogger('batchalign')
36
+
37
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
38
+
39
+ # Global cache for the pipeline in worker processes
40
+ _worker_pipeline = None
41
+
42
+ def _get_worker_pipeline(command, lang, num_speakers, **kwargs):
43
+ global _worker_pipeline
44
+ if _worker_pipeline is None:
45
+ from batchalign.pipelines import BatchalignPipeline
46
+ _worker_pipeline = BatchalignPipeline.new(Cmd2Task[command],
47
+ lang=lang, num_speakers=num_speakers, **kwargs)
48
+ return _worker_pipeline
49
+
50
+ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_info, progress_queue=None, verbose=0, **kwargs):
51
+ """The task executed in each worker process."""
52
+ import sys
53
+ import os
54
+ import tempfile
55
+ import logging
56
+
57
+ file, output = file_info
58
+ pid = os.getpid()
59
+
60
+ # Configure logging in this worker process
61
+ if verbose >= 1:
62
+ # Ensure basicConfig is called so logging works
63
+ logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.ERROR)
64
+
65
+ # Configure batchalign logger level in this worker process
66
+ baL = logging.getLogger('batchalign')
67
+ if verbose == 0:
68
+ baL.setLevel(logging.WARN)
69
+ elif verbose == 1:
70
+ baL.setLevel(logging.INFO)
71
+ else:
72
+ baL.setLevel(logging.DEBUG)
73
+
74
+ # Only capture output if not in verbose mode
75
+ # In verbose mode, let logs stream naturally to the console
76
+ should_capture = verbose == 0
77
+
78
+ if should_capture:
79
+ # Use a temporary file to capture ALL output at the FD level
80
+ # This is the most robust way to prevent interleaved output
81
+ log_file = tempfile.TemporaryFile(mode='w+')
82
+ old_stdout_fd = os.dup(sys.stdout.fileno())
83
+ old_stderr_fd = os.dup(sys.stderr.fileno())
84
+
85
+ # Redirect FD 1 and 2 to our temp file
86
+ os.dup2(log_file.fileno(), sys.stdout.fileno())
87
+ os.dup2(log_file.fileno(), sys.stderr.fileno())
88
+
89
+ try:
90
+ pipeline = _get_worker_pipeline(command, lang, num_speakers, **kwargs)
91
+
92
+ def progress_callback(completed, total, tasks):
93
+ if not progress_queue:
94
+ return
95
+ try:
96
+ progress_queue.put((file, completed, total, tasks))
97
+ except Exception:
98
+ pass
99
+
100
+ # For now, we'll re-import what we need
101
+ from batchalign.formats.chat import CHATFile
102
+
103
+ # Morphosyntax specific loader/writer logic moved here for picklability
104
+ if command == "morphotag":
105
+ # Extract morphotag-specific arguments from kwargs
106
+ mwt = kwargs.pop("mwt", {})
107
+ retokenize = kwargs.pop("retokenize", False)
108
+ skipmultilang = kwargs.pop("skipmultilang", False)
109
+
110
+ cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
111
+ doc = cf.doc
112
+ if str(cf).count("%mor") > 0:
113
+ doc.ba_special_["special_mor_notation"] = True
114
+
115
+ # Prepare arguments for the pipeline
116
+ pipeline_kwargs = {
117
+ "retokenize": retokenize,
118
+ "skipmultilang": skipmultilang,
119
+ "mwt": mwt
120
+ }
121
+ # Add any remaining kwargs
122
+ pipeline_kwargs.update(kwargs)
123
+
124
+ # Process
125
+ doc = pipeline(doc, callback=progress_callback, **pipeline_kwargs)
126
+
127
+ # Write
128
+ CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)
129
+
130
+ # Add other commands as needed, or use a more generic registry
131
+ elif command == "align":
132
+ cf = CHATFile(path=os.path.abspath(file))
133
+ doc = cf.doc
134
+ kw = {"pauses": kwargs.get("pauses", False)}
135
+ doc = pipeline(doc, callback=progress_callback, **kw)
136
+ CHATFile(doc=doc).write(output, write_wor=kwargs.get("wor", True))
137
+
138
+ else:
139
+ loader, writer = loader_info, writer_info
140
+ doc = loader(os.path.abspath(file))
141
+ kw = {}
142
+ if isinstance(doc, tuple) and len(doc) > 1:
143
+ doc, kw = doc
144
+ doc = pipeline(doc, callback=progress_callback, **kw)
145
+ writer(doc, output)
146
+
147
+ # Flush and read captured output if we were capturing
148
+ if should_capture:
149
+ sys.stdout.flush()
150
+ sys.stderr.flush()
151
+ log_file.seek(0)
152
+ captured = log_file.read()
153
+ else:
154
+ captured = ""
155
+
156
+ return file, None, None, captured
157
+ except Exception as e:
158
+ # Flush and read captured output if we were capturing
159
+ if should_capture:
160
+ sys.stdout.flush()
161
+ sys.stderr.flush()
162
+ log_file.seek(0)
163
+ captured = log_file.read()
164
+ else:
165
+ captured = ""
166
+ return file, traceback.format_exc(), e, captured
167
+ finally:
168
+ # Restore original FDs only if we redirected them
169
+ if should_capture:
170
+ os.dup2(old_stdout_fd, sys.stdout.fileno())
171
+ os.dup2(old_stderr_fd, sys.stderr.fileno())
172
+ os.close(old_stdout_fd)
173
+ os.close(old_stderr_fd)
174
+ log_file.close()
175
+
176
# Lookup table from each CLI command name to the BatchalignPipeline
# task string (comma-separated when several tasks run) it executes.
Cmd2Task = dict(
    align="fa",
    transcribe="asr",
    transcribe_s="asr,speaker",
    morphotag="morphosyntax",
    benchmark="asr,eval",
    utseg="utterance",
    coref="coref",
    translate="translate",
    opensmile="opensmile",
)
189
+
190
+ # this is the main runner used by all functions
191
def _dispatch(command, lang, num_speakers,
              extensions, ctx, in_dir, out_dir,
              loader:callable, writer:callable, console,
              **kwargs):
    """Run `command` over the inputs found in `in_dir`, using worker processes.

    The input tree is walked and mirrored under `out_dir`:

    * files whose extension is in ``FORCED_CONVERSION`` are first converted
      to ``.wav`` next to the original (requires ffmpeg);
    * ``.cha`` files carrying one of the "dummy" markers are copied verbatim;
    * files whose extension is in `extensions` are queued for processing;
    * everything else is copied verbatim.

    Queued files are handed to `_worker_task` through a
    `ProcessPoolExecutor`, largest file first, while a progress display is
    rendered on `console`. Failures are collected and printed at the end.

    Args:
        command: name of the CLI command being run.
        lang: language argument forwarded to the worker.
        num_speakers: speaker count forwarded to the worker.
        extensions: collection of lower-case extensions (without the dot)
            that should be processed rather than copied.
        ctx: CLI context; ``ctx.obj["verbose"]`` is read, and
            ``ctx.obj.get("workers")`` is used as a worker-count fallback.
        in_dir: directory walked for inputs.
        out_dir: directory receiving the mirrored outputs.
        loader: callable forwarded to the worker as ``loader_info`` when it
            can be pickled.
        writer: callable forwarded to the worker as ``writer_info`` when it
            can be pickled.
        console: console object used for all user-facing output.
        **kwargs: forwarded unchanged to the worker. ``data`` (path to a text
            file listing one URL per line) and ``num_workers`` are also read
            here.

    Raises:
        ValueError: if a file needs conversion and ffmpeg is not on the PATH.
    """
    import pickle

    from batchalign.constants import FORCED_CONVERSION
    from batchalign.document import TaskFriendlyName

    C = console
    verbose = ctx.obj["verbose"]

    def display_name(entry):
        # URL inputs are ParseResult tuples, which Path() cannot accept.
        if isinstance(entry, str):
            return Path(entry).name
        return os.path.basename(entry.path)

    def input_size(pair):
        # Only local paths have a size on disk; URLs sort last.
        source = pair[0]
        if isinstance(source, str) and os.path.exists(source):
            return os.path.getsize(source)
        return 0

    def picklable_or_none(fn):
        # Arguments to a process pool must be picklable. A callable that is
        # not (e.g. a closure) is replaced by None so that submitting work
        # does not fail for commands that never use it.
        try:
            pickle.dumps(fn)
        except (pickle.PicklingError, AttributeError, TypeError):
            return None
        return fn

    def render_stage(stage_tasks):
        if not stage_tasks:
            return "Processing..."
        if not isinstance(stage_tasks, (list, tuple)):
            stage_tasks = [stage_tasks]
        return ", ".join(TaskFriendlyName.get(task, str(task)) for task in stage_tasks)

    # inputs to process and, index for index, where each result is written
    files = []
    outputs = []

    # optional listing of remote inputs, one URL per line
    listing = kwargs.get("data")
    if listing:
        with open(listing.strip()) as fh:
            urls = [line.strip() for line in fh if line.strip() != ""]
        for raw in urls:
            parsed = urlparse(raw)
            if parsed.scheme == "":
                parsed = parsed._replace(scheme="http")
            files.append(parsed)
            outputs.append(os.path.join(out_dir, os.path.basename(parsed.path)))

    # get files by walking the directory
    for basedir, _, fs in os.walk(in_dir):
        for f in fs:
            path = Path(os.path.join(basedir, f))
            ext = path.suffix.strip(".").strip().lower()

            # calculate input path, convert if needed
            inp_path = str(path)
            if ext in FORCED_CONVERSION:
                # check for ffmpeg
                if not shutil.which("ffmpeg"):
                    raise ValueError(f"ffmpeg not found in Path! Cannot load input media at {inp_path}.\nHint: Please convert your input audio sample to .wav before proceeding witch Batchalign, or install ffmpeg (https://ffmpeg.org/download.html)")
                # convert
                from pydub import AudioSegment
                # Swap only the real suffix. A textual replace of ".{ext}"
                # misses upper-case suffixes (ext is lower-cased), which made
                # the export overwrite the source file, and could also touch
                # a matching fragment elsewhere in the path.
                wav_path = str(path.with_suffix(".wav"))
                seg = AudioSegment.from_file(inp_path, ext)
                seg.export(wav_path, format="wav")
                inp_path = wav_path

            # repath the file to the output
            rel = os.path.relpath(inp_path, in_dir)
            repathed = Path(os.path.join(out_dir, rel))
            # make the repathed dir, if it doesn't exist
            os.makedirs(repathed.parent.absolute(), exist_ok=True)

            # HACK check for @Options:\tdummy in the file
            # and simply copy it
            if ext == "cha":
                with open(inp_path, 'r', encoding="utf-8") as df:
                    text = df.read()
                if ("@Options:\tdummy" in text or
                        "This is a dummy file to permit playback from the TalkBank browser" in text):
                    shutil.copy2(inp_path, str(repathed))
                    continue

            # if the file needs to get processed, append it to the list
            # to be processed and compute the output
            if ext in extensions:
                # a local file supersedes a URL entry with the same stem
                for indx, queued in enumerate(files):
                    if (not isinstance(queued, str) and
                            Path(queued.geturl()).stem == Path(inp_path).stem):
                        files.pop(indx)
                        outputs.pop(indx)
                        break

                files.append(inp_path)
                outputs.append(str(repathed))
            # otherwise just copy the file
            else:
                shutil.copy2(inp_path, str(repathed))

    __tf = None
    # at high verbosity the progress display is sent to a scratch file
    if verbose > 1:
        __tf = tempfile.NamedTemporaryFile(delete=True, mode='w')
        C = Console(file=__tf)

    # process largest inputs first to avoid late stragglers
    file_pairs = sorted(zip(files, outputs), key=input_size, reverse=True)
    files, outputs = zip(*file_pairs) if file_pairs else ([], [])

    C.print(f"\nMode: [blue]{command}[/blue]; got [bold cyan]{len(files)}[/bold cyan] transcript{'s' if len(files) != 1 else ''} to process from {in_dir}:\n")

    # Determine number of workers; any source may be missing or None
    num_workers = (kwargs.get("num_workers")
                   or ctx.obj.get("workers")
                   or os.cpu_count()
                   or 1)

    # Pre-download stanza resources if needed to avoid interleaved downloads in workers
    if command in ["morphotag", "utseg", "coref"]:
        try:
            import stanza
            stanza.download_resources_json()
        except Exception:
            # best effort: workers will fetch the resources themselves
            pass

    # For some commands or environments, we might want to limit this
    if command in ["transcribe", "transcribe_s"]:
        num_workers = min(num_workers, 2)  # GPU memory limits

    C.print(f"Using [bold]{num_workers}[/bold] worker processes.\n")

    manager = multiprocessing.Manager() if files else None
    progress_queue = manager.Queue() if manager else None

    # create the spinner
    prog = Progress(SpinnerColumn(), *Progress.get_default_columns()[:-1],
                    TimeElapsedColumn(),
                    TextColumn("[cyan]{task.fields[processor]}[/cyan]"), console=C)
    errors = []

    try:
        with prog as prog:
            tasks = {}
            task_totals = {}

            for f in files:
                tasks[f] = prog.add_task(display_name(f), start=False, total=1, processor="Waiting...")
                task_totals[f] = 1

            def drain_progress_queue():
                # apply every progress message currently waiting in the queue
                if progress_queue is None:
                    return
                while True:
                    try:
                        file, completed, total, stage_tasks = progress_queue.get_nowait()
                    except Exception:
                        # queue.Empty means we are caught up; any other
                        # failure just ends this drain pass
                        break
                    if file not in tasks:
                        continue
                    task_total = max(int(total) if total else task_totals.get(file, 1), 1)
                    task_totals[file] = task_total
                    prog.update(tasks[file],
                                total=task_total,
                                completed=min(int(completed), task_total),
                                processor=render_stage(stage_tasks))

            with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
                worker_func = partial(_worker_task,
                                      command=command,
                                      lang=lang,
                                      num_speakers=num_speakers,
                                      loader_info=picklable_or_none(loader),
                                      writer_info=picklable_or_none(writer),
                                      progress_queue=progress_queue,
                                      verbose=verbose,
                                      **kwargs)

                future_to_file = {executor.submit(worker_func, (f, o)): f for f, o in zip(files, outputs)}

                for f in files:
                    prog.start_task(tasks[f])
                    prog.update(tasks[f], processor="Processing...")

                pending = set(future_to_file)
                while pending:
                    # short timeout so the progress display keeps refreshing
                    done, pending = concurrent.futures.wait(
                        pending,
                        timeout=0.1,
                        return_when=concurrent.futures.FIRST_COMPLETED,
                    )
                    drain_progress_queue()

                    for future in done:
                        file = future_to_file[future]
                        final_total = max(task_totals.get(file, 1), 1)
                        try:
                            res_file, trcbk, e, captured = future.result()
                        except Exception as exc:
                            # the worker process itself failed (or the job
                            # could not be sent to it)
                            prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold red]FAIL[/bold red]")
                            errors.append((file, traceback.format_exc(), exc, ""))
                            continue
                        if e:
                            prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold red]FAIL[/bold red]")
                            errors.append((res_file, trcbk, e, captured))
                        else:
                            prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold green]DONE[/bold green]")
                            if verbose >= 1 and captured.strip():
                                errors.append((res_file, "Logs only (Success)", None, captured))

                drain_progress_queue()
    finally:
        if manager:
            manager.shutdown()

    if len(errors) > 0:
        C.print()
        for file, trcbk, e, captured in errors:
            if isinstance(file, str):
                rel_path = os.path.relpath(str(Path(file).absolute()), in_dir)
            else:
                # URL inputs have no location relative to in_dir
                rel_path = file.geturl()
            if e:
                C.print(f"[bold red]ERROR[/bold red] on file [italic]{rel_path}[/italic]: {escape(str(e))}\n")
                if captured.strip():
                    C.print(f"[dim]Captured Worker Output:[/dim]\n{escape(captured.strip())}\n")
                if verbose == 1:
                    C.print(escape(str(trcbk)))
                elif verbose > 1:
                    # C points at the scratch file here; use a fresh console
                    Console().print(escape(str(trcbk)))
            elif captured.strip():
                C.print(f"[bold blue]INFO[/bold blue] on file [italic]{rel_path}[/italic]:\n")
                C.print(f"{escape(captured.strip())}\n")
    else:
        C.print(f"\nAll done. Results saved to {out_dir}!\n")

    if verbose > 1:
        C.end_capture()

    if __tf:
        __tf.close()
@@ -42,10 +42,11 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
42
42
  main_line = re.sub(r"(?:[a-z]) ?\(([a-z]+) ?\)", r"(\1)", main_line)
43
43
  main_line = re.sub(r"([a-z]) _", r"\1_", main_line)
44
44
  main_line = re.sub(r" ", r" ", main_line)
45
+ main_line = re.sub(r"^,", "", main_line.strip()) # remove initial commas
45
46
  main_line = re.sub(r"«", "“", main_line)
46
47
  main_line = re.sub(r"»", "”", main_line)
47
48
  main_line = re.sub(r"—", "-", main_line)
48
- main_line = re.sub(r"–", "-", main_line)
49
+ main_line = re.sub(r"–", "-", main_line).strip()
49
50
  tier = utterance.tier
50
51
 
51
52
  mors = []