batchalign 0.7.22.post20__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of batchalign might be problematic. Click here for more details.

Files changed (146) hide show
  1. {batchalign-0.7.22.post20/batchalign.egg-info → batchalign-0.8.0}/PKG-INFO +1 -1
  2. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/cli/cli.py +22 -21
  3. batchalign-0.8.0/batchalign/cli/dispatch.py +364 -0
  4. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/ud.py +115 -81
  5. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/utterance/ud_utterance.py +1 -1
  6. batchalign-0.8.0/batchalign/utils/dp.py +225 -0
  7. batchalign-0.8.0/batchalign/version +3 -0
  8. {batchalign-0.7.22.post20 → batchalign-0.8.0/batchalign.egg-info}/PKG-INFO +1 -1
  9. batchalign-0.7.22.post20/batchalign/cli/dispatch.py +0 -196
  10. batchalign-0.7.22.post20/batchalign/utils/dp.py +0 -225
  11. batchalign-0.7.22.post20/batchalign/version +0 -3
  12. {batchalign-0.7.22.post20 → batchalign-0.8.0}/LICENSE +0 -0
  13. {batchalign-0.7.22.post20 → batchalign-0.8.0}/MANIFEST.in +0 -0
  14. {batchalign-0.7.22.post20 → batchalign-0.8.0}/README.md +0 -0
  15. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/__init__.py +0 -0
  16. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/__main__.py +0 -0
  17. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/cli/__init__.py +0 -0
  18. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/constants.py +0 -0
  19. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/document.py +0 -0
  20. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/errors.py +0 -0
  21. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/__init__.py +0 -0
  22. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/base.py +0 -0
  23. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/chat/__init__.py +0 -0
  24. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/chat/file.py +0 -0
  25. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/chat/generator.py +0 -0
  26. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/chat/lexer.py +0 -0
  27. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/chat/parser.py +0 -0
  28. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/chat/utils.py +0 -0
  29. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/textgrid/__init__.py +0 -0
  30. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/textgrid/file.py +0 -0
  31. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/textgrid/generator.py +0 -0
  32. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/formats/textgrid/parser.py +0 -0
  33. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/__init__.py +0 -0
  34. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/resolve.py +0 -0
  35. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/speaker/__init__.py +0 -0
  36. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/speaker/config.yaml +0 -0
  37. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/speaker/infer.py +0 -0
  38. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/speaker/utils.py +0 -0
  39. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/training/__init__.py +0 -0
  40. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/training/run.py +0 -0
  41. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/training/utils.py +0 -0
  42. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utils.py +0 -0
  43. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utterance/__init__.py +0 -0
  44. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utterance/cantonese_infer.py +0 -0
  45. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utterance/dataset.py +0 -0
  46. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utterance/execute.py +0 -0
  47. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utterance/infer.py +0 -0
  48. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utterance/prep.py +0 -0
  49. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/utterance/train.py +0 -0
  50. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/wave2vec/__init__.py +0 -0
  51. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/wave2vec/infer_fa.py +0 -0
  52. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/whisper/__init__.py +0 -0
  53. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/whisper/infer_asr.py +0 -0
  54. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/models/whisper/infer_fa.py +0 -0
  55. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/__init__.py +0 -0
  56. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/analysis/__init__.py +0 -0
  57. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/analysis/eval.py +0 -0
  58. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/__init__.py +0 -0
  59. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2chinese.py +0 -0
  60. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
  61. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
  62. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
  63. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
  64. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
  65. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
  66. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
  67. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
  68. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
  69. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
  70. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/por.py +0 -0
  71. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
  72. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
  73. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  74. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/rev.py +0 -0
  75. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/utils.py +0 -0
  76. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/whisper.py +0 -0
  77. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/asr/whisperx.py +0 -0
  78. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/avqi/__init__.py +0 -0
  79. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/avqi/engine.py +0 -0
  80. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/base.py +0 -0
  81. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/__init__.py +0 -0
  82. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  83. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  84. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  85. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/retrace.py +0 -0
  86. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  87. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  88. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/cleanup/support/test.test +0 -0
  89. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/diarization/__init__.py +0 -0
  90. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/diarization/pyannote.py +0 -0
  91. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/dispatch.py +0 -0
  92. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/fa/__init__.py +0 -0
  93. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  94. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  95. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  96. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  97. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  98. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  99. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  100. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  101. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  102. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/opensmile/__init__.py +0 -0
  103. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/opensmile/engine.py +0 -0
  104. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/pipeline.py +0 -0
  105. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/speaker/__init__.py +0 -0
  106. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  107. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/translate/__init__.py +0 -0
  108. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/translate/gtrans.py +0 -0
  109. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/translate/seamless.py +0 -0
  110. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/translate/utils.py +0 -0
  111. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/utr/__init__.py +0 -0
  112. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/utr/rev_utr.py +0 -0
  113. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/utr/utils.py +0 -0
  114. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  115. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/pipelines/utterance/__init__.py +0 -0
  116. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/__init__.py +0 -0
  117. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/conftest.py +0 -0
  118. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  119. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  120. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  121. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  122. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  123. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  124. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  125. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  126. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  127. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  128. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  129. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  130. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/fixures.py +0 -0
  131. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  132. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  133. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/tests/test_document.py +0 -0
  134. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/utils/__init__.py +0 -0
  135. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/utils/abbrev.py +0 -0
  136. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/utils/compounds.py +0 -0
  137. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/utils/config.py +0 -0
  138. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/utils/names.py +0 -0
  139. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign/utils/utils.py +0 -0
  140. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign.egg-info/SOURCES.txt +0 -0
  141. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign.egg-info/dependency_links.txt +0 -0
  142. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign.egg-info/entry_points.txt +0 -0
  143. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign.egg-info/requires.txt +0 -0
  144. {batchalign-0.7.22.post20 → batchalign-0.8.0}/batchalign.egg-info/top_level.txt +0 -0
  145. {batchalign-0.7.22.post20 → batchalign-0.8.0}/setup.cfg +0 -0
  146. {batchalign-0.7.22.post20 → batchalign-0.8.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.22.post20
3
+ Version: 0.8.0
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -3,37 +3,23 @@ cli.py
3
3
  The Batchalign command-line interface
4
4
  """
5
5
 
6
- import multiprocessing
7
6
  import rich_click as click
8
7
  import functools
9
8
 
10
9
  import os
11
- from glob import glob
12
10
 
13
- from multiprocessing import Process, freeze_support
14
-
15
- from batchalign.pipelines import BatchalignPipeline
11
+ from multiprocessing import freeze_support
16
12
 
13
+ from pathlib import Path
17
14
  from rich.traceback import install
18
15
  from rich.console import Console
19
- from rich.panel import Panel
20
- from pathlib import Path
21
- from batchalign.document import *
22
- from batchalign.formats.chat import CHATFile
23
- from batchalign.utils import config
24
16
  from rich.logging import RichHandler
25
17
 
26
18
  from batchalign.cli.dispatch import _dispatch
27
19
  from batchalign.models.training.run import cli as train
28
20
 
29
- from enum import Enum
30
-
31
- import traceback
32
-
33
21
  import pyfiglet
34
- from rich import pretty
35
- import logging as L
36
- baL = L.getLogger('batchalign')
22
+ import logging as L
37
23
 
38
24
  C = Console()
39
25
 
@@ -62,7 +48,7 @@ def handle_verbosity(verbosity):
62
48
  L.getLogger('stanza').handlers.clear()
63
49
  L.getLogger('transformers').handlers.clear()
64
50
  L.getLogger('nemo_logger').handlers.clear()
65
- L.getLogger("stanza").setLevel(L.INFO)
51
+ L.getLogger("stanza").setLevel(L.WARN)
66
52
  L.getLogger('nemo_logger').setLevel(L.CRITICAL)
67
53
  L.getLogger('batchalign').setLevel(L.WARN)
68
54
  L.getLogger('lightning.pytorch.utilities.migration.utils').setLevel(L.ERROR)
@@ -73,6 +59,7 @@ def handle_verbosity(verbosity):
73
59
  L.getLogger('batchalign').setLevel(L.INFO)
74
60
  if verbosity >= 3:
75
61
  L.getLogger('batchalign').setLevel(L.DEBUG)
62
+ L.getLogger("stanza").setLevel(L.INFO)
76
63
  if verbosity >= 4:
77
64
  L.getLogger('batchalign').setLevel(L.DEBUG)
78
65
  L.getLogger('transformers').setLevel(L.INFO)
@@ -81,7 +68,8 @@ def handle_verbosity(verbosity):
81
68
  @click.pass_context
82
69
  @click.version_option(VERSION_NUMBER)
83
70
  @click.option("-v", "--verbose", type=int, count=True, default=0, help="How loquacious Batchalign should be.")
84
- def batchalign(ctx, verbose):
71
+ @click.option("--workers", type=int, default=os.cpu_count(), help="Number of worker processes to use.")
72
+ def batchalign(ctx, verbose, workers):
85
73
  """process .cha and/or audio files in IN_DIR and dumps them to OUT_DIR using recipe COMMAND"""
86
74
 
87
75
  ## setup commands ##
@@ -93,7 +81,9 @@ def batchalign(ctx, verbose):
93
81
  handle_verbosity(verbose)
94
82
  # add to arguments
95
83
  ctx.obj["verbose"] = verbose
84
+ ctx.obj["workers"] = workers
96
85
  # setup config
86
+ from batchalign.utils import config
97
87
  ctx.obj["config"] = config.config_read(True)
98
88
  # make everything look better
99
89
  # pretty.install()
@@ -116,6 +106,7 @@ batchalign.add_command(train, "models")
116
106
  @click.pass_context
117
107
  def align(ctx, in_dir, out_dir, whisper, wav2vec, **kwargs):
118
108
  """Align transcripts against corresponding media files."""
109
+ from batchalign.formats.chat import CHATFile
119
110
  def loader(file):
120
111
  return (
121
112
  CHATFile(path=os.path.abspath(file)).doc,
@@ -165,6 +156,8 @@ def align(ctx, in_dir, out_dir, whisper, wav2vec, **kwargs):
165
156
  @click.pass_context
166
157
  def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
167
158
  """Create a transcript from audio files."""
159
+ from batchalign.document import CustomLine, CustomLineType
160
+ from batchalign.formats.chat import CHATFile
168
161
  def loader(file):
169
162
  return file
170
163
 
@@ -209,6 +202,7 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
209
202
  @click.pass_context
210
203
  def translate(ctx, in_dir, out_dir, **kwargs):
211
204
  """Translate the transcript to English."""
205
+ from batchalign.formats.chat import CHATFile
212
206
 
213
207
  def loader(file):
214
208
  cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
@@ -239,6 +233,7 @@ def translate(ctx, in_dir, out_dir, **kwargs):
239
233
  @click.pass_context
240
234
  def morphotag(ctx, in_dir, out_dir, **kwargs):
241
235
  """Perform morphosyntactic analysis on transcripts."""
236
+ from batchalign.formats.chat import CHATFile
242
237
 
243
238
  def loader(file):
244
239
  mwt = {}
@@ -265,7 +260,7 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
265
260
 
266
261
  _dispatch("morphotag", "eng", 1, ["cha"], ctx,
267
262
  in_dir, out_dir,
268
- loader, writer, C)
263
+ loader, writer, C, **kwargs)
269
264
 
270
265
 
271
266
  #################### COREF ################################
@@ -275,6 +270,7 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
275
270
  @click.pass_context
276
271
  def coref(ctx, in_dir, out_dir, **kwargs):
277
272
  """Perform coreference analysis on transcripts."""
273
+ from batchalign.formats.chat import CHATFile
278
274
 
279
275
  def loader(file):
280
276
  cf = CHATFile(path=os.path.abspath(file))
@@ -302,6 +298,7 @@ def coref(ctx, in_dir, out_dir, **kwargs):
302
298
  @click.pass_context
303
299
  def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
304
300
  """Perform morphosyntactic analysis on transcripts."""
301
+ from batchalign.formats.chat import CHATFile
305
302
 
306
303
  def loader(file):
307
304
  return CHATFile(path=os.path.abspath(file)).doc
@@ -332,6 +329,7 @@ def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
332
329
  @click.pass_context
333
330
  def benchmark(ctx, in_dir, out_dir, lang, num_speakers, whisper, whisper_oai, **kwargs):
334
331
  """Benchmark ASR utilities for their word accuracy"""
332
+ from batchalign.formats.chat import CHATFile
335
333
  def loader(file):
336
334
  # try to find a .cha in the same directory
337
335
  p = Path(file)
@@ -374,6 +372,7 @@ def avqi(ctx, input_dir, output_dir, lang, **kwargs):
374
372
  """Calculate AVQI from paired .cs and .sv audio files in input directory."""
375
373
 
376
374
  from batchalign.pipelines.avqi import AVQIEngine
375
+ from batchalign.document import Document
377
376
  from pathlib import Path
378
377
  import os
379
378
 
@@ -441,6 +440,7 @@ def avqi(ctx, input_dir, output_dir, lang, **kwargs):
441
440
  @click.pass_context
442
441
  def opensmile(ctx, input_dir, output_dir, feature_set, lang, **kwargs):
443
442
  """Extract openSMILE audio features from speech samples."""
443
+ from batchalign.document import Document
444
444
 
445
445
  def loader(file):
446
446
  doc = Document.new(media_path=file, lang=lang)
@@ -468,6 +468,7 @@ def opensmile(ctx, input_dir, output_dir, feature_set, lang, **kwargs):
468
468
  def setup(ctx):
469
469
  """Reconfigure Batchalign settings, such as Rev.AI key."""
470
470
 
471
+ from batchalign.utils import config
471
472
  config.interactive_setup()
472
473
 
473
474
  #################### VERSION ################################
@@ -480,5 +481,5 @@ def version(ctx, **kwargs):
480
481
  ptr = (pyfiglet.figlet_format("Batchalign2")+"\n" +
481
482
  f"Version: [bold]{VERSION_NUMBER.strip()}[/bold], released {RELEASE_DATE.strip()}\n" +
482
483
  f"[italic]{RELEASE_NOTES.strip()}[/italic]"+"\n" +
483
- "\nDeveloped by Brian MacWhinney and Houjun Liu")
484
+ "\nDeveloped by Brian MacWhinney and Houjun Liu\ncontributions from Sebastian Song and Franklin Chen")
484
485
  C.print("\n\n"+ptr+"\n\n")
@@ -0,0 +1,364 @@
1
+ """
2
+ dispatch.py
3
+ CLI runner dispatch. Essentially the translation layer between `command` in CLI
4
+ and actual BatchalignPipeline.
5
+ """
6
+
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
8
+
9
+ import warnings
10
+
11
+ import shutil
12
+ import os
13
+ import glob
14
+ import queue
15
+
16
+ from rich.console import Console
17
+ from rich.markup import escape
18
+
19
+ from pathlib import Path
20
+
21
+ import concurrent.futures
22
+ import multiprocessing
23
+ from functools import partial
24
+
25
# Small helpers for directory-based globbing and for re-rooting a file path.
def globase(path, statement):
    """Return the paths inside directory `path` that match glob pattern `statement`.

    `glob` is imported in this file as a module, so the matching function has
    to be reached as `glob.glob`; calling the module object itself raises
    ``TypeError: 'module' object is not callable``.
    """
    return glob.glob(os.path.join(path, statement))


def repath_file(file_path, new_dir):
    """Return `file_path`'s final component joined onto `new_dir`."""
    return os.path.join(new_dir, Path(file_path).name)
28
+
29
+ import tempfile
30
+ import time
31
+
32
+ import traceback
33
+ import logging as L
34
+ baL = L.getLogger('batchalign')
35
+
36
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
37
+
38
# Per-process pipeline cache: populated on the first call to
# `_get_worker_pipeline` in a given process and reused afterwards.
_worker_pipeline = None


def _get_worker_pipeline(command, lang, num_speakers, **kwargs):
    """Return the cached pipeline for this process, creating it on first use.

    The pipeline is built with ``BatchalignPipeline.new`` from the task string
    ``Cmd2Task[command]`` plus `lang`, `num_speakers` and any extra keyword
    arguments. Once a pipeline has been cached, later calls return it as-is
    and their arguments are not consulted.
    """
    global _worker_pipeline

    # Fast path: something has already been built in this process.
    if _worker_pipeline is not None:
        return _worker_pipeline

    # Imported lazily so that importing this module stays cheap.
    from batchalign.pipelines import BatchalignPipeline

    _worker_pipeline = BatchalignPipeline.new(
        Cmd2Task[command],
        lang=lang,
        num_speakers=num_speakers,
        **kwargs,
    )
    return _worker_pipeline
48
+
49
def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_info, progress_queue=None, **kwargs):
    """Process a single input file; intended to run inside a worker process.

    While the task runs, file descriptors 1 and 2 are redirected into a
    temporary file so that output from concurrently running workers is not
    interleaved on the terminal. The captured text is returned to the caller.

    Parameters
    ----------
    file_info : tuple
        ``(input_path, output_path)`` of the file to process.
    command : str
        CLI command name. "morphotag" and "align" are loaded and written
        directly here; any other command uses `loader_info`/`writer_info`.
    lang, num_speakers
        Forwarded to ``_get_worker_pipeline``.
    loader_info, writer_info : callable or None
        ``loader(path)`` returns a document, or a ``(document, kwargs)``
        tuple whose kwargs are passed to the pipeline call;
        ``writer(document, output_path)`` persists the result.
    progress_queue : optional
        When given, ``(file, completed, total, tasks)`` tuples are put on it
        each time the pipeline invokes the progress callback.
    **kwargs
        Forwarded to ``_get_worker_pipeline``; for "morphotag" and "align"
        some keys are additionally read to drive processing and writing.

    Returns
    -------
    tuple
        ``(file, traceback_text, exception, captured_output)``. The traceback
        text and the exception are both ``None`` on success.
    """
    import sys
    import os
    import tempfile
    import traceback

    file, output = file_info

    # Use a temporary file to capture ALL output at the FD level; this also
    # catches output that does not go through Python's sys.stdout/sys.stderr.
    with tempfile.TemporaryFile(mode='w+') as log_file:

        def read_captured():
            # Python-level buffers must be flushed into the redirected
            # descriptors before the log is read back.
            sys.stdout.flush()
            sys.stderr.flush()
            log_file.seek(0)
            return log_file.read()

        # Anything already sitting in Python's buffers belongs to the
        # original destinations; flush it out before swapping descriptors so
        # it does not end up in this task's captured log.
        sys.stdout.flush()
        sys.stderr.flush()

        old_stdout_fd = os.dup(sys.stdout.fileno())
        old_stderr_fd = os.dup(sys.stderr.fileno())

        try:
            # Redirect FD 1 and 2 to our temp file
            os.dup2(log_file.fileno(), sys.stdout.fileno())
            os.dup2(log_file.fileno(), sys.stderr.fileno())

            pipeline = _get_worker_pipeline(command, lang, num_speakers, **kwargs)

            def progress_callback(completed, total, tasks):
                if progress_queue is None:
                    return
                try:
                    progress_queue.put((file, completed, total, tasks))
                except Exception:
                    # Progress reporting is best-effort: a failure to report
                    # must never abort the actual processing.
                    pass

            from batchalign.formats.chat import CHATFile

            if command == "morphotag":
                # Morphotag loading/writing lives here (rather than in
                # caller-supplied callables) so nothing unpicklable has to be
                # shipped to the worker.
                mwt = kwargs.pop("mwt", {})
                retokenize = kwargs.pop("retokenize", False)
                skipmultilang = kwargs.pop("skipmultilang", False)

                cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
                doc = cf.doc
                if str(cf).count("%mor") > 0:
                    doc.ba_special_["special_mor_notation"] = True

                pipeline_kwargs = {
                    "retokenize": retokenize,
                    "skipmultilang": skipmultilang,
                    "mwt": mwt,
                }
                # Any remaining keyword arguments are passed through as well.
                pipeline_kwargs.update(kwargs)

                doc = pipeline(doc, callback=progress_callback, **pipeline_kwargs)

                CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)

            elif command == "align":
                cf = CHATFile(path=os.path.abspath(file))
                doc = cf.doc
                kw = {"pauses": kwargs.get("pauses", False)}
                doc = pipeline(doc, callback=progress_callback, **kw)
                CHATFile(doc=doc).write(output, write_wor=kwargs.get("wor", True))

            else:
                loader, writer = loader_info, writer_info
                if loader is None or writer is None:
                    # Fail with an actionable message instead of the opaque
                    # "'NoneType' object is not callable".
                    raise ValueError(
                        f"No loader/writer was supplied for command '{command}', "
                        "which has no built-in handling in the worker."
                    )
                doc = loader(os.path.abspath(file))
                kw = {}
                if isinstance(doc, tuple) and len(doc) > 1:
                    doc, kw = doc
                doc = pipeline(doc, callback=progress_callback, **kw)
                writer(doc, output)

            return file, None, None, read_captured()
        except Exception as e:
            captured = read_captured()
            return file, traceback.format_exc(), e, captured
        finally:
            # Restore the original descriptors and release the duplicates.
            os.dup2(old_stdout_fd, sys.stdout.fileno())
            os.dup2(old_stderr_fd, sys.stderr.fileno())
            os.close(old_stdout_fd)
            os.close(old_stderr_fd)
146
+
147
# Lookup table from CLI command name to the comma-separated task string
# handed to BatchalignPipeline for that command.
Cmd2Task = dict(
    align="fa",
    transcribe="asr",
    transcribe_s="asr,speaker",
    morphotag="morphosyntax",
    benchmark="asr,eval",
    utseg="utterance",
    coref="coref",
    translate="translate",
    opensmile="opensmile",
)
160
+
161
# Commands whose file loading and writing are implemented inside
# `_worker_task` itself, so no loader/writer has to cross a process boundary.
_WORKER_NATIVE_COMMANDS = ("morphotag", "align")


def _is_picklable(obj):
    """Return True if `obj` can be pickled, i.e. handed to a process pool."""
    import pickle

    try:
        pickle.dumps(obj)
    except (pickle.PicklingError, AttributeError, TypeError):
        return False
    return True


def _resolve_num_workers(requested, command):
    """Turn the requested worker count into a usable positive integer."""
    # os.cpu_count() may return None, and a missing/zero/negative request
    # would otherwise reach min() or the process pool and raise there.
    workers = requested if requested and requested > 0 else (os.cpu_count() or 1)
    if command in ("transcribe", "transcribe_s"):
        workers = min(workers, 2)  # GPU memory limits
    return workers


def _collect_inputs(in_dir, out_dir, extensions):
    """Walk `in_dir`, mirror it into `out_dir`, and return what needs processing.

    Files whose extension is in `extensions` are returned as two parallel
    lists ``(files, outputs)``; every other file, and every dummy .cha file,
    is copied straight to its mirrored location. Media whose extension is in
    ``FORCED_CONVERSION`` is first converted to a .wav file next to the input.
    """
    from batchalign.constants import FORCED_CONVERSION

    files = []
    outputs = []

    for basedir, _, fs in os.walk(in_dir):
        for f in fs:
            path = Path(os.path.join(basedir, f))
            ext = path.suffix.strip(".").strip().lower()

            # calculate input path, convert if needed
            inp_path = str(path)
            if ext in FORCED_CONVERSION:
                if not shutil.which("ffmpeg"):
                    raise ValueError(f"ffmpeg not found in Path! Cannot load input media at {inp_path}.\nHint: Please convert your input audio sample to .wav before proceeding with Batchalign, or install ffmpeg (https://ffmpeg.org/download.html)")
                from pydub import AudioSegment
                seg = AudioSegment.from_file(inp_path, ext)
                # Swap only the final suffix. A textual replace of the
                # lowercased extension misses "a.MP4" (so the export would
                # overwrite the source) and also rewrites matching directory
                # names.
                wav_path = str(path.with_suffix(".wav"))
                seg.export(wav_path, format="wav")
                inp_path = wav_path

            # repath the file to the output, creating its directory if needed
            rel = os.path.relpath(inp_path, in_dir)
            repathed = Path(os.path.join(out_dir, rel))
            os.makedirs(repathed.parent.absolute(), exist_ok=True)

            # HACK: dummy transcripts are copied through untouched
            if ext == "cha":
                with open(inp_path, 'r', encoding="utf-8") as df:
                    data = df.read()
                if ("@Options:\tdummy" in data
                        or "This is a dummy file to permit playback from the TalkBank browser" in data):
                    shutil.copy2(inp_path, str(repathed))
                    continue

            if ext in extensions:
                files.append(inp_path)
                outputs.append(str(repathed))
            else:
                shutil.copy2(inp_path, str(repathed))

    return files, outputs


def _process_files(command, lang, num_speakers, file_pairs, ctx,
                   loader, writer, num_workers, console, worker_kwargs):
    """Run the pipeline over `file_pairs` and return the list of result records.

    Each record is ``(file, traceback_text, exception, captured_output)``;
    records with ``exception is None`` carry only captured worker logs.

    Work is fanned out to a process pool when the command is handled natively
    by `_worker_task`, or when `loader` and `writer` can be pickled. Otherwise
    the files are processed one after another in the current process, because
    unpicklable callables (e.g. functions defined inside another function)
    cannot be sent to pool workers.
    """
    from batchalign.document import TaskFriendlyName

    def render_stage(stage_tasks):
        if not stage_tasks:
            return "Processing..."
        if not isinstance(stage_tasks, (list, tuple)):
            stage_tasks = [stage_tasks]
        return ", ".join(TaskFriendlyName.get(task, str(task)) for task in stage_tasks)

    prog = Progress(SpinnerColumn(), *Progress.get_default_columns()[:-1],
                    TimeElapsedColumn(),
                    TextColumn("[cyan]{task.fields[processor]}[/cyan]"), console=console)
    errors = []
    tasks = {}
    task_totals = {}

    def update_progress(file, completed, total, stage_tasks):
        if file not in tasks:
            return
        task_total = max(int(total) if total else task_totals.get(file, 1), 1)
        task_totals[file] = task_total
        prog.update(tasks[file],
                    total=task_total,
                    completed=min(int(completed), task_total),
                    processor=render_stage(stage_tasks))

    def record_result(file, res_file, trcbk, exc, captured):
        final_total = max(task_totals.get(file, 1), 1)
        if exc is not None:
            prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold red]FAIL[/bold red]")
            errors.append((res_file, trcbk, exc, captured))
            return
        prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold green]DONE[/bold green]")
        if ctx.obj["verbose"] >= 1 and captured.strip():
            errors.append((res_file, "Logs only (Success)", None, captured))

    native = command in _WORKER_NATIVE_COMMANDS
    use_pool = native or (_is_picklable(loader) and _is_picklable(writer))
    manager = None

    try:
        with prog:
            for f, _ in file_pairs:
                tasks[f] = prog.add_task(Path(f).name, start=False, total=1, processor="Waiting...")
                task_totals[f] = 1

            if not file_pairs:
                pass

            elif use_pool:
                manager = multiprocessing.Manager()
                progress_queue = manager.Queue()

                def drain_progress_queue():
                    while True:
                        try:
                            file, completed, total, stage_tasks = progress_queue.get_nowait()
                        except queue.Empty:
                            break
                        except Exception:
                            # A broken queue only costs progress updates.
                            break
                        update_progress(file, completed, total, stage_tasks)

                worker_func = partial(_worker_task,
                                      command=command,
                                      lang=lang,
                                      num_speakers=num_speakers,
                                      # native commands never call these, so
                                      # nothing needs to be shipped for them
                                      loader_info=None if native else loader,
                                      writer_info=None if native else writer,
                                      progress_queue=progress_queue,
                                      **worker_kwargs)

                with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
                    future_to_file = {executor.submit(worker_func, pair): pair[0] for pair in file_pairs}

                    for f, _ in file_pairs:
                        prog.start_task(tasks[f])
                        prog.update(tasks[f], processor="Processing...")

                    pending = set(future_to_file)
                    while pending:
                        # Short timeout so progress is refreshed while
                        # waiting for tasks to finish.
                        done, pending = concurrent.futures.wait(
                            pending,
                            timeout=0.1,
                            return_when=concurrent.futures.FIRST_COMPLETED,
                        )
                        drain_progress_queue()

                        for future in done:
                            file = future_to_file[future]
                            try:
                                res_file, trcbk, task_exc, captured = future.result()
                            except Exception as exc:
                                # The worker itself died or its result could
                                # not be transferred back.
                                record_result(file, file, traceback.format_exc(), exc, "")
                            else:
                                record_result(file, res_file, trcbk, task_exc, captured)

                    drain_progress_queue()

            else:
                # Sequential in-process fallback for unpicklable loader/writer.
                for f, o in file_pairs:
                    prog.start_task(tasks[f])
                    prog.update(tasks[f], processor="Processing...")
                    try:
                        pipeline = _get_worker_pipeline(command, lang, num_speakers, **worker_kwargs)
                        doc = loader(os.path.abspath(f))
                        kw = {}
                        if isinstance(doc, tuple) and len(doc) > 1:
                            doc, kw = doc
                        doc = pipeline(doc, callback=partial(update_progress, f), **kw)
                        writer(doc, o)
                    except Exception as exc:
                        record_result(f, f, traceback.format_exc(), exc, "")
                    else:
                        record_result(f, f, None, None, "")
    finally:
        if manager:
            manager.shutdown()

    return errors


# this is the main runner used by all functions
def _dispatch(command, lang, num_speakers,
              extensions, ctx, in_dir, out_dir,
              loader:callable, writer:callable, console,
              **kwargs):
    """Run `command` over every matching file under `in_dir`, writing to `out_dir`.

    Parameters
    ----------
    command : str
        CLI command name (a key of ``Cmd2Task``).
    lang, num_speakers
        Forwarded to pipeline construction.
    extensions : container of str
        Lower-case extensions (without the dot) of files to process; all
        other files are copied to `out_dir` unchanged.
    ctx
        Click context; ``ctx.obj["verbose"]`` is read, and
        ``ctx.obj["workers"]`` is used when no ``num_workers`` keyword is
        given.
    in_dir, out_dir : str
        Input and output directory roots.
    loader, writer : callable
        ``loader(path)`` returns a document or ``(document, kwargs)``;
        ``writer(document, output_path)`` saves the result. Used for every
        command except "morphotag" and "align", which the worker loads and
        writes itself.
    console
        Rich console used for user-facing output.
    **kwargs
        Extra options forwarded to the worker / pipeline.

    Raises
    ------
    ValueError
        If a file needs media conversion and ffmpeg is not installed.
    """
    C = console

    files, outputs = _collect_inputs(in_dir, out_dir, extensions)

    # At high verbosity the Rich output is sent to a scratch file; tracebacks
    # are still printed through a fresh Console below.
    log_sink = None
    if ctx.obj["verbose"] > 1:
        log_sink = tempfile.NamedTemporaryFile(delete=True, mode='w')
        C = Console(file=log_sink)

    try:
        # process largest inputs first to avoid late stragglers
        file_pairs = sorted(
            zip(files, outputs),
            key=lambda fo: os.path.getsize(fo[0]) if os.path.exists(fo[0]) else 0,
            reverse=True,
        )
        count = len(file_pairs)

        C.print(f"\nMode: [blue]{command}[/blue]; got [bold cyan]{count}[/bold cyan] transcript{'s' if count != 1 else ''} to process from {in_dir}:\n")

        num_workers = _resolve_num_workers(
            kwargs.get("num_workers", ctx.obj.get("workers", os.cpu_count())),
            command,
        )

        # Fetch the stanza resource index once up front so that workers do
        # not all download it at the same time. Best-effort only.
        if command in ["morphotag", "utseg", "coref"]:
            try:
                import stanza
                stanza.download_resources_json()
            except Exception:
                baL.debug("Could not pre-download stanza resources; continuing.", exc_info=True)

        C.print(f"Using [bold]{num_workers}[/bold] worker processes.\n")

        errors = _process_files(command, lang, num_speakers, file_pairs, ctx,
                                loader, writer, num_workers, C, kwargs)

        if errors:
            C.print()
        for file, trcbk, e, captured in errors:
            # escape() keeps brackets in file names from being read as markup
            rel_path = escape(os.path.relpath(str(Path(file).absolute()), in_dir))
            if e is not None:
                C.print(f"[bold red]ERROR[/bold red] on file [italic]{rel_path}[/italic]: {escape(str(e))}\n")
                if captured.strip():
                    C.print(f"[dim]Captured Worker Output:[/dim]\n{escape(captured.strip())}\n")
                if ctx.obj["verbose"] == 1:
                    C.print(escape(str(trcbk)))
                elif ctx.obj["verbose"] > 1:
                    Console().print(escape(str(trcbk)))
            elif captured.strip():
                C.print(f"[bold blue]INFO[/bold blue] on file [italic]{rel_path}[/italic]:\n")
                C.print(f"{escape(captured.strip())}\n")

        # Records that only carry logs of successful files are not failures
        # and must not suppress the completion message.
        if not any(e is not None for _, _, e, _ in errors):
            C.print(f"\nAll done. Results saved to {out_dir}!\n")

        if ctx.obj["verbose"] > 1:
            # NOTE(review): no matching begin_capture() is visible in this
            # function; call kept as in the original -- confirm it is needed.
            C.end_capture()
    finally:
        # Always release the scratch file, even when processing raised.
        if log_sink:
            log_sink.close()