batchalign 0.8.0__tar.gz → 0.8.0.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of batchalign might be problematic. Click here for more details.

Files changed (145) hide show
  1. {batchalign-0.8.0/batchalign.egg-info → batchalign-0.8.0.post2}/PKG-INFO +1 -1
  2. batchalign-0.8.0.post2/batchalign/cli/dispatch.py +565 -0
  3. batchalign-0.8.0.post2/batchalign/version +3 -0
  4. {batchalign-0.8.0 → batchalign-0.8.0.post2/batchalign.egg-info}/PKG-INFO +1 -1
  5. batchalign-0.8.0/batchalign/cli/dispatch.py +0 -364
  6. batchalign-0.8.0/batchalign/version +0 -3
  7. {batchalign-0.8.0 → batchalign-0.8.0.post2}/LICENSE +0 -0
  8. {batchalign-0.8.0 → batchalign-0.8.0.post2}/MANIFEST.in +0 -0
  9. {batchalign-0.8.0 → batchalign-0.8.0.post2}/README.md +0 -0
  10. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/__init__.py +0 -0
  11. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/__main__.py +0 -0
  12. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/cli/__init__.py +0 -0
  13. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/cli/cli.py +0 -0
  14. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/constants.py +0 -0
  15. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/document.py +0 -0
  16. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/errors.py +0 -0
  17. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/__init__.py +0 -0
  18. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/base.py +0 -0
  19. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/chat/__init__.py +0 -0
  20. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/chat/file.py +0 -0
  21. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/chat/generator.py +0 -0
  22. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/chat/lexer.py +0 -0
  23. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/chat/parser.py +0 -0
  24. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/chat/utils.py +0 -0
  25. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/__init__.py +0 -0
  26. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/file.py +0 -0
  27. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/generator.py +0 -0
  28. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/parser.py +0 -0
  29. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/__init__.py +0 -0
  30. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/resolve.py +0 -0
  31. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/speaker/__init__.py +0 -0
  32. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/speaker/config.yaml +0 -0
  33. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/speaker/infer.py +0 -0
  34. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/speaker/utils.py +0 -0
  35. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/training/__init__.py +0 -0
  36. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/training/run.py +0 -0
  37. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/training/utils.py +0 -0
  38. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utils.py +0 -0
  39. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utterance/__init__.py +0 -0
  40. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utterance/cantonese_infer.py +0 -0
  41. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utterance/dataset.py +0 -0
  42. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utterance/execute.py +0 -0
  43. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utterance/infer.py +0 -0
  44. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utterance/prep.py +0 -0
  45. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/utterance/train.py +0 -0
  46. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/wave2vec/__init__.py +0 -0
  47. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/wave2vec/infer_fa.py +0 -0
  48. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/whisper/__init__.py +0 -0
  49. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/whisper/infer_asr.py +0 -0
  50. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/models/whisper/infer_fa.py +0 -0
  51. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/__init__.py +0 -0
  52. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/analysis/__init__.py +0 -0
  53. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/analysis/eval.py +0 -0
  54. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/__init__.py +0 -0
  55. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2chinese.py +0 -0
  56. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
  57. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
  58. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
  59. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
  60. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
  61. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
  62. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
  63. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
  64. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
  65. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
  66. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/por.py +0 -0
  67. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
  68. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
  69. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  70. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/rev.py +0 -0
  71. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/utils.py +0 -0
  72. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/whisper.py +0 -0
  73. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/whisperx.py +0 -0
  74. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/avqi/__init__.py +0 -0
  75. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/avqi/engine.py +0 -0
  76. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/base.py +0 -0
  77. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/__init__.py +0 -0
  78. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  79. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  80. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  81. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/retrace.py +0 -0
  82. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  83. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  84. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/support/test.test +0 -0
  85. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/diarization/__init__.py +0 -0
  86. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/diarization/pyannote.py +0 -0
  87. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/dispatch.py +0 -0
  88. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/fa/__init__.py +0 -0
  89. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  90. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  91. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  92. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  93. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  94. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  95. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  96. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  97. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  98. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  99. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/opensmile/__init__.py +0 -0
  100. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/opensmile/engine.py +0 -0
  101. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/pipeline.py +0 -0
  102. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/speaker/__init__.py +0 -0
  103. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  104. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/__init__.py +0 -0
  105. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/gtrans.py +0 -0
  106. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/seamless.py +0 -0
  107. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/utils.py +0 -0
  108. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/__init__.py +0 -0
  109. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/rev_utr.py +0 -0
  110. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/utils.py +0 -0
  111. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  112. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/utterance/__init__.py +0 -0
  113. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  114. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/__init__.py +0 -0
  115. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/conftest.py +0 -0
  116. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  117. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  118. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  119. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  120. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  121. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  122. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  123. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  124. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  125. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  126. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  127. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  128. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/fixures.py +0 -0
  129. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  130. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  131. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/tests/test_document.py +0 -0
  132. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/utils/__init__.py +0 -0
  133. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/utils/abbrev.py +0 -0
  134. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/utils/compounds.py +0 -0
  135. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/utils/config.py +0 -0
  136. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/utils/dp.py +0 -0
  137. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/utils/names.py +0 -0
  138. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign/utils/utils.py +0 -0
  139. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign.egg-info/SOURCES.txt +0 -0
  140. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign.egg-info/dependency_links.txt +0 -0
  141. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign.egg-info/entry_points.txt +0 -0
  142. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign.egg-info/requires.txt +0 -0
  143. {batchalign-0.8.0 → batchalign-0.8.0.post2}/batchalign.egg-info/top_level.txt +0 -0
  144. {batchalign-0.8.0 → batchalign-0.8.0.post2}/setup.cfg +0 -0
  145. {batchalign-0.8.0 → batchalign-0.8.0.post2}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.8.0
3
+ Version: 0.8.0.post2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -0,0 +1,565 @@
1
+ """
2
+ dispatch.py
3
+ CLI runner dispatch. Essentially the translation layer between `command` in CLI
4
+ and actual BatchalignPipeline.
5
+ """
6
+
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
8
+
9
+ import warnings
10
+
11
+ import shutil
12
+ import os
13
+ import glob
14
+ import queue
15
+
16
+ from rich.console import Console
17
+ from rich.markup import escape
18
+
19
+ from pathlib import Path
20
+
21
+ import concurrent.futures
22
+ import multiprocessing
23
+ from functools import partial
24
+
25
# Small path helpers: directory-scoped glob, and re-rooting a file into
# another directory.
def globase(path, statement):
    """Return the paths inside directory `path` matching glob pattern `statement`."""
    # `glob` is imported as a module, so the function is `glob.glob`;
    # calling the module itself raises TypeError.
    return glob.glob(os.path.join(path, statement))


def repath_file(file_path, new_dir):
    """Return `new_dir` joined with the final path component of `file_path`."""
    return os.path.join(new_dir, Path(file_path).name)
28
+
29
+ import tempfile
30
+ import time
31
+
32
+ import traceback
33
+ import logging as L
34
+ baL = L.getLogger('batchalign')
35
+ import psutil
36
+
37
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
38
+
39
# Per-process pipeline cache; filled in lazily by _get_worker_pipeline.
_worker_pipeline = None


def _get_worker_pipeline(command, lang, num_speakers, **kwargs):
    """Return the pipeline cached for this process, building it on first use.

    The arguments are only consulted when the cache is empty; once a
    pipeline has been built, later calls return it unchanged whatever
    arguments they pass.
    """
    global _worker_pipeline
    if _worker_pipeline is not None:
        return _worker_pipeline

    # Imported lazily, only when a pipeline actually has to be built.
    from batchalign.pipelines import BatchalignPipeline

    task_spec = Cmd2Task[command]
    _worker_pipeline = BatchalignPipeline.new(
        task_spec, lang=lang, num_speakers=num_speakers, **kwargs
    )
    return _worker_pipeline
49
+
50
def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_info, progress_queue=None, verbose=0, **kwargs):
    """Process one input file inside a worker process.

    While the file is being processed, file descriptors 1 and 2 are
    redirected into a temporary file so that nothing the worker prints
    interleaves with the parent's progress rendering; the captured text is
    handed back in the result.

    Arguments:
        file_info: `(input_path, output_path)` pair for the file.
        command: CLI command name. "morphotag" and "align" read and write
            through CHATFile directly; every other command uses
            `loader_info` / `writer_info`.
        lang, num_speakers: forwarded to `_get_worker_pipeline`.
        loader_info: callable taking the absolute input path and returning
            either a document or a `(document, pipeline_kwargs)` tuple.
            Only used for commands other than "morphotag" / "align".
        writer_info: callable invoked as `writer_info(doc, output_path)`.
            Only used for commands other than "morphotag" / "align".
        progress_queue: optional object with a `put` method; receives
            `(file, completed, total, tasks)` tuples from the pipeline
            callback. Failures to put are ignored.
        verbose: 0 -> WARN, 1 -> INFO, anything else -> DEBUG for the
            'batchalign' logger.
        **kwargs: forwarded to `_get_worker_pipeline`; selected keys are
            also forwarded to the pipeline call depending on `command`.

    Returns:
        `(file, traceback_text, exception, captured_output, mem_info)`.
        `traceback_text` and `exception` are None on success. `mem_info` is
        a dict with keys "pid", "rss_start", "rss_end" and "rss_peak"
        (byte counts, or None where the value could not be read).
    """
    import sys
    import os
    import tempfile
    import logging

    file, output = file_info
    pid = os.getpid()

    def _safe_rss():
        """Current resident set size of this process in bytes, or None."""
        try:
            import psutil
            return psutil.Process(pid).memory_info().rss
        except Exception:
            return None

    def _safe_peak_rss():
        """Peak resident set size of this process in bytes, or None."""
        try:
            import resource
            peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        except Exception:
            # e.g. the `resource` module does not exist on this platform
            return None
        if peak is None:
            return None
        # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
        # Decide by platform rather than by magnitude: a macOS peak below
        # 1 GiB is already in bytes and must not be scaled.
        if sys.platform == "darwin":
            return int(peak)
        return int(peak) * 1024

    rss_start = _safe_rss()

    # Configure logging in this worker process
    if verbose >= 1:
        # Ensure basicConfig is called so logging works
        logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.ERROR)

    # Configure batchalign logger level in this worker process
    baL = logging.getLogger('batchalign')
    if verbose == 0:
        baL.setLevel(logging.WARN)
    elif verbose == 1:
        baL.setLevel(logging.INFO)
    else:
        baL.setLevel(logging.DEBUG)

    # Always capture output, at the file-descriptor level, so that output
    # written by non-Python code is caught as well. errors="replace" keeps
    # reading the capture back from failing on bytes that do not decode.
    log_file = tempfile.TemporaryFile(mode='w+', errors='replace')
    old_stdout_fd = os.dup(sys.stdout.fileno())
    old_stderr_fd = os.dup(sys.stderr.fileno())

    # Redirect FD 1 and 2 to our temp file
    os.dup2(log_file.fileno(), sys.stdout.fileno())
    os.dup2(log_file.fileno(), sys.stderr.fileno())

    def _collect():
        """Flush Python-level buffers; return (captured_text, mem_info)."""
        sys.stdout.flush()
        sys.stderr.flush()
        log_file.seek(0)
        captured = log_file.read()
        mem_info = {
            "pid": pid,
            "rss_start": rss_start,
            "rss_end": _safe_rss(),
            "rss_peak": _safe_peak_rss(),
        }
        return captured, mem_info

    try:
        pipeline = _get_worker_pipeline(command, lang, num_speakers, **kwargs)

        def progress_callback(completed, total, tasks):
            if not progress_queue:
                return
            try:
                progress_queue.put((file, completed, total, tasks))
            except Exception:
                # progress reporting is best-effort; never fail the job over it
                pass

        from batchalign.formats.chat import CHATFile

        # Morphosyntax specific loader/writer logic lives here for picklability
        if command == "morphotag":
            # Extract morphotag-specific arguments from kwargs
            mwt = kwargs.pop("mwt", {})
            retokenize = kwargs.pop("retokenize", False)
            skipmultilang = kwargs.pop("skipmultilang", False)

            cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
            doc = cf.doc
            if str(cf).count("%mor") > 0:
                doc.ba_special_["special_mor_notation"] = True

            # Prepare arguments for the pipeline
            pipeline_kwargs = {
                "retokenize": retokenize,
                "skipmultilang": skipmultilang,
                "mwt": mwt
            }
            # Add any remaining kwargs
            pipeline_kwargs.update(kwargs)

            # Process
            doc = pipeline(doc, callback=progress_callback, **pipeline_kwargs)

            # Write
            CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)

        elif command == "align":
            cf = CHATFile(path=os.path.abspath(file))
            doc = cf.doc
            kw = {"pauses": kwargs.get("pauses", False)}
            doc = pipeline(doc, callback=progress_callback, **kw)
            CHATFile(doc=doc).write(output, write_wor=kwargs.get("wor", True))

        else:
            loader, writer = loader_info, writer_info
            doc = loader(os.path.abspath(file))
            kw = {}
            # a loader may return (doc, extra_pipeline_kwargs)
            if isinstance(doc, tuple) and len(doc) > 1:
                doc, kw = doc
            doc = pipeline(doc, callback=progress_callback, **kw)
            writer(doc, output)

        captured, mem_info = _collect()
        return file, None, None, captured, mem_info
    except Exception as e:
        captured, mem_info = _collect()
        return file, traceback.format_exc(), e, captured, mem_info
    finally:
        # Restore the original FDs and release the duplicates
        os.dup2(old_stdout_fd, sys.stdout.fileno())
        os.dup2(old_stderr_fd, sys.stderr.fileno())
        os.close(old_stdout_fd)
        os.close(old_stderr_fd)
        log_file.close()
213
+
214
# Maps each CLI command name to the task string handed to
# BatchalignPipeline.new for that command; a value may name several
# tasks separated by commas (e.g. "asr,speaker").
Cmd2Task = {
    "align": "fa",
    "transcribe": "asr",
    "transcribe_s": "asr,speaker",
    "morphotag": "morphosyntax",
    "benchmark": "asr,eval",
    "utseg": "utterance",
    "coref": "coref",
    "translate": "translate",
    "opensmile": "opensmile",
}
227
+
228
+ # this is the main runner used by all functions
229
def _dispatch(command, lang, num_speakers,
              extensions, ctx, in_dir, out_dir,
              loader:callable, writer:callable, console,
              **kwargs):
    """Run `command` over every matching file under `in_dir` using worker processes.

    Walks `in_dir`, mirrors its directory layout under `out_dir`, copies
    files that are not processed (wrong extension, or CHAT "dummy" files),
    and sends the remaining files to `_worker_task` through a process pool.
    New workers are held back while system memory is low. Progress, errors
    and (when verbose) per-file memory usage are printed to the console.

    Arguments:
        command: CLI command name, a key of `Cmd2Task`.
        lang, num_speakers: forwarded to each worker.
        extensions: collection of lower-case extensions (without the dot)
            that should be processed; other files are copied.
        ctx: object whose `.obj` mapping provides "verbose" and,
            optionally, "workers".
        in_dir, out_dir: input and output directory roots.
        loader, writer: per-command load/write callables, forwarded to the
            workers for commands the worker does not handle itself.
        console: rich Console used for output; replaced by a Console
            writing to a temporary file when verbose > 1.
        **kwargs: forwarded to each worker; "num_workers" also sets the
            pool size.

    Raises:
        ValueError: a file needs media conversion but ffmpeg is not on the
            path.
    """

    C = console
    from batchalign.constants import FORCED_CONVERSION
    from batchalign.document import TaskFriendlyName

    verbose = ctx.obj["verbose"]

    # get files by walking the directory
    files = []
    outputs = []

    for basedir, _, fs in os.walk(in_dir):
        for f in fs:
            path = Path(os.path.join(basedir, f))
            ext = path.suffix.strip(".").strip().lower()

            # calculate input path, convert if needed
            inp_path = str(path)
            if ext in FORCED_CONVERSION:
                # check for ffmpeg
                if not shutil.which("ffmpeg"):
                    raise ValueError(f"ffmpeg not found in Path! Cannot load input media at {inp_path}.\nHint: Please convert your input audio sample to .wav before proceeding witch Batchalign, or install ffmpeg (https://ffmpeg.org/download.html)")
                # convert
                from pydub import AudioSegment
                # Swap only the real suffix. A textual replace of ".<ext>"
                # misses upper-case suffixes (so the export would overwrite
                # the source file) and rewrites matching directory names.
                wav_path = str(path.with_suffix(".wav"))
                seg = AudioSegment.from_file(inp_path, ext)
                seg.export(wav_path, format="wav")
                inp_path = wav_path

            # repath the file to the output
            rel = os.path.relpath(inp_path, in_dir)
            repathed = Path(os.path.join(out_dir, rel))
            # make the repathed dir, if it doesn't exist
            parent = repathed.parent.absolute()
            os.makedirs(parent, exist_ok=True)

            # HACK check for @Options:\tdummy in the file
            # and simply copy it
            if ext == "cha":
                with open(inp_path, 'r', encoding="utf-8") as df:
                    data = df.read()
                if "@Options:\tdummy" in data:
                    shutil.copy2(inp_path, str(repathed))
                    continue
                elif "This is a dummy file to permit playback from the TalkBank browser" in data:
                    shutil.copy2(inp_path, str(repathed))
                    continue

            # if the file needs to get processed, append it to the list
            # to be processed and compute the output
            if ext in extensions:
                files.append(inp_path)
                outputs.append(str(repathed))
            # otherwise just copy the file
            else:
                shutil.copy2(inp_path, str(repathed))

    __tf = None
    # output file
    if verbose > 1:
        __tf = tempfile.NamedTemporaryFile(delete=True, mode='w')
        C = Console(file=__tf)

    # process largest inputs first to avoid late stragglers
    file_sizes = {f: os.path.getsize(f) if os.path.exists(f) else 0 for f in files}
    file_pairs = sorted(zip(files, outputs), key=lambda fo: file_sizes[fo[0]], reverse=True)
    files, outputs = zip(*file_pairs) if file_pairs else ([], [])

    C.print(f"\nMode: [blue]{command}[/blue]; got [bold cyan]{len(files)}[/bold cyan] transcript{'s' if len(files) != 1 else ''} to process from {in_dir}:\n")

    # Determine number of workers. os.cpu_count() may return None and the
    # configured value may be missing or zero; the scheduler below compares
    # against this number, so always end up with an int >= 1.
    num_workers = kwargs.get("num_workers", ctx.obj.get("workers", os.cpu_count()))
    num_workers = max(1, int(num_workers or os.cpu_count() or 1))

    # Pre-download stanza resources if needed to avoid interleaved downloads in workers
    if command in ["morphotag", "utseg", "coref"]:
        try:
            import stanza
            stanza.download_resources_json()
        except Exception:
            # best-effort only: workers fetch what they need themselves
            baL.debug("stanza resource pre-download skipped", exc_info=True)

    # For some commands or environments, we might want to limit this
    if command in ["transcribe", "transcribe_s"]:
        num_workers = min(num_workers, 2)  # GPU memory limits

    C.print(f"Using [bold]{num_workers}[/bold] worker processes.\n")

    manager = multiprocessing.Manager() if files else None
    progress_queue = manager.Queue() if manager else None

    def render_stage(stage_tasks):
        """Human-readable label for the task(s) a worker is currently running."""
        if not stage_tasks:
            return "Processing..."
        if not isinstance(stage_tasks, (list, tuple)):
            stage_tasks = [stage_tasks]
        names = [TaskFriendlyName.get(task, str(task)) for task in stage_tasks]
        return ", ".join(names)

    # create the spinner
    prog = Progress(SpinnerColumn(), *Progress.get_default_columns()[:-1],
                    TimeElapsedColumn(),
                    TextColumn("[magenta]{task.fields[mem]}[/magenta]"),
                    TextColumn("[cyan]{task.fields[processor]}[/cyan]"),
                    console=C, refresh_per_second=5)
    errors = []
    mem_records = {}
    mem_samples = []   # (input file size, worker peak RSS) per finished file
    last_low_mem_warn = 0.0

    def _format_bytes(count, precision=2):
        """Format a byte count with a B/KB/MB/GB/TB unit; None -> "unknown"."""
        if count is None:
            return "unknown"
        units = ["B", "KB", "MB", "GB", "TB"]
        idx = 0
        size = float(count)
        while size >= 1024 and idx < len(units) - 1:
            size /= 1024
            idx += 1
        if idx == 0:
            return f"{int(size)}{units[idx]}"
        return f"{size:.{precision}f}{units[idx]}"

    def _mem_label(base, available=None, low_mem=False):
        """Join `base` with the optional "avail ..." and "LOW MEM" markers."""
        parts = [base]
        if available is not None:
            parts.append(f"avail {_format_bytes(available, precision=1)}")
        if low_mem:
            parts.append("LOW MEM")
        return " | ".join(parts)

    def _system_memory():
        """Return (total, available) system memory in bytes, or (None, None)."""
        try:
            vm = psutil.virtual_memory()
            return vm.total, vm.available
        except Exception:
            return None, None

    def _memory_reserve(total):
        """Memory to keep free: 10% of total, but at least 2 GiB."""
        if total is None:
            return None
        return max(int(total * 0.10), 2 * 1024 * 1024 * 1024)

    def _estimate_worker_bytes(file_size):
        """Estimate a worker's memory use for an input of `file_size` bytes.

        Uses the median memory/size ratio of the files finished so far,
        clamped to 512 MiB .. 6 GiB; 512 MiB when there is no usable sample.
        """
        if not mem_samples:
            return 512 * 1024 * 1024
        ratios = [mem / size for size, mem in mem_samples if size and mem]
        if not ratios:
            return 512 * 1024 * 1024
        ratios.sort()
        median_ratio = ratios[len(ratios) // 2]
        est = int(median_ratio * file_size)
        return max(512 * 1024 * 1024, min(est, 6 * 1024 * 1024 * 1024))

    def _should_throttle(est_bytes):
        """Return (throttle, total, available); throttle when starting a
        worker of `est_bytes` would eat into the memory reserve."""
        total, available = _system_memory()
        if total is None or available is None:
            return False, total, available
        reserve = _memory_reserve(total)
        if reserve is None:
            return False, total, available
        return (available - est_bytes) < reserve, total, available

    def _live_mem_label(base):
        """`_mem_label` for `base`, annotated with the current system memory."""
        total_mem, available_mem = _system_memory()
        reserve = _memory_reserve(total_mem)
        low_mem = False
        if reserve is not None and available_mem is not None:
            low_mem = available_mem < reserve
        return _mem_label(base, available_mem, low_mem)

    # The worker handles these commands with CHATFile directly and never
    # touches loader/writer, so they are not sent across the process
    # boundary for them (which would require them to be picklable).
    worker_native = ("morphotag", "align")
    send_io = command not in worker_native

    try:
        with prog as prog:
            tasks = {}
            task_totals = {}

            for f in files:
                tasks[f] = prog.add_task(Path(f).name, start=False, total=1, processor="Waiting...", mem="queued")
                task_totals[f] = 1
                prog.start_task(tasks[f])

            def drain_progress_queue():
                """Apply every progress message currently queued by workers."""
                if not progress_queue:
                    return
                while True:
                    try:
                        file, completed, total, stage_tasks = progress_queue.get_nowait()
                    except queue.Empty:
                        break
                    except Exception:
                        break
                    if file not in tasks:
                        continue
                    task_total = max(int(total) if total else task_totals.get(file, 1), 1)
                    task_totals[file] = task_total
                    prog.update(tasks[file],
                                total=task_total,
                                completed=min(int(completed), task_total),
                                processor=render_stage(stage_tasks),
                                mem=_live_mem_label("running"))

            with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
                worker_func = partial(_worker_task,
                                      command=command,
                                      lang=lang,
                                      num_speakers=num_speakers,
                                      loader_info=loader if send_io else None,
                                      writer_info=writer if send_io else None,
                                      progress_queue=progress_queue,
                                      verbose=verbose,
                                      **kwargs)

                work_items = list(zip(files, outputs))
                next_item = 0   # index of the first file not yet submitted
                future_to_file = {}

                def submit_one(file_path, output_path):
                    """Submit one file to the pool and mark it as processing."""
                    future = executor.submit(worker_func, (file_path, output_path))
                    future_to_file[future] = file_path
                    est_bytes = _estimate_worker_bytes(file_sizes.get(file_path, 0))
                    prog.update(
                        tasks[file_path],
                        processor="Processing...",
                        mem=_live_mem_label(f"est {_format_bytes(est_bytes)}"),
                    )

                def schedule_available():
                    """Submit files until the pool is full or memory is low."""
                    nonlocal last_low_mem_warn, next_item
                    while next_item < len(work_items) and len(future_to_file) < num_workers:
                        # Peek only: the file is consumed when it is actually
                        # submitted, so a throttled file is retried later
                        # instead of being dropped.
                        next_file, next_output = work_items[next_item]
                        est_bytes = _estimate_worker_bytes(file_sizes.get(next_file, 0))
                        throttle, total, available = _should_throttle(est_bytes)
                        if throttle and future_to_file:
                            now = time.time()
                            # warn at most once every 10 seconds
                            if now - last_low_mem_warn > 10:
                                reserve = _memory_reserve(total)
                                prog.console.print(
                                    f"[bold yellow]Low memory[/bold yellow]: "
                                    f"{_format_bytes(available)} free, "
                                    f"{_format_bytes(reserve)} reserve. "
                                    f"Throttling new workers."
                                )
                                last_low_mem_warn = now
                            break
                        if throttle and not future_to_file:
                            # nothing is running, so waiting cannot free
                            # memory: go ahead with this one file
                            prog.console.print(
                                f"[bold yellow]Low memory[/bold yellow]: "
                                f"{_format_bytes(available)} free. "
                                "Continuing with a single worker."
                            )
                        next_item += 1
                        submit_one(next_file, next_output)

                schedule_available()

                pending = set(future_to_file.keys())
                while pending:
                    done, pending = concurrent.futures.wait(
                        pending,
                        timeout=0.1,
                        return_when=concurrent.futures.FIRST_COMPLETED,
                    )
                    drain_progress_queue()

                    for future in done:
                        file = future_to_file[future]
                        future_to_file.pop(future, None)
                        try:
                            res_file, trcbk, e, captured, mem_info = future.result()
                            final_total = max(task_totals.get(file, 1), 1)
                            if e:
                                prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold red]FAIL[/bold red]")
                                errors.append((res_file, trcbk, e, captured))
                            else:
                                prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold green]DONE[/bold green]")
                                if verbose >= 1 and captured.strip():
                                    prog.console.print(f"[bold blue]INFO[/bold blue] on file [italic]{Path(file).name}[/italic]:\n{escape(captured.strip())}\n")
                            if mem_info:
                                mem_records[file] = mem_info
                                peak = mem_info.get("rss_peak") or mem_info.get("rss_end")
                                if peak:
                                    mem_samples.append((file_sizes.get(file, 0), peak))
                                prog.update(tasks[file], mem=_live_mem_label(_format_bytes(peak)))
                        except Exception as e:
                            # the worker itself died or its result could not
                            # be transferred back
                            final_total = max(task_totals.get(file, 1), 1)
                            prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold red]FAIL[/bold red]")
                            errors.append((file, traceback.format_exc(), e, ""))

                    # refill the pool; also retried on every poll so that
                    # throttled files start once memory frees up
                    schedule_available()
                    pending = set(future_to_file.keys())
                drain_progress_queue()
    finally:
        if manager:
            manager.shutdown()

    if errors:
        C.print()
        for file, trcbk, e, captured in errors:
            rel_path = os.path.relpath(str(Path(file).absolute()), in_dir)
            if e:
                C.print(f"[bold red]ERROR[/bold red] on file [italic]{rel_path}[/italic]: {escape(str(e))}\n")
                if captured.strip():
                    C.print(f"[dim]Captured Worker Output:[/dim]\n{escape(captured.strip())}\n")
                if verbose == 1:
                    C.print(escape(str(trcbk)))
                elif verbose > 1:
                    # C writes to the temporary file at this verbosity, so
                    # use a fresh Console for the traceback
                    Console().print(escape(str(trcbk)))
            elif captured.strip():
                C.print(f"[bold blue]INFO[/bold blue] on file [italic]{rel_path}[/italic]:\n")
                C.print(f"{escape(captured.strip())}\n")
    else:
        C.print(f"\nAll done. Results saved to {out_dir}!\n")

    if mem_records and verbose >= 1:
        C.print("\nMemory usage per file (worker RSS peak):")
        for file, info in mem_records.items():
            rel_path = os.path.relpath(str(Path(file).absolute()), in_dir)
            peak = info.get("rss_peak") or info.get("rss_end")
            C.print(f"- {rel_path}: {_format_bytes(peak)}")
        total, available = _system_memory()
        if total is not None and available is not None:
            C.print(f"\nSystem memory available: {_format_bytes(available)} / {_format_bytes(total)}")

    if verbose > 1:
        C.end_capture()

    if __tf:
        __tf.close()
@@ -0,0 +1,3 @@
1
+ 0.8.0-post.2
2
+ Jan 15th, 2025
3
+ Memory Safeguards
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.8.0
3
+ Version: 0.8.0.post2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu