batchalign 0.8.0.post1__tar.gz → 0.8.0.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of batchalign might be problematic. Click here for more details.

Files changed (144)
  1. {batchalign-0.8.0.post1/batchalign.egg-info → batchalign-0.8.0.post2}/PKG-INFO +1 -1
  2. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/cli/dispatch.py +186 -14
  3. batchalign-0.8.0.post2/batchalign/version +3 -0
  4. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2/batchalign.egg-info}/PKG-INFO +1 -1
  5. batchalign-0.8.0.post1/batchalign/version +0 -3
  6. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/LICENSE +0 -0
  7. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/MANIFEST.in +0 -0
  8. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/README.md +0 -0
  9. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/__init__.py +0 -0
  10. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/__main__.py +0 -0
  11. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/cli/__init__.py +0 -0
  12. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/cli/cli.py +0 -0
  13. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/constants.py +0 -0
  14. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/document.py +0 -0
  15. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/errors.py +0 -0
  16. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/__init__.py +0 -0
  17. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/base.py +0 -0
  18. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/chat/__init__.py +0 -0
  19. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/chat/file.py +0 -0
  20. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/chat/generator.py +0 -0
  21. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/chat/lexer.py +0 -0
  22. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/chat/parser.py +0 -0
  23. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/chat/utils.py +0 -0
  24. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/__init__.py +0 -0
  25. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/file.py +0 -0
  26. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/generator.py +0 -0
  27. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/formats/textgrid/parser.py +0 -0
  28. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/__init__.py +0 -0
  29. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/resolve.py +0 -0
  30. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/speaker/__init__.py +0 -0
  31. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/speaker/config.yaml +0 -0
  32. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/speaker/infer.py +0 -0
  33. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/speaker/utils.py +0 -0
  34. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/training/__init__.py +0 -0
  35. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/training/run.py +0 -0
  36. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/training/utils.py +0 -0
  37. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utils.py +0 -0
  38. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utterance/__init__.py +0 -0
  39. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utterance/cantonese_infer.py +0 -0
  40. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utterance/dataset.py +0 -0
  41. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utterance/execute.py +0 -0
  42. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utterance/infer.py +0 -0
  43. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utterance/prep.py +0 -0
  44. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/utterance/train.py +0 -0
  45. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/wave2vec/__init__.py +0 -0
  46. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/wave2vec/infer_fa.py +0 -0
  47. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/whisper/__init__.py +0 -0
  48. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/whisper/infer_asr.py +0 -0
  49. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/models/whisper/infer_fa.py +0 -0
  50. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/__init__.py +0 -0
  51. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/analysis/__init__.py +0 -0
  52. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/analysis/eval.py +0 -0
  53. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/__init__.py +0 -0
  54. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2chinese.py +0 -0
  55. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
  56. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
  57. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
  58. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
  59. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
  60. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
  61. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
  62. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
  63. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
  64. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
  65. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/por.py +0 -0
  66. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
  67. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
  68. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  69. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/rev.py +0 -0
  70. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/utils.py +0 -0
  71. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/whisper.py +0 -0
  72. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/asr/whisperx.py +0 -0
  73. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/avqi/__init__.py +0 -0
  74. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/avqi/engine.py +0 -0
  75. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/base.py +0 -0
  76. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/__init__.py +0 -0
  77. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  78. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  79. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  80. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/retrace.py +0 -0
  81. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  82. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  83. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/cleanup/support/test.test +0 -0
  84. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/diarization/__init__.py +0 -0
  85. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/diarization/pyannote.py +0 -0
  86. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/dispatch.py +0 -0
  87. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/fa/__init__.py +0 -0
  88. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  89. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  90. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  91. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  92. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  93. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  94. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  95. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  96. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  97. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  98. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/opensmile/__init__.py +0 -0
  99. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/opensmile/engine.py +0 -0
  100. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/pipeline.py +0 -0
  101. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/speaker/__init__.py +0 -0
  102. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  103. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/__init__.py +0 -0
  104. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/gtrans.py +0 -0
  105. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/seamless.py +0 -0
  106. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/translate/utils.py +0 -0
  107. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/__init__.py +0 -0
  108. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/rev_utr.py +0 -0
  109. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/utils.py +0 -0
  110. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  111. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/utterance/__init__.py +0 -0
  112. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  113. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/__init__.py +0 -0
  114. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/conftest.py +0 -0
  115. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  116. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  117. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  118. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  119. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  120. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  121. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  122. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  123. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  124. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  125. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  126. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  127. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/fixures.py +0 -0
  128. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  129. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  130. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/tests/test_document.py +0 -0
  131. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/utils/__init__.py +0 -0
  132. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/utils/abbrev.py +0 -0
  133. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/utils/compounds.py +0 -0
  134. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/utils/config.py +0 -0
  135. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/utils/dp.py +0 -0
  136. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/utils/names.py +0 -0
  137. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign/utils/utils.py +0 -0
  138. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign.egg-info/SOURCES.txt +0 -0
  139. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign.egg-info/dependency_links.txt +0 -0
  140. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign.egg-info/entry_points.txt +0 -0
  141. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign.egg-info/requires.txt +0 -0
  142. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/batchalign.egg-info/top_level.txt +0 -0
  143. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/setup.cfg +0 -0
  144. {batchalign-0.8.0.post1 → batchalign-0.8.0.post2}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.8.0.post1
3
+ Version: 0.8.0.post2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -32,6 +32,7 @@ import time
32
32
  import traceback
33
33
  import logging as L
34
34
  baL = L.getLogger('batchalign')
35
+ import psutil
35
36
 
36
37
  warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
37
38
 
@@ -55,6 +56,29 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
55
56
 
56
57
  file, output = file_info
57
58
  pid = os.getpid()
59
+ rss_start = None
60
+ rss_end = None
61
+ rss_peak = None
62
+
63
+ def _safe_rss():
64
+ try:
65
+ import psutil
66
+ return psutil.Process(pid).memory_info().rss
67
+ except Exception:
68
+ return None
69
+
70
+ def _safe_peak_rss():
71
+ try:
72
+ import resource
73
+ peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
74
+ if peak is None:
75
+ return None
76
+ # ru_maxrss is KB on Linux, bytes on macOS; normalize to bytes.
77
+ return int(peak * 1024) if peak < 1024 * 1024 * 1024 else int(peak)
78
+ except Exception:
79
+ return None
80
+
81
+ rss_start = _safe_rss()
58
82
 
59
83
  # Configure logging in this worker process
60
84
  if verbose >= 1:
@@ -70,9 +94,8 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
70
94
  else:
71
95
  baL.setLevel(logging.DEBUG)
72
96
 
73
- # Only capture output if not in verbose mode
74
- # In verbose mode, let logs stream naturally to the console
75
- should_capture = verbose == 0
97
+ # Always capture output to avoid interleaving with progress rendering.
98
+ should_capture = True
76
99
 
77
100
  if should_capture:
78
101
  # Use a temporary file to capture ALL output at the FD level
@@ -152,7 +175,15 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
152
175
  else:
153
176
  captured = ""
154
177
 
155
- return file, None, None, captured
178
+ rss_end = _safe_rss()
179
+ rss_peak = _safe_peak_rss()
180
+ mem_info = {
181
+ "pid": pid,
182
+ "rss_start": rss_start,
183
+ "rss_end": rss_end,
184
+ "rss_peak": rss_peak,
185
+ }
186
+ return file, None, None, captured, mem_info
156
187
  except Exception as e:
157
188
  # Flush and read captured output if we were capturing
158
189
  if should_capture:
@@ -162,7 +193,15 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
162
193
  captured = log_file.read()
163
194
  else:
164
195
  captured = ""
165
- return file, traceback.format_exc(), e, captured
196
+ rss_end = _safe_rss()
197
+ rss_peak = _safe_peak_rss()
198
+ mem_info = {
199
+ "pid": pid,
200
+ "rss_start": rss_start,
201
+ "rss_end": rss_end,
202
+ "rss_peak": rss_peak,
203
+ }
204
+ return file, traceback.format_exc(), e, captured, mem_info
166
205
  finally:
167
206
  # Restore original FDs only if we redirected them
168
207
  if should_capture:
@@ -255,6 +294,7 @@ def _dispatch(command, lang, num_speakers,
255
294
  file_pairs = list(zip(files, outputs))
256
295
  file_pairs.sort(key=lambda fo: os.path.getsize(fo[0]) if os.path.exists(fo[0]) else 0, reverse=True)
257
296
  files, outputs = zip(*file_pairs) if file_pairs else ([], [])
297
+ file_sizes = {f: os.path.getsize(f) if os.path.exists(f) else 0 for f in files}
258
298
 
259
299
  C.print(f"\nMode: [blue]{command}[/blue]; got [bold cyan]{len(files)}[/bold cyan] transcript{'s' if len(files) > 1 else ''} to process from {in_dir}:\n")
260
300
 
@@ -289,8 +329,66 @@ def _dispatch(command, lang, num_speakers,
289
329
  # create the spinner
290
330
  prog = Progress(SpinnerColumn(), *Progress.get_default_columns()[:-1],
291
331
  TimeElapsedColumn(),
292
- TextColumn("[cyan]{task.fields[processor]}[/cyan]"), console=C)
332
+ TextColumn("[magenta]{task.fields[mem]}[/magenta]"),
333
+ TextColumn("[cyan]{task.fields[processor]}[/cyan]"),
334
+ console=C, refresh_per_second=5)
293
335
  errors = []
336
+ mem_records = {}
337
+ mem_samples = []
338
+ last_low_mem_warn = 0.0
339
+
340
+ def _format_bytes(count, precision=2):
341
+ if count is None:
342
+ return "unknown"
343
+ units = ["B", "KB", "MB", "GB", "TB"]
344
+ idx = 0
345
+ size = float(count)
346
+ while size >= 1024 and idx < len(units) - 1:
347
+ size /= 1024
348
+ idx += 1
349
+ if idx == 0:
350
+ return f"{int(size)}{units[idx]}"
351
+ return f"{size:.{precision}f}{units[idx]}"
352
+
353
+ def _mem_label(base, available=None, low_mem=False):
354
+ parts = [base]
355
+ if available is not None:
356
+ parts.append(f"avail {_format_bytes(available, precision=1)}")
357
+ if low_mem:
358
+ parts.append("LOW MEM")
359
+ return " | ".join(parts)
360
+
361
+ def _system_memory():
362
+ try:
363
+ vm = psutil.virtual_memory()
364
+ return vm.total, vm.available
365
+ except Exception:
366
+ return None, None
367
+
368
+ def _memory_reserve(total):
369
+ if total is None:
370
+ return None
371
+ return max(int(total * 0.10), 2 * 1024 * 1024 * 1024)
372
+
373
+ def _estimate_worker_bytes(file_size):
374
+ if not mem_samples:
375
+ return 512 * 1024 * 1024
376
+ ratios = [mem / size for size, mem in mem_samples if size and mem]
377
+ if not ratios:
378
+ return 512 * 1024 * 1024
379
+ ratios.sort()
380
+ median_ratio = ratios[len(ratios) // 2]
381
+ est = int(median_ratio * file_size)
382
+ return max(512 * 1024 * 1024, min(est, 6 * 1024 * 1024 * 1024))
383
+
384
+ def _should_throttle(est_bytes):
385
+ total, available = _system_memory()
386
+ if total is None or available is None:
387
+ return False, total, available
388
+ reserve = _memory_reserve(total)
389
+ if reserve is None:
390
+ return False, total, available
391
+ return (available - est_bytes) < reserve, total, available
294
392
 
295
393
  try:
296
394
  with prog as prog:
@@ -298,8 +396,9 @@ def _dispatch(command, lang, num_speakers,
298
396
  task_totals = {}
299
397
 
300
398
  for f in files:
301
- tasks[f] = prog.add_task(Path(f).name, start=False, total=1, processor="Waiting...")
399
+ tasks[f] = prog.add_task(Path(f).name, start=False, total=1, processor="Waiting...", mem="queued")
302
400
  task_totals[f] = 1
401
+ prog.start_task(tasks[f])
303
402
 
304
403
  def drain_progress_queue():
305
404
  if not progress_queue:
@@ -315,10 +414,16 @@ def _dispatch(command, lang, num_speakers,
315
414
  continue
316
415
  task_total = max(int(total) if total else task_totals.get(file, 1), 1)
317
416
  task_totals[file] = task_total
417
+ total_mem, available_mem = _system_memory()
418
+ reserve = _memory_reserve(total_mem)
419
+ low_mem = False
420
+ if reserve is not None and available_mem is not None:
421
+ low_mem = available_mem < reserve
318
422
  prog.update(tasks[file],
319
423
  total=task_total,
320
424
  completed=min(int(completed), task_total),
321
- processor=render_stage(stage_tasks))
425
+ processor=render_stage(stage_tasks),
426
+ mem=_mem_label("running", available_mem, low_mem))
322
427
 
323
428
  with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
324
429
  worker_func = partial(_worker_task,
@@ -331,11 +436,54 @@ def _dispatch(command, lang, num_speakers,
331
436
  verbose=ctx.obj["verbose"],
332
437
  **kwargs)
333
438
 
334
- future_to_file = {executor.submit(worker_func, (f, o)): f for f, o in zip(files, outputs)}
439
+ file_iter = iter(zip(files, outputs))
440
+ future_to_file = {}
441
+
442
+ def submit_one(file_path, output_path):
443
+ future = executor.submit(worker_func, (file_path, output_path))
444
+ future_to_file[future] = file_path
445
+ est_bytes = _estimate_worker_bytes(file_sizes.get(file_path, 0))
446
+ total_mem, available_mem = _system_memory()
447
+ reserve = _memory_reserve(total_mem)
448
+ low_mem = False
449
+ if reserve is not None and available_mem is not None:
450
+ low_mem = available_mem < reserve
451
+ prog.update(
452
+ tasks[file_path],
453
+ processor="Processing...",
454
+ mem=_mem_label(f"est {_format_bytes(est_bytes)}", available_mem, low_mem),
455
+ )
335
456
 
336
- for f in files:
337
- prog.start_task(tasks[f])
338
- prog.update(tasks[f], processor="Processing...")
457
+ def schedule_available():
458
+ nonlocal last_low_mem_warn
459
+ while len(future_to_file) < num_workers:
460
+ try:
461
+ next_file, next_output = next(file_iter)
462
+ except StopIteration:
463
+ break
464
+ est_bytes = _estimate_worker_bytes(file_sizes.get(next_file, 0))
465
+ throttle, total, available = _should_throttle(est_bytes)
466
+ if throttle and future_to_file:
467
+ now = time.time()
468
+ if now - last_low_mem_warn > 10:
469
+ reserve = _memory_reserve(total)
470
+ prog.console.print(
471
+ f"[bold yellow]Low memory[/bold yellow]: "
472
+ f"{_format_bytes(available)} free, "
473
+ f"{_format_bytes(reserve)} reserve. "
474
+ f"Throttling new workers."
475
+ )
476
+ last_low_mem_warn = now
477
+ break
478
+ if throttle and not future_to_file:
479
+ prog.console.print(
480
+ f"[bold yellow]Low memory[/bold yellow]: "
481
+ f"{_format_bytes(available)} free. "
482
+ "Continuing with a single worker."
483
+ )
484
+ submit_one(next_file, next_output)
485
+
486
+ schedule_available()
339
487
 
340
488
  pending = set(future_to_file.keys())
341
489
  while pending:
@@ -348,8 +496,9 @@ def _dispatch(command, lang, num_speakers,
348
496
 
349
497
  for future in done:
350
498
  file = future_to_file[future]
499
+ future_to_file.pop(future, None)
351
500
  try:
352
- res_file, trcbk, e, captured = future.result()
501
+ res_file, trcbk, e, captured, mem_info = future.result()
353
502
  final_total = max(task_totals.get(file, 1), 1)
354
503
  if e:
355
504
  prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold red]FAIL[/bold red]")
@@ -357,12 +506,25 @@ def _dispatch(command, lang, num_speakers,
357
506
  else:
358
507
  prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold green]DONE[/bold green]")
359
508
  if ctx.obj["verbose"] >= 1 and captured.strip():
360
- errors.append((res_file, "Logs only (Success)", None, captured))
509
+ prog.console.print(f"[bold blue]INFO[/bold blue] on file [italic]{Path(file).name}[/italic]:\n{escape(captured.strip())}\n")
510
+ if mem_info:
511
+ mem_records[file] = mem_info
512
+ peak = mem_info.get("rss_peak") or mem_info.get("rss_end")
513
+ if peak:
514
+ mem_samples.append((file_sizes.get(file, 0), peak))
515
+ total_mem, available_mem = _system_memory()
516
+ reserve = _memory_reserve(total_mem)
517
+ low_mem = False
518
+ if reserve is not None and available_mem is not None:
519
+ low_mem = available_mem < reserve
520
+ prog.update(tasks[file], mem=_mem_label(_format_bytes(peak), available_mem, low_mem))
361
521
  except Exception as e:
362
522
  final_total = max(task_totals.get(file, 1), 1)
363
523
  prog.update(tasks[file], total=final_total, completed=final_total, processor="[bold red]FAIL[/bold red]")
364
524
  errors.append((file, traceback.format_exc(), e, ""))
365
525
 
526
+ schedule_available()
527
+ pending = set(future_to_file.keys())
366
528
  drain_progress_queue()
367
529
  finally:
368
530
  if manager:
@@ -386,6 +548,16 @@ def _dispatch(command, lang, num_speakers,
386
548
  else:
387
549
  C.print(f"\nAll done. Results saved to {out_dir}!\n")
388
550
 
551
+ if mem_records and ctx.obj["verbose"] >= 1:
552
+ C.print("\nMemory usage per file (worker RSS peak):")
553
+ for file, info in mem_records.items():
554
+ rel_path = os.path.relpath(str(Path(file).absolute()), in_dir)
555
+ peak = info.get("rss_peak") or info.get("rss_end")
556
+ C.print(f"- {rel_path}: {_format_bytes(peak)}")
557
+ total, available = _system_memory()
558
+ if total is not None and available is not None:
559
+ C.print(f"\nSystem memory available: {_format_bytes(available)} / {_format_bytes(total)}")
560
+
389
561
  if ctx.obj["verbose"] > 1:
390
562
  C.end_capture()
391
563
 
@@ -0,0 +1,3 @@
1
+ 0.8.0-post.2
2
+ Jan 15th, 2025
3
+ Memory Safeguards
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.8.0.post1
3
+ Version: 0.8.0.post2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.8.0-post.1
2
- Jan 13th, 2025
3
- Speed