BatchalignHK 0.7.19.post8__tar.gz → 0.7.19.post9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/PKG-INFO +1 -1
  2. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/PKG-INFO +1 -1
  3. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/tencent.py +17 -72
  4. batchalignhk-0.7.19.post9/batchalign/version +3 -0
  5. batchalignhk-0.7.19.post8/batchalign/version +0 -3
  6. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/SOURCES.txt +0 -0
  7. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  8. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/entry_points.txt +0 -0
  9. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/requires.txt +0 -0
  10. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/top_level.txt +0 -0
  11. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/LICENSE +0 -0
  12. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/MANIFEST.in +0 -0
  13. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/README.md +0 -0
  14. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/__init__.py +0 -0
  15. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/__main__.py +0 -0
  16. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/cli/__init__.py +0 -0
  17. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/cli/cli.py +0 -0
  18. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/cli/dispatch.py +0 -0
  19. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/constants.py +0 -0
  20. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/document.py +0 -0
  21. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/errors.py +0 -0
  22. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/__init__.py +0 -0
  23. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/base.py +0 -0
  24. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/__init__.py +0 -0
  25. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/file.py +0 -0
  26. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/generator.py +0 -0
  27. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/lexer.py +0 -0
  28. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/parser.py +0 -0
  29. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/utils.py +0 -0
  30. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/__init__.py +0 -0
  31. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/file.py +0 -0
  32. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/generator.py +0 -0
  33. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/parser.py +0 -0
  34. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/__init__.py +0 -0
  35. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/resolve.py +0 -0
  36. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/__init__.py +0 -0
  37. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/config.yaml +0 -0
  38. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/infer.py +0 -0
  39. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/utils.py +0 -0
  40. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/training/__init__.py +0 -0
  41. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/training/run.py +0 -0
  42. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/training/utils.py +0 -0
  43. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utils.py +0 -0
  44. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/__init__.py +0 -0
  45. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/cantonese_infer.py +0 -0
  46. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/dataset.py +0 -0
  47. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/execute.py +0 -0
  48. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/infer.py +0 -0
  49. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/prep.py +0 -0
  50. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/train.py +0 -0
  51. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/wave2vec/__init__.py +0 -0
  52. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/wave2vec/infer_fa.py +0 -0
  53. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/whisper/__init__.py +0 -0
  54. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/whisper/infer_asr.py +0 -0
  55. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/models/whisper/infer_fa.py +0 -0
  56. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/__init__.py +0 -0
  57. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/analysis/__init__.py +0 -0
  58. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/analysis/eval.py +0 -0
  59. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/__init__.py +0 -0
  60. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/num2chinese.py +0 -0
  61. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  62. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/rev.py +0 -0
  63. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/utils.py +0 -0
  64. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/whisper.py +0 -0
  65. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/whisperx.py +0 -0
  66. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/base.py +0 -0
  67. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/__init__.py +0 -0
  68. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  69. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  70. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  71. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/retrace.py +0 -0
  72. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  73. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  74. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/support/test.test +0 -0
  75. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/dispatch.py +0 -0
  76. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/fa/__init__.py +0 -0
  77. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  78. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  79. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  80. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  81. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  82. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  83. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  84. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  85. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  86. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  87. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/pipeline.py +0 -0
  88. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/speaker/__init__.py +0 -0
  89. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  90. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/__init__.py +0 -0
  91. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/gtrans.py +0 -0
  92. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/seamless.py +0 -0
  93. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/utils.py +0 -0
  94. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/__init__.py +0 -0
  95. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/rev_utr.py +0 -0
  96. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/utils.py +0 -0
  97. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  98. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utterance/__init__.py +0 -0
  99. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  100. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/__init__.py +0 -0
  101. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/conftest.py +0 -0
  102. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  103. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  104. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  105. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  106. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  107. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  108. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  109. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  110. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  111. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  112. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  113. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  114. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/fixures.py +0 -0
  115. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  116. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  117. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/tests/test_document.py +0 -0
  118. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/utils/__init__.py +0 -0
  119. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/utils/abbrev.py +0 -0
  120. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/utils/config.py +0 -0
  121. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/utils/dp.py +0 -0
  122. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/utils/names.py +0 -0
  123. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/batchalign/utils/utils.py +0 -0
  124. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/setup.cfg +0 -0
  125. {batchalignhk-0.7.19.post8 → batchalignhk-0.7.19.post9}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.19.post8
3
+ Version: 0.7.19.post9
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.19.post8
3
+ Version: 0.7.19.post9
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -21,8 +21,8 @@ import tempfile
21
21
  import pycountry
22
22
  import numpy as np
23
23
  import soundfile as sf
24
- from pydub import AudioSegment
25
- from pydub.effects import normalize
24
+ # from pydub import AudioSegment
25
+ # from pydub.effects import normalize
26
26
  import base64
27
27
  from tencentcloud.common.credential import Credential
28
28
  from tencentcloud.asr.v20190614.asr_client import AsrClient, models
@@ -30,9 +30,9 @@ from tencentcloud.asr.v20190614.asr_client import AsrClient, models
30
30
  import asyncio
31
31
  import tempfile
32
32
  import os
33
- from pydub import AudioSegment
34
- from pydub.effects import normalize
35
- from pydub.exceptions import CouldntDecodeError
33
+ # from pydub import AudioSegment
34
+ # from pydub.effects import normalize
35
+ # from pydub.exceptions import CouldntDecodeError
36
36
 
37
37
 
38
38
  import logging
@@ -77,66 +77,6 @@ class TencentEngine(BatchalignEngine):
77
77
  L.debug("Done.")
78
78
  else:
79
79
  self.__engine = None
80
-
81
- def __preprocess_audio(self, input_path):
82
- """Enhanced audio preprocessing for low-volume speech"""
83
- try:
84
- L.info(f"Optimizing audio for ASR: {input_path}")
85
-
86
- # read the audio file
87
- audio = AudioSegment.from_file(input_path)
88
-
89
- audio = audio.set_channels(1)
90
- audio = audio.set_frame_rate(16000)
91
-
92
-
93
- audio = audio.compress_dynamic_range(
94
- threshold=-40,
95
- ratio=3,
96
- attack=5,
97
- release=100
98
- )
99
- audio = audio.low_pass_filter(4000) # filter out high frequencies
100
- audio = audio.normalize(headroom=2) # keep the headroom
101
- audio = audio.compress_dynamic_range(
102
- threshold=-55,
103
- ratio=6,
104
- attack=15,
105
- release=200
106
- )
107
-
108
- # enhance low volume
109
- audio = audio.high_pass_filter(80)
110
- boosted = audio.high_pass_filter(1000).apply_gain(+4)
111
- audio = audio.overlay(boosted)
112
-
113
- if L.level <= logging.DEBUG:
114
- self.__print_audio_stats(audio)
115
-
116
- # output to a temporary file
117
- temp_fd, temp_path = tempfile.mkstemp(suffix=".mp3")
118
- os.close(temp_fd)
119
- audio.export(
120
- temp_path,
121
- format="mp3",
122
- codec="libmp3lame",
123
- bitrate="96k",
124
- tags={"title": "BA_Optimized"},
125
- parameters=[
126
- "-compression_level", "2",
127
- "-reservoir", "0",
128
- "-joint_stereo", "0"
129
- ]
130
- )
131
-
132
- return temp_path
133
-
134
- except CouldntDecodeError:
135
- L.error(f"Audio decoding failed: {input_path}")
136
- return input_path
137
- except Exception as e:
138
- L.error(f"Audio processing error: {str(e)}")
139
- return input_path
140
80
 
141
81
  def replace_cantonese_words(self, word):
142
82
  """Function to replace Cantonese words with custom replacements."""
@@ -176,13 +116,15 @@ class TencentEngine(BatchalignEngine):
176
116
  lang = self.__lang
177
117
  client = self.__client
178
118
 
179
- processed_path = self.__preprocess_audio(f)
180
- audio = AudioSegment.from_file(processed_path)
119
+ # processed_path = self.__preprocess_audio(f)
120
+ # audio = AudioSegment.from_file(processed_path)
181
121
 
182
122
  try:
183
123
  L.info(f"Uploading '{pathlib.Path(f).stem}'...")
184
- with open(processed_path, "rb") as audio_file:
185
- encoded_string = base64.b64encode(audio_file.read())
124
+ # we will send the file for processing
125
+ if not str(f).startswith("http"):
126
+ with open(f, "rb") as image_file:
127
+ encoded_string = base64.b64encode(image_file.read())
186
128
 
187
129
  req = models.CreateRecTaskRequest()
188
130
  if lang in {'zho', 'yue', 'wuu', 'nan','hak'}:
@@ -192,9 +134,12 @@ class TencentEngine(BatchalignEngine):
192
134
  req.ResTextFormat = 1
193
135
  req.SpeakerDiarization = 1
194
136
  req.ChannelNum = 1
195
- req.Data = encoded_string.decode('ascii')
196
- req.SourceType = 1
197
-
137
+ if not str(f).startswith("http"):
138
+ req.Data = encoded_string.decode('ascii')
139
+ req.SourceType = 1
140
+ else:
141
+ req.Url = f
142
+ req.SourceType = 0
198
143
  resp = client.CreateRecTask(req)
199
144
 
200
145
  L.info(f"Tencent is transcribing '{pathlib.Path(f).stem}'...")
@@ -0,0 +1,3 @@
1
+ 0.7.19-post.9
2
+ May 24th, 2025
3
+ reverts file only prep changes
@@ -1,3 +0,0 @@
1
- 0.7.19-post.8
2
- May 23th, 2025
3
- abbreviations