BatchalignHK 0.7.19.post7__tar.gz → 0.7.19.post9__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (125)
  1. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/PKG-INFO +2 -3
  2. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/SOURCES.txt +1 -0
  3. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/PKG-INFO +2 -3
  4. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/document.py +6 -0
  5. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/tencent.py +17 -72
  6. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/ud.py +1 -0
  7. batchalignhk-0.7.19.post9/batchalign/utils/abbrev.py +182 -0
  8. batchalignhk-0.7.19.post9/batchalign/version +3 -0
  9. batchalignhk-0.7.19.post7/batchalign/version +0 -3
  10. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  11. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/entry_points.txt +0 -0
  12. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/requires.txt +0 -0
  13. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/BatchalignHK.egg-info/top_level.txt +0 -0
  14. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/LICENSE +0 -0
  15. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/MANIFEST.in +0 -0
  16. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/README.md +0 -0
  17. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/__init__.py +0 -0
  18. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/__main__.py +0 -0
  19. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/cli/__init__.py +0 -0
  20. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/cli/cli.py +0 -0
  21. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/cli/dispatch.py +0 -0
  22. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/constants.py +0 -0
  23. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/errors.py +0 -0
  24. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/__init__.py +0 -0
  25. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/base.py +0 -0
  26. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/__init__.py +0 -0
  27. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/file.py +0 -0
  28. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/generator.py +0 -0
  29. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/lexer.py +0 -0
  30. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/parser.py +0 -0
  31. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/chat/utils.py +0 -0
  32. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/__init__.py +0 -0
  33. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/file.py +0 -0
  34. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/generator.py +0 -0
  35. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/formats/textgrid/parser.py +0 -0
  36. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/__init__.py +0 -0
  37. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/resolve.py +0 -0
  38. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/__init__.py +0 -0
  39. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/config.yaml +0 -0
  40. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/infer.py +0 -0
  41. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/speaker/utils.py +0 -0
  42. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/training/__init__.py +0 -0
  43. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/training/run.py +0 -0
  44. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/training/utils.py +0 -0
  45. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utils.py +0 -0
  46. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/__init__.py +0 -0
  47. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/cantonese_infer.py +0 -0
  48. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/dataset.py +0 -0
  49. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/execute.py +0 -0
  50. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/infer.py +0 -0
  51. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/prep.py +0 -0
  52. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/utterance/train.py +0 -0
  53. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/wave2vec/__init__.py +0 -0
  54. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/wave2vec/infer_fa.py +0 -0
  55. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/whisper/__init__.py +0 -0
  56. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/whisper/infer_asr.py +0 -0
  57. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/models/whisper/infer_fa.py +0 -0
  58. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/__init__.py +0 -0
  59. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/analysis/__init__.py +0 -0
  60. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/analysis/eval.py +0 -0
  61. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/__init__.py +0 -0
  62. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/num2chinese.py +0 -0
  63. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  64. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/rev.py +0 -0
  65. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/utils.py +0 -0
  66. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/whisper.py +0 -0
  67. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/asr/whisperx.py +0 -0
  68. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/base.py +0 -0
  69. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/__init__.py +0 -0
  70. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  71. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  72. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  73. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/retrace.py +0 -0
  74. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  75. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  76. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/cleanup/support/test.test +0 -0
  77. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/dispatch.py +0 -0
  78. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/fa/__init__.py +0 -0
  79. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  80. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  81. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  82. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  83. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  84. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  85. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  86. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  87. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  88. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/pipeline.py +0 -0
  89. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/speaker/__init__.py +0 -0
  90. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  91. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/__init__.py +0 -0
  92. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/gtrans.py +0 -0
  93. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/seamless.py +0 -0
  94. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/translate/utils.py +0 -0
  95. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/__init__.py +0 -0
  96. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/rev_utr.py +0 -0
  97. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/utils.py +0 -0
  98. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  99. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utterance/__init__.py +0 -0
  100. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  101. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/__init__.py +0 -0
  102. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/conftest.py +0 -0
  103. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  104. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  105. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  106. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  107. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  108. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  109. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  110. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  111. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  112. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  113. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  114. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  115. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/fixures.py +0 -0
  116. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  117. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  118. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/tests/test_document.py +0 -0
  119. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/utils/__init__.py +0 -0
  120. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/utils/config.py +0 -0
  121. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/utils/dp.py +0 -0
  122. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/utils/names.py +0 -0
  123. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/batchalign/utils/utils.py +0 -0
  124. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/setup.cfg +0 -0
  125. {batchalignhk-0.7.19.post7 → batchalignhk-0.7.19.post9}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.19.post7
3
+ Version: 0.7.19.post9
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -50,7 +50,6 @@ Dynamic: author-email
50
50
  Dynamic: classifier
51
51
  Dynamic: description
52
52
  Dynamic: description-content-type
53
- Dynamic: license-file
54
53
  Dynamic: provides-extra
55
54
  Dynamic: requires-dist
56
55
  Dynamic: summary
@@ -115,6 +115,7 @@ batchalign/tests/pipelines/cleanup/test_disfluency.py
115
115
  batchalign/tests/pipelines/cleanup/test_parse_support.py
116
116
  batchalign/tests/pipelines/fa/test_fa_pipeline.py
117
117
  batchalign/utils/__init__.py
118
+ batchalign/utils/abbrev.py
118
119
  batchalign/utils/config.py
119
120
  batchalign/utils/dp.py
120
121
  batchalign/utils/names.py
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.19.post7
3
+ Version: 0.7.19.post9
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -50,7 +50,6 @@ Dynamic: author-email
50
50
  Dynamic: classifier
51
51
  Dynamic: description
52
52
  Dynamic: description-content-type
53
- Dynamic: license-file
54
53
  Dynamic: provides-extra
55
54
  Dynamic: requires-dist
56
55
  Dynamic: summary
@@ -11,6 +11,7 @@ from pathlib import Path
11
11
 
12
12
  from batchalign.errors import *
13
13
  from batchalign.constants import *
14
+ from batchalign.utils.abbrev import abbrev
14
15
 
15
16
  import re
16
17
 
@@ -223,6 +224,11 @@ class Utterance(BaseModel):
223
224
  t = re.sub(r",", " , ", t.strip()).strip()
224
225
  t = re.sub(r" +", " ", t.strip()).strip()
225
226
  t = t.replace("+ ,", "+,").strip()
227
+
228
+ abbrevs = [" " .join(list(i)) for i in abbrev]
229
+ for i in abbrevs:
230
+ t = t.replace(i, i.replace(" ", ""))
231
+
226
232
  return t
227
233
 
228
234
  def __repr__(self):
@@ -21,8 +21,8 @@ import tempfile
21
21
  import pycountry
22
22
  import numpy as np
23
23
  import soundfile as sf
24
- from pydub import AudioSegment
25
- from pydub.effects import normalize
24
+ # from pydub import AudioSegment
25
+ # from pydub.effects import normalize
26
26
  import base64
27
27
  from tencentcloud.common.credential import Credential
28
28
  from tencentcloud.asr.v20190614.asr_client import AsrClient, models
@@ -30,9 +30,9 @@ from tencentcloud.asr.v20190614.asr_client import AsrClient, models
30
30
  import asyncio
31
31
  import tempfile
32
32
  import os
33
- from pydub import AudioSegment
34
- from pydub.effects import normalize
35
- from pydub.exceptions import CouldntDecodeError
33
+ # from pydub import AudioSegment
34
+ # from pydub.effects import normalize
35
+ # from pydub.exceptions import CouldntDecodeError
36
36
 
37
37
 
38
38
  import logging
@@ -77,66 +77,6 @@ class TencentEngine(BatchalignEngine):
77
77
  L.debug("Done.")
78
78
  else:
79
79
  self.__engine = None
80
-
81
- def __preprocess_audio(self, input_path):
82
- """Enhanced audio preprocessing for low-volume speech"""
83
- try:
84
- L.info(f"Optimizing audio for ASR: {input_path}")
85
-
86
- # read the audio file
87
- audio = AudioSegment.from_file(input_path)
88
-
89
- audio = audio.set_channels(1)
90
- audio = audio.set_frame_rate(16000)
91
-
92
-
93
- audio = audio.compress_dynamic_range(
94
- threshold=-40,
95
- ratio=3,
96
- attack=5,
97
- release=100
98
- )
99
- audio = audio.low_pass_filter(4000) # filter out high frequencies
100
- audio = audio.normalize(headroom=2) # keep the headroom
101
- audio = audio.compress_dynamic_range(
102
- threshold=-55,
103
- ratio=6,
104
- attack=15,
105
- release=200
106
- )
107
-
108
- # enhance low volume
109
- audio = audio.high_pass_filter(80)
110
- boosted = audio.high_pass_filter(1000).apply_gain(+4)
111
- audio = audio.overlay(boosted)
112
-
113
- if L.level <= logging.DEBUG:
114
- self.__print_audio_stats(audio)
115
-
116
- # output to a temporary file
117
- temp_fd, temp_path = tempfile.mkstemp(suffix=".mp3")
118
- os.close(temp_fd)
119
- audio.export(
120
- temp_path,
121
- format="mp3",
122
- codec="libmp3lame",
123
- bitrate="96k",
124
- tags={"title": "BA_Optimized"},
125
- parameters=[
126
- "-compression_level", "2",
127
- "-reservoir", "0",
128
- "-joint_stereo", "0"
129
- ]
130
- )
131
-
132
- return temp_path
133
-
134
- except CouldntDecodeError:
135
- L.error(f"Audio decoding failed: {input_path}")
136
- return input_path
137
- except Exception as e:
138
- L.error(f"Audio processing error: {str(e)}")
139
- return input_path
140
80
 
141
81
  def replace_cantonese_words(self, word):
142
82
  """Function to replace Cantonese words with custom replacements."""
@@ -176,13 +116,15 @@ class TencentEngine(BatchalignEngine):
176
116
  lang = self.__lang
177
117
  client = self.__client
178
118
 
179
- processed_path = self.__preprocess_audio(f)
180
- audio = AudioSegment.from_file(processed_path)
119
+ # processed_path = self.__preprocess_audio(f)
120
+ # audio = AudioSegment.from_file(processed_path)
181
121
 
182
122
  try:
183
123
  L.info(f"Uploading '{pathlib.Path(f).stem}'...")
184
- with open(processed_path, "rb") as audio_file:
185
- encoded_string = base64.b64encode(audio_file.read())
124
+ # we will send the file for processing
125
+ if not str(f).startswith("http"):
126
+ with open(f, "rb") as image_file:
127
+ encoded_string = base64.b64encode(image_file.read())
186
128
 
187
129
  req = models.CreateRecTaskRequest()
188
130
  if lang in {'zho', 'yue', 'wuu', 'nan','hak'}:
@@ -192,9 +134,12 @@ class TencentEngine(BatchalignEngine):
192
134
  req.ResTextFormat = 1
193
135
  req.SpeakerDiarization = 1
194
136
  req.ChannelNum = 1
195
- req.Data = encoded_string.decode('ascii')
196
- req.SourceType = 1
197
-
137
+ if not str(f).startswith("http"):
138
+ req.Data = encoded_string.decode('ascii')
139
+ req.SourceType = 1
140
+ else:
141
+ req.Url = f
142
+ req.SourceType = 0
198
143
  resp = client.CreateRecTask(req)
199
144
 
200
145
  L.info(f"Tencent is transcribing '{pathlib.Path(f).stem}'...")
@@ -711,6 +711,7 @@ def adlist_postprocessor(i, lang, adlist):
711
711
 
712
712
  ######
713
713
  def morphoanalyze(doc: Document, retokenize:bool, skipmultilang:bool, status_hook:callable = None, **kwargs):
714
+
714
715
  L.debug("Starting Stanza...")
715
716
  inputs = []
716
717
 
@@ -0,0 +1,182 @@
1
+ abbrev = [
2
+ "FBI",
3
+ "CIA",
4
+ "NSA",
5
+ "NATO",
6
+ "UN",
7
+ "WHO",
8
+ "NASA",
9
+ "CDC",
10
+ "IRS",
11
+ "EPA",
12
+ "HTTP",
13
+ "URL",
14
+ "HTML",
15
+ "CSS",
16
+ "API",
17
+ "IP",
18
+ "DNS",
19
+ "SQL",
20
+ "USB",
21
+ "VPN",
22
+ "ATT",
23
+ "AT&T",
24
+ "CEO",
25
+ "CFO",
26
+ "COO",
27
+ "IPO",
28
+ "ROI",
29
+ "GDP",
30
+ "LLC",
31
+ "HR",
32
+ "M&",
33
+ "KPI",
34
+ "GPA",
35
+ "SAT",
36
+ "ACT",
37
+ "MBA",
38
+ "PhD",
39
+ "BA",
40
+ "MA",
41
+ "STEM",
42
+ "ESL",
43
+ "GED",
44
+ "AWOL",
45
+ "MIA",
46
+ "POW",
47
+ "IED",
48
+ "UAV",
49
+ "RPG",
50
+ "NATO",
51
+ "SEAL",
52
+ "JAG",
53
+ "ROTC",
54
+ "CERN",
55
+ "GMO",
56
+ "H2O",
57
+ "CO2",
58
+ "UV",
59
+ "IR",
60
+ "AI",
61
+ "VR",
62
+ "AR",
63
+ "NPR",
64
+ "BBC",
65
+ "MTV",
66
+ "CNN",
67
+ "HBO",
68
+ "ESPN",
69
+ "TMZ",
70
+ "AMC",
71
+ "IMAX",
72
+ "WWE",
73
+ "ASAP",
74
+ "DIY",
75
+ "ETA",
76
+ "RSVP",
77
+ "FYI",
78
+ "LOL",
79
+ "BRB",
80
+ "IDK",
81
+ "BTW",
82
+ "TMI",
83
+ "PBJ",
84
+ "AIDS",
85
+ "HIV",
86
+ "ADHD",
87
+ "COPD",
88
+ "PTSD",
89
+ "CHF",
90
+ "CAD",
91
+ "TB",
92
+ "UTI",
93
+ "GERD",
94
+ "MRI",
95
+ "CT",
96
+ "ECG",
97
+ "EEG",
98
+ "CBC",
99
+ "BMP",
100
+ "ABG",
101
+ "PFT",
102
+ "FOBT",
103
+ "ENT",
104
+ "OB",
105
+ "PCP",
106
+ "ICU",
107
+ "NICU",
108
+ "ER",
109
+ "OR",
110
+ "PT",
111
+ "OT",
112
+ "EM",
113
+ "OTC",
114
+ "NSAID",
115
+ "IV",
116
+ "IM",
117
+ "SC",
118
+ "PRN",
119
+ "BID",
120
+ "TID",
121
+ "QID",
122
+ "NPO",
123
+ "CNS",
124
+ "PNS",
125
+ "GI",
126
+ "GU",
127
+ "CV",
128
+ "MSK",
129
+ "ENT",
130
+ "BMI",
131
+ "BMR",
132
+ "BP",
133
+ "WBC",
134
+ "RBC",
135
+ "HGB",
136
+ "HCT",
137
+ "PLT",
138
+ "ESR",
139
+ "CRP",
140
+ "LFT",
141
+ "TFT",
142
+ "INR",
143
+ "MMR",
144
+ "DPT",
145
+ "HPV",
146
+ "Tdap",
147
+ "BCG",
148
+ "IPV",
149
+ "HBV",
150
+ "HAV",
151
+ "HCV",
152
+ "RSV",
153
+ "SOAP",
154
+ "DNR",
155
+ "AMA",
156
+ "LOS",
157
+ "EHR",
158
+ "EMR",
159
+ "ICD",
160
+ "CPT",
161
+ "HIPAA",
162
+ "HR",
163
+ "RR",
164
+ "SpO2",
165
+ "MAP",
166
+ "GFR",
167
+ "A1C",
168
+ "LDL",
169
+ "HDL",
170
+ "TG",
171
+ "BUN",
172
+ "SIDS",
173
+ "DVT",
174
+ "PE",
175
+ "ARDS",
176
+ "SLE",
177
+ "RA",
178
+ "TIA",
179
+ "CVA",
180
+ "ALS",
181
+ "MS",
182
+ ]
@@ -0,0 +1,3 @@
1
+ 0.7.19-post.9
2
+ May 24th, 2025
3
+ reverts file only prep changes
@@ -1,3 +0,0 @@
1
- 0.7.19-post.7
2
- May 20th, 2025
3
- fixes for ASR