BatchalignHK 0.7.17.post14__tar.gz → 0.7.17.post16__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/BatchalignHK.egg-info/PKG-INFO +1 -1
  2. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/PKG-INFO +1 -1
  3. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/cli/cli.py +3 -0
  4. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/cli/dispatch.py +14 -3
  5. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/asr/tencent.py +12 -7
  6. batchalignhk-0.7.17.post16/batchalign/version +3 -0
  7. batchalignhk-0.7.17.post14/batchalign/version +0 -3
  8. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/BatchalignHK.egg-info/SOURCES.txt +0 -0
  9. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  10. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/BatchalignHK.egg-info/entry_points.txt +0 -0
  11. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/BatchalignHK.egg-info/requires.txt +0 -0
  12. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/BatchalignHK.egg-info/top_level.txt +0 -0
  13. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/LICENSE +0 -0
  14. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/MANIFEST.in +0 -0
  15. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/README.md +0 -0
  16. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/__init__.py +0 -0
  17. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/__main__.py +0 -0
  18. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/cli/__init__.py +0 -0
  19. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/constants.py +0 -0
  20. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/document.py +0 -0
  21. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/errors.py +0 -0
  22. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/__init__.py +0 -0
  23. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/base.py +0 -0
  24. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/chat/__init__.py +0 -0
  25. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/chat/file.py +0 -0
  26. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/chat/generator.py +0 -0
  27. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/chat/lexer.py +0 -0
  28. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/chat/parser.py +0 -0
  29. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/chat/utils.py +0 -0
  30. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/textgrid/__init__.py +0 -0
  31. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/textgrid/file.py +0 -0
  32. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/textgrid/generator.py +0 -0
  33. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/formats/textgrid/parser.py +0 -0
  34. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/__init__.py +0 -0
  35. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/resolve.py +0 -0
  36. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/speaker/__init__.py +0 -0
  37. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/speaker/config.yaml +0 -0
  38. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/speaker/infer.py +0 -0
  39. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/speaker/utils.py +0 -0
  40. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/training/__init__.py +0 -0
  41. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/training/run.py +0 -0
  42. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/training/utils.py +0 -0
  43. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utils.py +0 -0
  44. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utterance/__init__.py +0 -0
  45. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utterance/cantonese_infer.py +0 -0
  46. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utterance/dataset.py +0 -0
  47. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utterance/execute.py +0 -0
  48. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utterance/infer.py +0 -0
  49. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utterance/prep.py +0 -0
  50. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/utterance/train.py +0 -0
  51. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/wave2vec/__init__.py +0 -0
  52. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/wave2vec/infer_fa.py +0 -0
  53. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/whisper/__init__.py +0 -0
  54. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/whisper/infer_asr.py +0 -0
  55. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/models/whisper/infer_fa.py +0 -0
  56. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/__init__.py +0 -0
  57. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/analysis/__init__.py +0 -0
  58. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/analysis/eval.py +0 -0
  59. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/asr/__init__.py +0 -0
  60. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/asr/num2chinese.py +0 -0
  61. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/asr/rev.py +0 -0
  62. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/asr/utils.py +0 -0
  63. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/asr/whisper.py +0 -0
  64. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/asr/whisperx.py +0 -0
  65. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/base.py +0 -0
  66. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/__init__.py +0 -0
  67. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  68. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  69. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  70. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/retrace.py +0 -0
  71. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  72. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  73. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/cleanup/support/test.test +0 -0
  74. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/dispatch.py +0 -0
  75. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/fa/__init__.py +0 -0
  76. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  77. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  78. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  79. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  80. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  81. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  82. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  83. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  84. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  85. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  86. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/pipeline.py +0 -0
  87. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/speaker/__init__.py +0 -0
  88. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  89. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/translate/__init__.py +0 -0
  90. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/translate/seamless.py +0 -0
  91. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/utr/__init__.py +0 -0
  92. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/utr/rev_utr.py +0 -0
  93. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/utr/utils.py +0 -0
  94. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  95. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/utterance/__init__.py +0 -0
  96. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  97. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/__init__.py +0 -0
  98. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/conftest.py +0 -0
  99. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  100. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  101. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  102. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  103. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  104. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  105. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  106. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  107. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  108. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  109. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  110. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  111. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/fixures.py +0 -0
  112. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  113. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  114. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/tests/test_document.py +0 -0
  115. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/utils/__init__.py +0 -0
  116. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/utils/config.py +0 -0
  117. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/utils/dp.py +0 -0
  118. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/batchalign/utils/utils.py +0 -0
  119. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/setup.cfg +0 -0
  120. {batchalignhk-0.7.17.post14 → batchalignhk-0.7.17.post16}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.17.post14
3
+ Version: 0.7.17.post16
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.17.post14
3
+ Version: 0.7.17.post16
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -154,6 +154,9 @@ def align(ctx, in_dir, out_dir, whisper, wav2vec, **kwargs):
154
154
  default=False, help="Perform speaker diarization (this flag is ignored with Rev.AI)")
155
155
  @click.option("--wor/--nowor",
156
156
  default=False, help="Should we write word level alignment line? Default to no.")
157
+ @click.option("--data",
158
+ help="the URL of the data",
159
+ type=str)
157
160
  @click.option("--lang",
158
161
  help="sample language in three-letter ISO 3166-1 alpha-3 code",
159
162
  show_default=True,
@@ -5,6 +5,7 @@ and actual BatchalignPipeline.
5
5
  """
6
6
 
7
7
  from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn, BarColumn
8
+ from urllib.parse import urlparse
8
9
 
9
10
  import warnings
10
11
 
@@ -63,6 +64,15 @@ def _dispatch(command, lang, num_speakers,
63
64
  files = []
64
65
  outputs = []
65
66
 
67
+ if kwargs.get("data"):
68
+ url = kwargs.get("data")
69
+ url = urlparse(url)
70
+ if url.scheme == "":
71
+ url = url._replace(scheme="http")
72
+ base = os.path.basename(url.path)
73
+ files.append(url)
74
+ outputs.append(os.path.join(out_dir, base))
75
+
66
76
  for basedir, _, fs in os.walk(in_dir):
67
77
  for f in fs:
68
78
  path = Path(os.path.join(basedir, f))
@@ -128,7 +138,8 @@ def _dispatch(command, lang, num_speakers,
128
138
  errors = []
129
139
  # create the spinner bars
130
140
  for f in files:
131
- tasks[f] = prog.add_task(Path(f).name, start=False, processor="")
141
+ tasks[f] = prog.add_task(Path(f).name if isinstance(f, str) else Path(f.geturl()).name,
142
+ start=False, processor="")
132
143
 
133
144
  # create pipeline and read files
134
145
  baL.debug("Attempting to create BatchalignPipeline for CLI...")
@@ -152,7 +163,7 @@ def _dispatch(command, lang, num_speakers,
152
163
  prog.start_task(tasks[file])
153
164
  with warnings.catch_warnings(record=True) as w:
154
165
  # parse the input format, as needed
155
- doc = loader(os.path.abspath(file))
166
+ doc = loader(os.path.abspath(file) if isinstance(file, str) else file.geturl())
156
167
  # if we ended up with a tuple of length two,
157
168
  # that means that the loader requested kwargs
158
169
  kw = {}
@@ -179,7 +190,7 @@ def _dispatch(command, lang, num_speakers,
179
190
  if len(errors) > 0:
180
191
  C.print()
181
192
  for file, trcbk, e in errors:
182
- C.print(f"[bold red]ERROR[/bold red] on file [italic]{os.path.relpath(str(Path(file).absolute()), in_dir)}[/italic]: {escape(str(e))}\n")
193
+ C.print(f"[bold red]ERROR[/bold red] on file [italic]{os.path.relpath(str(Path(file).absolute()), in_dir) if isinstance(file, str) else file.geturl()}[/italic]: {escape(str(e))}\n")
183
194
  if ctx.obj["verbose"] == 1:
184
195
  C.print(escape(str(trcbk)))
185
196
  elif ctx.obj["verbose"] > 1:
@@ -74,16 +74,21 @@ class TencentEngine(BatchalignEngine):
74
74
 
75
75
  L.info(f"Uploading '{pathlib.Path(f).stem}'...")
76
76
  # we will send the file for processing
77
- with open(f, "rb") as image_file:
78
- encoded_string = base64.b64encode(image_file.read())
77
+ if not str(f).startswith("http"):
78
+ with open(f, "rb") as image_file:
79
+ encoded_string = base64.b64encode(image_file.read())
79
80
 
80
81
  req = models.CreateRecTaskRequest()
81
82
  req.EngineModelType = f"16k_{lang}"
82
83
  req.ResTextFormat = 1
83
- req.SourceType = 1
84
84
  req.SpeakerDiarization = 1
85
85
  req.ChannelNum = 1
86
- req.Data = encoded_string.decode('ascii')
86
+ if not str(f).startswith("http"):
87
+ req.Data = encoded_string.decode('ascii')
88
+ req.SourceType = 1
89
+ else:
90
+ req.Url = f
91
+ req.SourceType = 0
87
92
  resp = client.CreateRecTask(req)
88
93
 
89
94
  L.info(f"Tencent is transcribing '{pathlib.Path(f).stem}'...")
@@ -96,7 +101,7 @@ class TencentEngine(BatchalignEngine):
96
101
  res = client.DescribeTaskStatus(req)
97
102
 
98
103
  # if failed, raise
99
- if res.Data.Status == "3":
104
+ if res.Data.Status == "3" or res.Data.Status == 3:
100
105
  raise RuntimeError(f"Tencent reports job failed! error='{res.Data.ErrorMsg}'")
101
106
 
102
107
  turns = []
@@ -106,8 +111,8 @@ class TencentEngine(BatchalignEngine):
106
111
  for j in i.Words:
107
112
  turn.append({
108
113
  "type": "text",
109
- "ts": j.OffsetStartMs+start,
110
- "end_ts": j.OffsetEndMs+start,
114
+ "ts": (j.OffsetStartMs+start)/1000,
115
+ "end_ts": (j.OffsetEndMs+start)/1000,
111
116
  "value": j.Word
112
117
  })
113
118
  turns.append({
@@ -0,0 +1,3 @@
1
+ 0.7.17-post.16
2
+ March 26th, 2025
3
+ better tencent ASR
@@ -1,3 +0,0 @@
1
- 0.7.17-post.14
2
- March 26th, 2025
3
- better coref model