batchalign 0.7.3b6__tar.gz → 0.7.3b8__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. {batchalign-0.7.3b6/batchalign.egg-info → batchalign-0.7.3b8}/PKG-INFO +1 -1
  2. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/document.py +5 -0
  3. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/whisper/infer_asr.py +29 -6
  4. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/asr/utils.py +10 -3
  5. batchalign-0.7.3b8/batchalign/version +3 -0
  6. {batchalign-0.7.3b6 → batchalign-0.7.3b8/batchalign.egg-info}/PKG-INFO +1 -1
  7. batchalign-0.7.3b6/batchalign/version +0 -3
  8. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/LICENSE +0 -0
  9. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/MANIFEST.in +0 -0
  10. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/README.md +0 -0
  11. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/__init__.py +0 -0
  12. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/__main__.py +0 -0
  13. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/cli/__init__.py +0 -0
  14. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/cli/cli.py +0 -0
  15. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/cli/dispatch.py +0 -0
  16. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/constants.py +0 -0
  17. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/errors.py +0 -0
  18. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/__init__.py +0 -0
  19. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/base.py +0 -0
  20. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/chat/__init__.py +0 -0
  21. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/chat/file.py +0 -0
  22. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/chat/generator.py +0 -0
  23. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/chat/lexer.py +0 -0
  24. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/chat/parser.py +0 -0
  25. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/chat/utils.py +0 -0
  26. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/textgrid/__init__.py +0 -0
  27. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/textgrid/file.py +0 -0
  28. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/textgrid/generator.py +0 -0
  29. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/formats/textgrid/parser.py +0 -0
  30. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/__init__.py +0 -0
  31. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/resolve.py +0 -0
  32. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/speaker/__init__.py +0 -0
  33. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/speaker/config.yaml +0 -0
  34. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/speaker/infer.py +0 -0
  35. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/speaker/utils.py +0 -0
  36. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/training/__init__.py +0 -0
  37. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/training/run.py +0 -0
  38. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/training/utils.py +0 -0
  39. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/utils.py +0 -0
  40. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/utterance/__init__.py +0 -0
  41. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/utterance/dataset.py +0 -0
  42. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/utterance/execute.py +0 -0
  43. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/utterance/infer.py +0 -0
  44. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/utterance/prep.py +0 -0
  45. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/utterance/train.py +0 -0
  46. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/whisper/__init__.py +0 -0
  47. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/models/whisper/infer_fa.py +0 -0
  48. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/__init__.py +0 -0
  49. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/analysis/__init__.py +0 -0
  50. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/analysis/eval.py +0 -0
  51. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/asr/__init__.py +0 -0
  52. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/asr/rev.py +0 -0
  53. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/asr/whisper.py +0 -0
  54. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/asr/whisperx.py +0 -0
  55. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/base.py +0 -0
  56. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/__init__.py +0 -0
  57. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  58. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  59. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  60. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/retrace.py +0 -0
  61. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  62. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  63. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/test.test +0 -0
  64. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/dispatch.py +0 -0
  65. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/fa/__init__.py +0 -0
  66. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  67. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  68. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  69. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  70. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/pipeline.py +0 -0
  71. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/speaker/__init__.py +0 -0
  72. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  73. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/utr/__init__.py +0 -0
  74. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/utr/rev_utr.py +0 -0
  75. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/utr/utils.py +0 -0
  76. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  77. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/utterance/__init__.py +0 -0
  78. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  79. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/__init__.py +0 -0
  80. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/conftest.py +0 -0
  81. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  82. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  83. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  84. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  85. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  86. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  87. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  88. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  89. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  90. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  91. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  92. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  93. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/fixures.py +0 -0
  94. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  95. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  96. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/tests/test_document.py +0 -0
  97. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/utils/__init__.py +0 -0
  98. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/utils/config.py +0 -0
  99. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/utils/dp.py +0 -0
  100. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign/utils/utils.py +0 -0
  101. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign.egg-info/SOURCES.txt +0 -0
  102. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign.egg-info/dependency_links.txt +0 -0
  103. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign.egg-info/entry_points.txt +0 -0
  104. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign.egg-info/requires.txt +0 -0
  105. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/batchalign.egg-info/top_level.txt +0 -0
  106. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/setup.cfg +0 -0
  107. {batchalign-0.7.3b6 → batchalign-0.7.3b8}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.3b6
3
+ Version: 0.7.3b8
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -198,6 +198,11 @@ class Utterance(BaseModel):
198
198
  t = self._detokenize()
199
199
 
200
200
  t = t.replace(". . .", "+...")
201
+ t = t.replace("¿", "").replace("¡", "")
202
+ t = re.sub(r"^\+\.\.\.", "", t.strip()).strip()
203
+ t = re.sub(r"^\W+", "", t.strip()).strip()
204
+ t = re.sub(r",", " , ", t.strip()).strip()
205
+ t = re.sub(r" +", " ", t.strip()).strip()
201
206
  return t
202
207
 
203
208
  def __repr__(self):
@@ -3,6 +3,7 @@ from torchaudio import load
3
3
  import numpy as np
4
4
  import os
5
5
 
6
+ import re
6
7
  from transformers import pipeline
7
8
 
8
9
  from dataclasses import dataclass
@@ -189,16 +190,38 @@ class WhisperASRModel(object):
189
190
  element = groups.pop(0)
190
191
 
191
192
  if element["type"] == "text":
192
- text = {
193
+ pl = element["payload"].strip()
194
+ before = re.findall(r"^\W+", pl)
195
+ after = re.findall(r"\W+$", pl)
196
+ texts = []
197
+ if len(before) > 0:
198
+ texts.append({
199
+ "type": "punct",
200
+ "ts": element["start"],
201
+ "end_ts": element["end"] if element["end"] else element["start"]+1,
202
+ "value": before[0],
203
+ })
204
+ pl = pl.strip(before[0])
205
+ if len(after) > 0:
206
+ pl = pl.strip(after[0])
207
+ texts.append({
193
208
  "type": "text",
194
209
  "ts": element["start"],
195
210
  "end_ts": element["end"] if element["end"] else element["start"]+1,
196
- "value": element["payload"].strip(),
197
- }
211
+ "value": pl.strip(),
212
+ })
213
+ if len(after) > 0:
214
+ texts.append({
215
+ "type": "punct",
216
+ "ts": element["start"],
217
+ "end_ts": element["end"] if element["end"] else element["start"]+1,
218
+ "value": after[0],
219
+ })
198
220
 
199
- if text["ts"] != text["end_ts"] and text["value"].strip() != "…":
200
- # text with no DTW time is likely a spurious retrace
201
- current_turn.append(text)
221
+ for text in texts:
222
+ if text["ts"] != text["end_ts"] and text["value"].strip() != "…" and text["value"].strip() != "":
223
+ # text with no DTW time is likely a spurious retrace
224
+ current_turn.append(text)
202
225
  elif element["type"] == "segment" and current_speaker != element["payload"]:
203
226
  turns.append({
204
227
  "elements": current_turn,
@@ -25,6 +25,7 @@ def retokenize(intermediate_output):
25
25
  tmp = []
26
26
  for word, bullet in utterance:
27
27
  word = word.replace("。", ".")
28
+ word = word.replace("¿", " ").replace("¡", " ")
28
29
  tmp.append((word, bullet))
29
30
  if word in ENDING_PUNCT or word[-1] in ENDING_PUNCT:
30
31
  if word in ENDING_PUNCT:
@@ -178,14 +179,20 @@ def process_generation(output, lang="eng", utterance_engine=None):
178
179
  id=f"PAR{speaker}",
179
180
  name=f"Participant")
180
181
  words = []
181
- for word, (start,end) in utterance:
182
- if word not in ENDING_PUNCT:
182
+ for indx, (word, (start,end)) in enumerate(utterance):
183
+ if indx == 0:
184
+ seen_word = False
185
+ if word.strip() == "":
186
+ continue
187
+ if word not in ENDING_PUNCT+MOR_PUNCT:
183
188
  if start == None or end == None:
184
189
  words.append(Form(text=word, time=None))
185
190
  else:
191
+ seen_word = True
186
192
  words.append(Form(text=word, time=(int(start), int(end))))
187
193
  else:
188
- words.append(Form(text=word, time=None))
194
+ if seen_word:
195
+ words.append(Form(text=word, time=None))
189
196
 
190
197
  final_utterances.append(Utterance(
191
198
  tier=participant,
@@ -0,0 +1,3 @@
1
+ 0.7.3-beta.8
2
+ June 14, 2024
3
+ more asr fixes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.3b6
3
+ Version: 0.7.3b8
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.3-beta.6
2
- June 11, 2024
3
- hebrew unicode patches
File without changes
File without changes
File without changes
File without changes
File without changes