batchalign 0.7.16__tar.gz → 0.7.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. {batchalign-0.7.16/batchalign.egg-info → batchalign-0.7.17}/PKG-INFO +1 -1
  2. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/resolve.py +1 -1
  3. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utterance/cantonese_infer.py +37 -7
  4. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/whisper/infer_asr.py +17 -1
  5. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/asr/utils.py +16 -7
  6. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/ud.py +1 -1
  7. batchalign-0.7.17/batchalign/version +3 -0
  8. {batchalign-0.7.16 → batchalign-0.7.17/batchalign.egg-info}/PKG-INFO +1 -1
  9. batchalign-0.7.16/batchalign/version +0 -3
  10. {batchalign-0.7.16 → batchalign-0.7.17}/LICENSE +0 -0
  11. {batchalign-0.7.16 → batchalign-0.7.17}/MANIFEST.in +0 -0
  12. {batchalign-0.7.16 → batchalign-0.7.17}/README.md +0 -0
  13. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/__init__.py +0 -0
  14. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/__main__.py +0 -0
  15. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/cli/__init__.py +0 -0
  16. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/cli/cli.py +0 -0
  17. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/cli/dispatch.py +0 -0
  18. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/constants.py +0 -0
  19. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/document.py +0 -0
  20. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/errors.py +0 -0
  21. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/__init__.py +0 -0
  22. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/base.py +0 -0
  23. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/chat/__init__.py +0 -0
  24. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/chat/file.py +0 -0
  25. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/chat/generator.py +0 -0
  26. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/chat/lexer.py +0 -0
  27. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/chat/parser.py +0 -0
  28. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/chat/utils.py +0 -0
  29. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/textgrid/__init__.py +0 -0
  30. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/textgrid/file.py +0 -0
  31. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/textgrid/generator.py +0 -0
  32. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/formats/textgrid/parser.py +0 -0
  33. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/__init__.py +0 -0
  34. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/speaker/__init__.py +0 -0
  35. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/speaker/config.yaml +0 -0
  36. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/speaker/infer.py +0 -0
  37. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/speaker/utils.py +0 -0
  38. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/training/__init__.py +0 -0
  39. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/training/run.py +0 -0
  40. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/training/utils.py +0 -0
  41. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utils.py +0 -0
  42. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utterance/__init__.py +0 -0
  43. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utterance/dataset.py +0 -0
  44. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utterance/execute.py +0 -0
  45. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utterance/infer.py +0 -0
  46. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utterance/prep.py +0 -0
  47. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/utterance/train.py +0 -0
  48. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/wave2vec/__init__.py +0 -0
  49. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/wave2vec/infer_fa.py +0 -0
  50. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/whisper/__init__.py +0 -0
  51. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/models/whisper/infer_fa.py +0 -0
  52. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/__init__.py +0 -0
  53. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/analysis/__init__.py +0 -0
  54. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/analysis/eval.py +0 -0
  55. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/asr/__init__.py +0 -0
  56. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/asr/num2chinese.py +0 -0
  57. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/asr/rev.py +0 -0
  58. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/asr/whisper.py +0 -0
  59. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/asr/whisperx.py +0 -0
  60. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/base.py +0 -0
  61. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/__init__.py +0 -0
  62. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  63. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  64. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  65. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/retrace.py +0 -0
  66. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  67. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  68. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/cleanup/support/test.test +0 -0
  69. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/dispatch.py +0 -0
  70. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/fa/__init__.py +0 -0
  71. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  72. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  73. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  74. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  75. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  76. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  77. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  78. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  79. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  80. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/pipeline.py +0 -0
  81. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/speaker/__init__.py +0 -0
  82. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  83. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/translate/__init__.py +0 -0
  84. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/translate/seamless.py +0 -0
  85. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/utr/__init__.py +0 -0
  86. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/utr/rev_utr.py +0 -0
  87. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/utr/utils.py +0 -0
  88. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  89. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/utterance/__init__.py +0 -0
  90. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  91. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/__init__.py +0 -0
  92. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/conftest.py +0 -0
  93. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  94. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  95. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  96. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  97. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  98. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  99. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  100. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  101. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  102. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  103. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  104. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  105. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/fixures.py +0 -0
  106. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  107. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  108. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/tests/test_document.py +0 -0
  109. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/utils/__init__.py +0 -0
  110. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/utils/config.py +0 -0
  111. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/utils/dp.py +0 -0
  112. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign/utils/utils.py +0 -0
  113. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign.egg-info/SOURCES.txt +0 -0
  114. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign.egg-info/dependency_links.txt +0 -0
  115. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign.egg-info/entry_points.txt +0 -0
  116. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign.egg-info/requires.txt +0 -0
  117. {batchalign-0.7.16 → batchalign-0.7.17}/batchalign.egg-info/top_level.txt +0 -0
  118. {batchalign-0.7.16 → batchalign-0.7.17}/setup.cfg +0 -0
  119. {batchalign-0.7.16 → batchalign-0.7.17}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.16
3
+ Version: 0.7.17
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -12,7 +12,7 @@ resolver = {
12
12
  },
13
13
  "whisper": {
14
14
  'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
15
- # 'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),
15
+ 'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),
16
16
  }
17
17
  }
18
18
 
@@ -150,15 +150,45 @@ class BertCantoneseUtteranceModel(object):
150
150
  final_passage.append(self.tokenizer.convert_tokens_to_string(res_toks))
151
151
 
152
152
  # Step 4: Join processed chunks together into the final passage
153
- final_text = ' '.join(final_passage)
153
+ final_passage = ' '.join(final_passage)
154
154
 
155
155
  print("Text processing completed. Generating final output...")
156
-
156
+
157
157
  # Optionally, tokenize the final text into sentences based on punctuation
158
- try:
159
- split_passage = sent_tokenize(final_text)
160
- except LookupError:
161
- nltk.download('punkt')
162
- split_passage = sent_tokenize(final_text)
158
+ def custom_sent_tokenize(text):
159
+ # Regular expression to match sentence-ending punctuation marks (. ! ?)
160
+ sentence_endings = re.compile(r'([.!?])')
161
+ split_passage = []
162
+
163
+ # Split the passage based on punctuation marks and keep them
164
+ parts = re.split(sentence_endings, text)
165
+
166
+ # Debug: Output the parts after splitting
167
+ print(f"Parts after splitting: {parts}")
168
+
169
+ # Combine parts and punctuation together
170
+ for i in range(0, len(parts) - 1, 2):
171
+ sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
172
+ print(f"Sentence formed: {sentence}") # Debug: Output the current sentence
173
+
174
+ if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
175
+ split_passage.append(sentence)
176
+
177
+ # If the last part doesn't have punctuation, we handle it here
178
+ if len(parts) % 2 != 0: # If there's no punctuation at the end
179
+ last_part = parts[-1].strip()
180
+ print(f"Last part without punctuation: {last_part}") # Debug: Output the last part
181
+
182
+ if last_part: # Only add non-empty sentences
183
+ split_passage.append(last_part)
184
+
185
+ # Final output
186
+ print(f"Final split passage: {split_passage}")
187
+ return split_passage
188
+
189
+ split_passage = custom_sent_tokenize(final_passage)
190
+
191
+ # Debugging: Output the sentences after splitting
192
+ print(f"Final sentences: {split_passage}")
163
193
 
164
194
  return split_passage
@@ -68,6 +68,22 @@ class WhisperASRModel(object):
68
68
  self.__config = GenerationConfig.from_pretrained(base)
69
69
  self.__config.no_repeat_ngram_size = 4
70
70
 
71
+
72
+ if language == "Cantonese":
73
+ self.__config.no_repeat_ngram_size = 4
74
+ self.__config.no_timestamps_token_id = 50363
75
+ self.__config.alignment_heads = [
76
+ [5, 3],
77
+ [5, 9],
78
+ [8, 0],
79
+ [8, 4],
80
+ [8, 8],
81
+ [9, 0],
82
+ [9, 7],
83
+ [9, 9],
84
+ [10, 5]
85
+ ]
86
+
71
87
  self.pipe = pipeline(
72
88
  "automatic-speech-recognition",
73
89
  model=model,
@@ -159,7 +175,7 @@ class WhisperASRModel(object):
159
175
  if self.lang == "Cantonese":
160
176
  config = {
161
177
  "repetition_penalty": 1.001,
162
- # "generation_config": self.__config,
178
+ "generation_config": self.__config,
163
179
  # "task": "transcribe",
164
180
  # "language": self.lang
165
181
  }
@@ -71,7 +71,7 @@ def retokenize_with_engine(intermediate_output, engine):
71
71
  final_outputs = []
72
72
 
73
73
  for speaker, utterance in intermediate_output:
74
- # becasue we are using an utterance engine, we need
74
+ # because we are using an utterance engine, we need
75
75
  # to get rid of all the preexisting punctuation
76
76
  for i in utterance:
77
77
  for j in MOR_PUNCT+ENDING_PUNCT:
@@ -84,8 +84,12 @@ def retokenize_with_engine(intermediate_output, engine):
84
84
  joined = joined.replace("。", ".")
85
85
  split = engine(joined)
86
86
 
87
+ # Initialize current index to track position in original utterance
88
+ current_index = 0
89
+
87
90
  # align the utterance against original splits and generate final outputs
88
91
  for i in split:
92
+ # Check if the split has ending punctuation
89
93
  if i[-1] in ENDING_PUNCT:
90
94
  new_ut, delim = (i[:-1].split(" "), i[-1])
91
95
  else:
@@ -94,12 +98,18 @@ def retokenize_with_engine(intermediate_output, engine):
94
98
  tmp = []
95
99
 
96
100
  for s in new_ut:
97
- try:
98
- tmp.append((s, utterance.pop(0)[1]))
99
- except IndexError:
100
- continue
101
+ if current_index < len(utterance):
102
+ # Use current element and move index forward
103
+ tmp.append((s, utterance[current_index][1]))
104
+ current_index += 1
105
+ else:
106
+ # Append with default timestamp if utterance is exhausted
107
+ tmp.append((s, [None, None]))
108
+
109
+ if current_index >= len(utterance):
110
+ tmp.append((delim, [None, None])) # Append the punctuation
101
111
 
102
- final_outputs.append((speaker, tmp+[[delim, [None, None]]]))
112
+ final_outputs.append((speaker, tmp))
103
113
 
104
114
  return final_outputs
105
115
 
@@ -220,7 +230,6 @@ def process_generation(output, lang="eng", utterance_engine=None):
220
230
  seen_word = True
221
231
  words.append(Form(text=word, time=(int(start), int(end))))
222
232
  else:
223
- if seen_word:
224
233
  words.append(Form(text=word, time=None))
225
234
 
226
235
  final_utterances.append(Utterance(
@@ -143,7 +143,7 @@ def handler(word, lang=None):
143
143
  pos,target = verbform(pos,target,word.text)
144
144
  target = target.replace(',', 'cm')
145
145
 
146
-
146
+ target = re.sub(r'@\w$', '', target)
147
147
  return f"{'' if not unknown else '0'}{pos}|{target}"
148
148
 
149
149
  # POS specific handler
@@ -0,0 +1,3 @@
1
+ 0.7.17
2
+ March 4th, 2025
3
+ Cantonese patches + removing special mor line annotations
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.16
3
+ Version: 0.7.17
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.16
2
- Feburary 27th, 2025
3
- More Hebrew features
File without changes
File without changes
File without changes
File without changes
File without changes