batchalign 0.7.3b7__tar.gz → 0.7.3b8__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- {batchalign-0.7.3b7/batchalign.egg-info → batchalign-0.7.3b8}/PKG-INFO +1 -1
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/document.py +5 -1
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/whisper/infer_asr.py +29 -6
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/asr/utils.py +10 -3
- batchalign-0.7.3b8/batchalign/version +3 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8/batchalign.egg-info}/PKG-INFO +1 -1
- batchalign-0.7.3b7/batchalign/version +0 -3
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/LICENSE +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/MANIFEST.in +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/README.md +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/__main__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/constants.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/errors.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/resolve.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/utterance/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/morphosyntax/ud.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign.egg-info/SOURCES.txt +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/setup.cfg +0 -0
- {batchalign-0.7.3b7 → batchalign-0.7.3b8}/setup.py +0 -0
@@ -198,7 +198,11 @@ class Utterance(BaseModel):
|
|
198
198
|
t = self._detokenize()
|
199
199
|
|
200
200
|
t = t.replace(". . .", "+...")
|
201
|
-
t =
|
201
|
+
t = t.replace("¿", "").replace("¡", "")
|
202
|
+
t = re.sub(r"^\+\.\.\.", "", t.strip()).strip()
|
203
|
+
t = re.sub(r"^\W+", "", t.strip()).strip()
|
204
|
+
t = re.sub(r",", " , ", t.strip()).strip()
|
205
|
+
t = re.sub(r" +", " ", t.strip()).strip()
|
202
206
|
return t
|
203
207
|
|
204
208
|
def __repr__(self):
|
@@ -3,6 +3,7 @@ from torchaudio import load
|
|
3
3
|
import numpy as np
|
4
4
|
import os
|
5
5
|
|
6
|
+
import re
|
6
7
|
from transformers import pipeline
|
7
8
|
|
8
9
|
from dataclasses import dataclass
|
@@ -189,16 +190,38 @@ class WhisperASRModel(object):
|
|
189
190
|
element = groups.pop(0)
|
190
191
|
|
191
192
|
if element["type"] == "text":
|
192
|
-
|
193
|
+
pl = element["payload"].strip()
|
194
|
+
before = re.findall(r"^\W+", pl)
|
195
|
+
after = re.findall(r"\W+$", pl)
|
196
|
+
texts = []
|
197
|
+
if len(before) > 0:
|
198
|
+
texts.append({
|
199
|
+
"type": "punct",
|
200
|
+
"ts": element["start"],
|
201
|
+
"end_ts": element["end"] if element["end"] else element["start"]+1,
|
202
|
+
"value": before[0],
|
203
|
+
})
|
204
|
+
pl = pl.strip(before[0])
|
205
|
+
if len(after) > 0:
|
206
|
+
pl = pl.strip(after[0])
|
207
|
+
texts.append({
|
193
208
|
"type": "text",
|
194
209
|
"ts": element["start"],
|
195
210
|
"end_ts": element["end"] if element["end"] else element["start"]+1,
|
196
|
-
"value":
|
197
|
-
}
|
211
|
+
"value": pl.strip(),
|
212
|
+
})
|
213
|
+
if len(after) > 0:
|
214
|
+
texts.append({
|
215
|
+
"type": "punct",
|
216
|
+
"ts": element["start"],
|
217
|
+
"end_ts": element["end"] if element["end"] else element["start"]+1,
|
218
|
+
"value": after[0],
|
219
|
+
})
|
198
220
|
|
199
|
-
|
200
|
-
|
201
|
-
|
221
|
+
for text in texts:
|
222
|
+
if text["ts"] != text["end_ts"] and text["value"].strip() != "…" and text["value"].strip() != "":
|
223
|
+
# text with no DTW time is likely a spurious retrace
|
224
|
+
current_turn.append(text)
|
202
225
|
elif element["type"] == "segment" and current_speaker != element["payload"]:
|
203
226
|
turns.append({
|
204
227
|
"elements": current_turn,
|
@@ -25,6 +25,7 @@ def retokenize(intermediate_output):
|
|
25
25
|
tmp = []
|
26
26
|
for word, bullet in utterance:
|
27
27
|
word = word.replace("。", ".")
|
28
|
+
word = word.replace("¿", " ").replace("¡", " ")
|
28
29
|
tmp.append((word, bullet))
|
29
30
|
if word in ENDING_PUNCT or word[-1] in ENDING_PUNCT:
|
30
31
|
if word in ENDING_PUNCT:
|
@@ -178,14 +179,20 @@ def process_generation(output, lang="eng", utterance_engine=None):
|
|
178
179
|
id=f"PAR{speaker}",
|
179
180
|
name=f"Participant")
|
180
181
|
words = []
|
181
|
-
for word, (start,end) in utterance:
|
182
|
-
if
|
182
|
+
for indx, (word, (start,end)) in enumerate(utterance):
|
183
|
+
if indx == 0:
|
184
|
+
seen_word = False
|
185
|
+
if word.strip() == "":
|
186
|
+
continue
|
187
|
+
if word not in ENDING_PUNCT+MOR_PUNCT:
|
183
188
|
if start == None or end == None:
|
184
189
|
words.append(Form(text=word, time=None))
|
185
190
|
else:
|
191
|
+
seen_word = True
|
186
192
|
words.append(Form(text=word, time=(int(start), int(end))))
|
187
193
|
else:
|
188
|
-
|
194
|
+
if seen_word:
|
195
|
+
words.append(Form(text=word, time=None))
|
189
196
|
|
190
197
|
final_utterances.append(Utterance(
|
191
198
|
tier=participant,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/cleanup/test_parse_support.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.3b7 → batchalign-0.7.3b8}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|