batchalign 0.7.13.post1__tar.gz → 0.7.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalign-0.7.13.post1/batchalign.egg-info → batchalign-0.7.15}/PKG-INFO +4 -1
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/cli.py +22 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/dispatch.py +1 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/document.py +4 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/generator.py +2 -1
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/parser.py +5 -1
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/__init__.py +1 -1
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/resolve.py +1 -1
- batchalign-0.7.15/batchalign/models/utterance/__init__.py +4 -0
- batchalign-0.7.15/batchalign/models/utterance/cantonese_infer.py +164 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/whisper/infer_asr.py +1 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/__init__.py +1 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/rev.py +6 -2
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/utils.py +5 -2
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/whisper.py +6 -2
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/dispatch.py +4 -1
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/wave2vec_fa.py +2 -2
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/whisper_fa.py +2 -2
- batchalign-0.7.15/batchalign/pipelines/translate/__init__.py +1 -0
- batchalign-0.7.15/batchalign/pipelines/translate/seamless.py +53 -0
- batchalign-0.7.15/batchalign/version +3 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15/batchalign.egg-info}/PKG-INFO +4 -1
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/SOURCES.txt +3 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/requires.txt +3 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/setup.py +3 -0
- batchalign-0.7.13.post1/batchalign/models/utterance/__init__.py +0 -2
- batchalign-0.7.13.post1/batchalign/version +0 -3
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/LICENSE +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/MANIFEST.in +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/README.md +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/__main__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/constants.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/errors.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/wave2vec/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/wave2vec/infer_fa.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ud.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.13.post1 → batchalign-0.7.15}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.13.post1
|
3
|
+
Version: 0.7.15
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -33,6 +33,9 @@ Requires-Dist: soundfile~=0.12.0
|
|
33
33
|
Requires-Dist: rich-click>=1.7.0
|
34
34
|
Requires-Dist: typing-extensions
|
35
35
|
Requires-Dist: num2words
|
36
|
+
Requires-Dist: tiktoken
|
37
|
+
Requires-Dist: blobfile
|
38
|
+
Requires-Dist: sentencepiece
|
36
39
|
Provides-Extra: dev
|
37
40
|
Requires-Dist: pytest; extra == "dev"
|
38
41
|
Provides-Extra: train
|
@@ -196,6 +196,28 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
196
196
|
loader, writer, C,
|
197
197
|
asr=asr, **kwargs)
|
198
198
|
|
199
|
+
#################### TRANSLATE ################################
|
200
|
+
|
201
|
+
@batchalign.command()
@common_options
@click.pass_context
def translate(ctx, in_dir, out_dir, **kwargs):
    """Translate the transcript to English."""

    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept verbatim.

    def loader(file):
        # Parse the CHAT file at `file` (resolved to an absolute path) and
        # hand back the Document it wraps.
        return CHATFile(path=os.path.abspath(file), special_mor_=True).doc

    def writer(doc, output):
        # Serialize the processed Document back out as a CHAT file.
        chat_file = CHATFile(doc=doc)
        chat_file.write(output)

    # Run the "translate" task over every .cha file found in in_dir.
    _dispatch(
        "translate", "eng", 1, ["cha"], ctx,
        in_dir, out_dir,
        loader, writer, C,
    )
|
220
|
+
|
199
221
|
#################### MORPHOTAG ################################
|
200
222
|
|
201
223
|
@batchalign.command()
|
@@ -31,6 +31,7 @@ class Task(IntEnum):
|
|
31
31
|
MORPHOSYNTAX = 11
|
32
32
|
COREF = 12
|
33
33
|
WER = 13
|
34
|
+
TRANSLATE = 14
|
34
35
|
|
35
36
|
|
36
37
|
DEBUG__G = 0
|
@@ -54,6 +55,7 @@ TypeMap = {
|
|
54
55
|
Task.DISFLUENCY_ANALYSIS: TaskType.PROCESSING,
|
55
56
|
Task.COREF: TaskType.PROCESSING,
|
56
57
|
Task.WER: TaskType.ANALYSIS,
|
58
|
+
Task.TRANSLATE: TaskType.PROCESSING,
|
57
59
|
|
58
60
|
Task.DEBUG__G: TaskType.GENERATION,
|
59
61
|
Task.DEBUG__P: TaskType.PROCESSING,
|
@@ -73,6 +75,7 @@ TaskFriendlyName = {
|
|
73
75
|
Task.DISFLUENCY_ANALYSIS: "Disfluncy Analysis",
|
74
76
|
Task.COREF: "Coreference Resolution",
|
75
77
|
Task.WER: "Word Error Rate",
|
78
|
+
Task.TRANSLATE: "Translation",
|
76
79
|
Task.DEBUG__G: "TEST_GENERATION",
|
77
80
|
Task.DEBUG__P: "TEST_PROCESSING",
|
78
81
|
Task.DEBUG__A: "TEST_ANALYSIS",
|
@@ -150,6 +153,7 @@ class Utterance(BaseModel):
|
|
150
153
|
tier: Tier = Field(default=Tier())
|
151
154
|
content: Sentence
|
152
155
|
text: Optional[str] = Field(default=None)
|
156
|
+
translation: Optional[str] = Field(default=None)
|
153
157
|
time: Optional[Tuple[int,int]] = Field(default=None)
|
154
158
|
custom_dependencies: List[CustomLine] = Field(default=[])
|
155
159
|
|
@@ -95,7 +95,8 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
|
|
95
95
|
result.append("%wor:\t"+" ".join(wor_elems))
|
96
96
|
if has_coref:
|
97
97
|
result.append("%coref:\t"+" ".join(coref_elems))
|
98
|
-
|
98
|
+
if utterance.translation != None:
|
99
|
+
result.append("%xtra:\t"+utterance.translation)
|
99
100
|
|
100
101
|
|
101
102
|
#### EXTRA LINE GENERATION ####
|
@@ -280,6 +280,7 @@ def chat_parse_doc(lines, special_mor=False):
|
|
280
280
|
mor = None
|
281
281
|
gra = None
|
282
282
|
wor = None
|
283
|
+
translation = None
|
283
284
|
additional = []
|
284
285
|
|
285
286
|
while raw[0][0] == "%":
|
@@ -291,6 +292,8 @@ def chat_parse_doc(lines, special_mor=False):
|
|
291
292
|
gra = line
|
292
293
|
elif beg.strip() == "wor" or beg.strip() == "xwor":
|
293
294
|
wor = line
|
295
|
+
elif beg.strip() == "xtra":
|
296
|
+
translation = line
|
294
297
|
else:
|
295
298
|
additional.append(CustomLine(id=beg.strip(),
|
296
299
|
type=CustomLineType.DEPENDENT,
|
@@ -309,7 +312,8 @@ def chat_parse_doc(lines, special_mor=False):
|
|
309
312
|
"content": parsed,
|
310
313
|
"text": text,
|
311
314
|
"delim": delim,
|
312
|
-
"custom_dependencies": additional
|
315
|
+
"custom_dependencies": additional,
|
316
|
+
"translation": translation
|
313
317
|
})
|
314
318
|
|
315
319
|
timing = re.findall(rf"\x15(\d+)_(\d+)\x15", text)
|
@@ -8,7 +8,7 @@ resolver = {
|
|
8
8
|
"utterance": {
|
9
9
|
'eng': "talkbank/CHATUtterance-en",
|
10
10
|
"zho": "talkbank/CHATUtterance-zh_CN",
|
11
|
-
"yue": "
|
11
|
+
"yue": "PolyU-AngelChanLab/Cantonese-Utterance-Segmentation",
|
12
12
|
},
|
13
13
|
"whisper": {
|
14
14
|
'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
|
@@ -0,0 +1,164 @@
|
|
1
|
+
import re
|
2
|
+
import string
|
3
|
+
import random
|
4
|
+
|
5
|
+
# tokenization utilities
|
6
|
+
import nltk
|
7
|
+
from nltk import word_tokenize, sent_tokenize
|
8
|
+
|
9
|
+
# torch
|
10
|
+
import torch
|
11
|
+
from torch.utils.data import dataset
|
12
|
+
from torch.utils.data.dataloader import DataLoader
|
13
|
+
from torch.optim import AdamW
|
14
|
+
|
15
|
+
# import huggingface utils
|
16
|
+
from transformers import AutoTokenizer, BertForTokenClassification
|
17
|
+
from transformers import DataCollatorForTokenClassification
|
18
|
+
|
19
|
+
# tqdm
|
20
|
+
from tqdm import tqdm
|
21
|
+
|
22
|
+
# seed device and tokens
|
23
|
+
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
24
|
+
|
25
|
+
# seed model
|
26
|
+
class BertCantoneseUtteranceModel(object):
    """Utterance segmentation for Cantonese text using a BERT token classifier.

    Calling an instance with a raw passage:

    1. lower-cases it and strips punctuation,
    2. splits it into chunks that each end on a sentence-final particle,
    3. runs every chunk through the token classifier to restore punctuation,
    4. joins the results and sentence-tokenizes them with NLTK.

    Parameters
    ----------
    model : str
        Name or path passed to ``AutoTokenizer.from_pretrained`` and
        ``BertForTokenClassification.from_pretrained``.
    """

    # Particles used as chunk boundaries. Order matters: when two keywords
    # match at the same position, the one listed first wins.
    _KEYWORDS = ('呀', '啦', '喎', '嘞', '㗎喇', '囉', '㗎', '啊', '嗯')

    # Characters removed before inference. The ASCII set mirrors the original
    # chain of .replace() calls; that chain contained exact duplicates
    # ('!', ',', '?'), which suggests full-width variants were intended, so the
    # full-width forms are stripped as well.
    # NOTE(review): 'l' is kept from the original, but stripping the letter "l"
    # mangles any Latin-script words -- confirm whether '|' (or another
    # character) was meant.
    _STRIP_CHARS = (
        ".,!?():*l"
        "\u3002"                                # ideographic full stop
        "\uff01\uff0c\uff1f\uff08\uff09\uff1a"  # full-width ! , ? ( ) :
    )
    _STRIP_TABLE = str.maketrans("", "", _STRIP_CHARS)

    # Classifier label -> punctuation appended to the character.
    _SUFFIX_FOR_ACTION = {2: '.', 3: '?', 4: '!', 5: ','}

    def __init__(self, model):
        # seed tokenizers and model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
        self.max_length = 512
        self.overlap = 20

        # eval mode
        self.model.eval()
        print(f"Model and tokenizer initialized on device: {DEVICE}")
        print(f"Max length set to {self.max_length} with overlap of {self.overlap}")

    @classmethod
    def _clean(cls, passage):
        """Lower-case `passage` and drop the punctuation in `_STRIP_CHARS`."""
        return passage.lower().translate(cls._STRIP_TABLE)

    @staticmethod
    def _split_on_keywords(passage, keywords):
        """Cut `passage` into chunks, each ending right after the next keyword.

        Any text after the last keyword becomes the final chunk. Returns an
        empty list for an empty passage.
        """
        chunks = []
        start = 0
        while start < len(passage):
            first_kw, first_pos = None, -1
            for kw in keywords:
                pos = passage.find(kw, start)
                # strict '<' keeps the earliest-listed keyword on ties
                if pos != -1 and (first_kw is None or pos < first_pos):
                    first_kw, first_pos = kw, pos

            if first_kw is None:
                # no more keywords: the rest of the passage is the last chunk
                chunks.append(passage[start:])
                break

            end = first_pos + len(first_kw)
            chunks.append(passage[start:end])
            start = end
        return chunks

    def _windows(self, chunk):
        """Split `chunk` into pieces short enough to avoid tokenizer truncation.

        Each character is passed to the tokenizer as its own word, so pieces are
        capped at ``max_length - 2`` characters. This assumes the tokenizer adds
        two special tokens and emits one token per character -- TODO confirm.
        Without this, text past the limit was silently dropped.
        """
        size = max(1, self.max_length - 2)
        return [chunk[i:i + size] for i in range(0, len(chunk), size)]

    def _punctuate(self, piece):
        """Restore punctuation in one piece of text.

        Returns the re-punctuated string, or ``None`` if the model forward
        pass raised (the error is printed).
        """
        # split by characters (Chinese tokenization)
        chars = list(piece)

        tokd = self.tokenizer.batch_encode_plus([chars],
                                                return_tensors='pt',
                                                truncation=True,
                                                padding=True,
                                                max_length=self.max_length,
                                                is_split_into_words=True).to(DEVICE)

        try:
            # inference only: do not record an autograd graph
            with torch.no_grad():
                logits = self.model(**tokd).logits
        except Exception as e:
            print(f"Error during model inference: {e}")
            return None

        # argmax for classification
        classified_targets = torch.argmax(logits, dim=2).cpu()

        res_toks = []
        prev_word_idx = None
        wids = tokd.word_ids(0)
        for indx, elem in enumerate(wids):
            # skip special tokens and continuation sub-tokens of the same word
            if elem is None or elem == prev_word_idx:
                continue
            prev_word_idx = elem

            action = int(classified_targets[0][indx])
            w = chars[elem]

            # "one word hanging" fix: if the next token also carries an action,
            # suppress the action on this one
            will_action = bool(indx < len(wids) - 2
                               and classified_targets[0][indx + 1] > 0)

            if not will_action:
                if action == 1:  # first capital letter
                    w = w[0].upper() + w[1:]
                elif action in self._SUFFIX_FOR_ACTION:  # . ? ! ,
                    w = w + self._SUFFIX_FOR_ACTION[action]

            res_toks.append(w)

        return self.tokenizer.convert_tokens_to_string(res_toks)

    def __call__(self, passage):
        """Segment `passage` into utterances.

        Returns a list of sentence strings, or an empty list if model
        inference fails on any chunk.
        """
        # Step 1: clean up passage
        passage = self._clean(passage)

        # Step 2: split the passage on sentence-final particles
        chunks = self._split_on_keywords(passage, self._KEYWORDS)

        print(f"Created {len(chunks)} chunks based on keywords.")
        for i, chunk in enumerate(chunks):
            print(f"Chunk {i + 1}: {chunk[:100]}...")  # first 100 characters only

        # Step 3: process each chunk and restore punctuation
        final_passage = []
        for chunk_index, chunk in enumerate(chunks):
            print(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")

            for piece in self._windows(chunk):
                restored = self._punctuate(piece)
                if restored is None:
                    # preserve the original best-effort contract on failure
                    return []
                final_passage.append(restored)

        # Step 4: join processed chunks together into the final passage
        final_text = ' '.join(final_passage)

        print("Text processing completed. Generating final output...")

        # tokenize the final text into sentences based on punctuation,
        # fetching the punkt data on first use if it is missing
        try:
            split_passage = sent_tokenize(final_text)
        except LookupError:
            nltk.download('punkt')
            split_passage = sent_tokenize(final_text)

        return split_passage
|
@@ -33,6 +33,7 @@ import pycountry
|
|
33
33
|
import logging
|
34
34
|
L = logging.getLogger("batchalign")
|
35
35
|
|
36
|
+
# DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
36
37
|
# DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
37
38
|
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
|
38
39
|
# PYTORCH_ENABLE_MPS_FALLBACK=1
|
@@ -10,7 +10,7 @@ from batchalign.utils.config import config_read
|
|
10
10
|
|
11
11
|
from batchalign.errors import *
|
12
12
|
|
13
|
-
from batchalign.models import BertUtteranceModel, resolve
|
13
|
+
from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve
|
14
14
|
|
15
15
|
import time
|
16
16
|
import pathlib
|
@@ -49,7 +49,11 @@ class RevEngine(BatchalignEngine):
|
|
49
49
|
self.__client = apiclient.RevAiAPIClient(key)
|
50
50
|
if resolve("utterance", lang) != None:
|
51
51
|
L.debug("Initializing utterance model...")
|
52
|
-
|
52
|
+
if lang != "yue":
|
53
|
+
self.__engine = BertUtteranceModel(resolve("utterance", lang))
|
54
|
+
else:
|
55
|
+
# we have special inference procedure for cantonese
|
56
|
+
self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
|
53
57
|
L.debug("Done.")
|
54
58
|
else:
|
55
59
|
self.__engine = None
|
@@ -94,7 +94,10 @@ def retokenize_with_engine(intermediate_output, engine):
|
|
94
94
|
tmp = []
|
95
95
|
|
96
96
|
for s in new_ut:
|
97
|
-
|
97
|
+
try:
|
98
|
+
tmp.append((s, utterance.pop(0)[1]))
|
99
|
+
except IndexError:
|
100
|
+
continue
|
98
101
|
|
99
102
|
final_outputs.append((speaker, tmp+[[delim, [None, None]]]))
|
100
103
|
|
@@ -159,7 +162,7 @@ def process_generation(output, lang="eng", utterance_engine=None):
|
|
159
162
|
final_words.append([part.strip(), [cur, cur+div]])
|
160
163
|
cur += div
|
161
164
|
|
162
|
-
lang_2 = pycountry.languages.get(alpha_3=lang).alpha_2
|
165
|
+
lang_2 = "yue" if lang == "yue" else pycountry.languages.get(alpha_3=lang).alpha_2
|
163
166
|
def catched_num2words(i):
|
164
167
|
if not i.isdigit():
|
165
168
|
return i
|
@@ -1,7 +1,7 @@
|
|
1
1
|
from batchalign.document import *
|
2
2
|
from batchalign.pipelines.base import *
|
3
3
|
from batchalign.pipelines.asr.utils import *
|
4
|
-
from batchalign.models import WhisperASRModel, BertUtteranceModel
|
4
|
+
from batchalign.models import WhisperASRModel, BertUtteranceModel, BertCantoneseUtteranceModel
|
5
5
|
|
6
6
|
import pycountry
|
7
7
|
|
@@ -44,7 +44,11 @@ class WhisperEngine(BatchalignEngine):
|
|
44
44
|
|
45
45
|
if resolve("utterance", self.__lang) != None:
|
46
46
|
L.debug("Initializing utterance model...")
|
47
|
-
|
47
|
+
if lang != "yue":
|
48
|
+
self.__engine = BertUtteranceModel(resolve("utterance", lang))
|
49
|
+
else:
|
50
|
+
# we have special inference procedure for cantonese
|
51
|
+
self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
|
48
52
|
L.debug("Done.")
|
49
53
|
else:
|
50
54
|
self.__engine = None
|
@@ -6,7 +6,7 @@ Tabulate default packages and options.
|
|
6
6
|
from batchalign import (WhisperEngine, WhisperFAEngine, StanzaEngine, RevEngine,
|
7
7
|
NgramRetraceEngine, DisfluencyReplacementEngine, WhisperUTREngine,
|
8
8
|
RevUTREngine, EvaluationEngine, WhisperXEngine, NemoSpeakerEngine,
|
9
|
-
StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine)
|
9
|
+
StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine, SeamlessTranslationModel)
|
10
10
|
from batchalign import BatchalignPipeline
|
11
11
|
from batchalign.models import resolve
|
12
12
|
|
@@ -28,6 +28,7 @@ DEFAULT_PACKAGES = {
|
|
28
28
|
"eval": "evaluation",
|
29
29
|
"utterance": "stanza_utt",
|
30
30
|
"coref": "stanza_coref",
|
31
|
+
"translate": "seamless_translate",
|
31
32
|
}
|
32
33
|
|
33
34
|
LANGUAGE_OVERRIDE_PACKAGES = {
|
@@ -129,6 +130,8 @@ def dispatch_pipeline(pkg_str, lang, num_speakers=None, **arg_overrides):
|
|
129
130
|
engines.append(CorefEngine())
|
130
131
|
elif engine == "wav2vec_fa":
|
131
132
|
engines.append(Wave2VecFAEngine())
|
133
|
+
elif engine == "seamless_translate":
|
134
|
+
engines.append(SeamlessTranslationModel())
|
132
135
|
|
133
136
|
L.debug(f"Done initalizing packages.")
|
134
137
|
return BatchalignPipeline(*engines)
|
@@ -154,9 +154,9 @@ class Wave2VecFAEngine(BatchalignEngine):
|
|
154
154
|
if '\x15' not in ut.text:
|
155
155
|
ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
|
156
156
|
else:
|
157
|
-
ut.text = re.sub("\x15\d+_\d+\x15",
|
157
|
+
ut.text = re.sub(r"\x15\d+_\d+\x15",
|
158
158
|
f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
|
159
159
|
elif ut.text != None:
|
160
|
-
ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
|
160
|
+
ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
|
161
161
|
|
162
162
|
return doc
|
@@ -179,9 +179,9 @@ class WhisperFAEngine(BatchalignEngine):
|
|
179
179
|
if '\x15' not in ut.text:
|
180
180
|
ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
|
181
181
|
else:
|
182
|
-
ut.text = re.sub("\x15\d+_\d+\x15",
|
182
|
+
ut.text = re.sub(r"\x15\d+_\d+\x15",
|
183
183
|
f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
|
184
184
|
elif ut.text != None:
|
185
|
-
ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
|
185
|
+
ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
|
186
186
|
|
187
187
|
return doc
|
@@ -0,0 +1 @@
|
|
1
|
+
from .seamless import SeamlessTranslationModel
|
@@ -0,0 +1,53 @@
|
|
1
|
+
from batchalign.models import WhisperFAModel
|
2
|
+
from batchalign.document import *
|
3
|
+
from batchalign.pipelines.base import *
|
4
|
+
from batchalign.utils import *
|
5
|
+
from batchalign.utils.dp import *
|
6
|
+
from batchalign.constants import *
|
7
|
+
|
8
|
+
from transformers import AutoProcessor, SeamlessM4TModel
|
9
|
+
|
10
|
+
import logging
|
11
|
+
L = logging.getLogger("batchalign")
|
12
|
+
|
13
|
+
import re
|
14
|
+
|
15
|
+
# !uv pip install sentencepiece
|
16
|
+
|
17
|
+
import pycountry
|
18
|
+
import warnings
|
19
|
+
|
20
|
+
class SeamlessTranslationModel(BatchalignEngine):
    """Engine for the TRANSLATE task, backed by a SeamlessM4T checkpoint.

    For every utterance that does not already carry a translation, the
    utterance text is translated to English and stored on
    ``utterance.translation``.
    """

    tasks = [ Task.TRANSLATE ]

    def _hook_status(self, status_hook):
        # Register the progress callback invoked after each translated utterance.
        self.status_hook = status_hook

    def __init__(self):
        checkpoint = "facebook/hf-seamless-m4t-medium"
        self.status_hook = None
        self.processor = AutoProcessor.from_pretrained(checkpoint)
        self.model = SeamlessM4TModel.from_pretrained(checkpoint)

    def process(self, doc:Document, **kwargs):
        """Translate each untranslated utterance of `doc` in place; return `doc`."""
        for position, item in enumerate(doc.content):
            # only utterances without an existing translation are processed
            if not isinstance(item, Utterance) or item.translation:
                continue

            # "zho" is passed to the processor as "cmn"
            # (presumably the Mandarin code the checkpoint expects -- TODO confirm)
            primary_lang = doc.langs[0]
            source_lang = "cmn" if primary_lang == "zho" else primary_lang

            source_text = item.strip(join_with_spaces=False,
                                     include_retrace=True,
                                     include_fp=True)
            encoded = self.processor(text=source_text,
                                     src_lang=source_lang,
                                     return_tensors="pt")
            generated = self.model.generate(**encoded,
                                            tgt_lang="eng",
                                            generate_speech=False)
            english = self.processor.decode(generated[0].tolist()[0],
                                            skip_special_tokens=True)

            # put a space in front of every punctuation mark
            for mark in MOR_PUNCT + ENDING_PUNCT:
                english = english.replace(mark, " "+mark)
            item.translation = english

            if self.status_hook is not None:
                self.status_hook(position+1, len(doc.content))

        return doc
|
52
|
+
|
53
|
+
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: batchalign
|
3
|
-
Version: 0.7.13.post1
|
3
|
+
Version: 0.7.15
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
@@ -33,6 +33,9 @@ Requires-Dist: soundfile~=0.12.0
|
|
33
33
|
Requires-Dist: rich-click>=1.7.0
|
34
34
|
Requires-Dist: typing-extensions
|
35
35
|
Requires-Dist: num2words
|
36
|
+
Requires-Dist: tiktoken
|
37
|
+
Requires-Dist: blobfile
|
38
|
+
Requires-Dist: sentencepiece
|
36
39
|
Provides-Extra: dev
|
37
40
|
Requires-Dist: pytest; extra == "dev"
|
38
41
|
Provides-Extra: train
|
@@ -40,6 +40,7 @@ batchalign/models/training/__init__.py
|
|
40
40
|
batchalign/models/training/run.py
|
41
41
|
batchalign/models/training/utils.py
|
42
42
|
batchalign/models/utterance/__init__.py
|
43
|
+
batchalign/models/utterance/cantonese_infer.py
|
43
44
|
batchalign/models/utterance/dataset.py
|
44
45
|
batchalign/models/utterance/execute.py
|
45
46
|
batchalign/models/utterance/infer.py
|
@@ -83,6 +84,8 @@ batchalign/pipelines/morphosyntax/fr/case.py
|
|
83
84
|
batchalign/pipelines/morphosyntax/ja/verbforms.py
|
84
85
|
batchalign/pipelines/speaker/__init__.py
|
85
86
|
batchalign/pipelines/speaker/nemo_speaker.py
|
87
|
+
batchalign/pipelines/translate/__init__.py
|
88
|
+
batchalign/pipelines/translate/seamless.py
|
86
89
|
batchalign/pipelines/utr/__init__.py
|
87
90
|
batchalign/pipelines/utr/rev_utr.py
|
88
91
|
batchalign/pipelines/utr/utils.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/filled_pauses.eng
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/replacements.eng
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/test.test
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ja/verbforms.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utterance/ud_utterance.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_file.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_generator.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_lexer.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_parser.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_utils.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/textgrid/test_textgrid.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/analysis/test_eval.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_pipeline.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_utils.py
RENAMED
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_disfluency.py
RENAMED
File without changes
|
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/fa/test_fa_pipeline.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline_models.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|