batchalign 0.7.13.post1__tar.gz → 0.7.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. {batchalign-0.7.13.post1/batchalign.egg-info → batchalign-0.7.15}/PKG-INFO +4 -1
  2. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/cli.py +22 -0
  3. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/dispatch.py +1 -0
  4. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/document.py +4 -0
  5. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/generator.py +2 -1
  6. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/parser.py +5 -1
  7. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/__init__.py +1 -1
  8. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/resolve.py +1 -1
  9. batchalign-0.7.15/batchalign/models/utterance/__init__.py +4 -0
  10. batchalign-0.7.15/batchalign/models/utterance/cantonese_infer.py +164 -0
  11. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/whisper/infer_asr.py +1 -0
  12. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/__init__.py +1 -0
  13. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/rev.py +6 -2
  14. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/utils.py +5 -2
  15. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/whisper.py +6 -2
  16. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/dispatch.py +4 -1
  17. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/wave2vec_fa.py +2 -2
  18. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/whisper_fa.py +2 -2
  19. batchalign-0.7.15/batchalign/pipelines/translate/__init__.py +1 -0
  20. batchalign-0.7.15/batchalign/pipelines/translate/seamless.py +53 -0
  21. batchalign-0.7.15/batchalign/version +3 -0
  22. {batchalign-0.7.13.post1 → batchalign-0.7.15/batchalign.egg-info}/PKG-INFO +4 -1
  23. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/SOURCES.txt +3 -0
  24. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/requires.txt +3 -0
  25. {batchalign-0.7.13.post1 → batchalign-0.7.15}/setup.py +3 -0
  26. batchalign-0.7.13.post1/batchalign/models/utterance/__init__.py +0 -2
  27. batchalign-0.7.13.post1/batchalign/version +0 -3
  28. {batchalign-0.7.13.post1 → batchalign-0.7.15}/LICENSE +0 -0
  29. {batchalign-0.7.13.post1 → batchalign-0.7.15}/MANIFEST.in +0 -0
  30. {batchalign-0.7.13.post1 → batchalign-0.7.15}/README.md +0 -0
  31. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/__init__.py +0 -0
  32. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/__main__.py +0 -0
  33. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/cli/__init__.py +0 -0
  34. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/constants.py +0 -0
  35. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/errors.py +0 -0
  36. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/__init__.py +0 -0
  37. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/base.py +0 -0
  38. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/__init__.py +0 -0
  39. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/file.py +0 -0
  40. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/lexer.py +0 -0
  41. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/chat/utils.py +0 -0
  42. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/__init__.py +0 -0
  43. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/file.py +0 -0
  44. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/generator.py +0 -0
  45. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/formats/textgrid/parser.py +0 -0
  46. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/__init__.py +0 -0
  47. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/config.yaml +0 -0
  48. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/infer.py +0 -0
  49. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/speaker/utils.py +0 -0
  50. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/training/__init__.py +0 -0
  51. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/training/run.py +0 -0
  52. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/training/utils.py +0 -0
  53. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utils.py +0 -0
  54. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/dataset.py +0 -0
  55. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/execute.py +0 -0
  56. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/infer.py +0 -0
  57. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/prep.py +0 -0
  58. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/utterance/train.py +0 -0
  59. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/wave2vec/__init__.py +0 -0
  60. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/wave2vec/infer_fa.py +0 -0
  61. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/whisper/__init__.py +0 -0
  62. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/models/whisper/infer_fa.py +0 -0
  63. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/analysis/__init__.py +0 -0
  64. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/analysis/eval.py +0 -0
  65. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/__init__.py +0 -0
  66. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/num2chinese.py +0 -0
  67. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/asr/whisperx.py +0 -0
  68. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/base.py +0 -0
  69. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/__init__.py +0 -0
  70. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  71. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  72. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  73. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/retrace.py +0 -0
  74. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  75. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  76. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/test.test +0 -0
  77. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/fa/__init__.py +0 -0
  78. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  79. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  80. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  81. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  82. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  83. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  84. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  85. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  86. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/pipeline.py +0 -0
  87. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/speaker/__init__.py +0 -0
  88. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  89. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/__init__.py +0 -0
  90. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/rev_utr.py +0 -0
  91. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/utils.py +0 -0
  92. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  93. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utterance/__init__.py +0 -0
  94. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  95. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/__init__.py +0 -0
  96. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/conftest.py +0 -0
  97. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  98. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  99. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  100. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  101. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  102. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  103. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  104. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  105. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  106. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  107. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  108. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  109. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/fixures.py +0 -0
  110. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  111. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  112. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/tests/test_document.py +0 -0
  113. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/__init__.py +0 -0
  114. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/config.py +0 -0
  115. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/dp.py +0 -0
  116. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign/utils/utils.py +0 -0
  117. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/dependency_links.txt +0 -0
  118. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/entry_points.txt +0 -0
  119. {batchalign-0.7.13.post1 → batchalign-0.7.15}/batchalign.egg-info/top_level.txt +0 -0
  120. {batchalign-0.7.13.post1 → batchalign-0.7.15}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.13.post1
3
+ Version: 0.7.15
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -33,6 +33,9 @@ Requires-Dist: soundfile~=0.12.0
33
33
  Requires-Dist: rich-click>=1.7.0
34
34
  Requires-Dist: typing-extensions
35
35
  Requires-Dist: num2words
36
+ Requires-Dist: tiktoken
37
+ Requires-Dist: blobfile
38
+ Requires-Dist: sentencepiece
36
39
  Provides-Extra: dev
37
40
  Requires-Dist: pytest; extra == "dev"
38
41
  Provides-Extra: train
@@ -196,6 +196,28 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
196
196
  loader, writer, C,
197
197
  asr=asr, **kwargs)
198
198
 
199
+ #################### TRANSLATE ################################
200
+
201
@batchalign.command()
@common_options
@click.pass_context
def translate(ctx, in_dir, out_dir, **kwargs):
    """Translate the transcript to English."""
    # The docstring above is kept verbatim: it is presumably surfaced by click
    # as the command's help text. Extra options collected in **kwargs are
    # accepted but not forwarded to the dispatcher.

    def loader(file):
        # Read the CHAT file from its absolute path and hand back the parsed
        # document it carries.
        return CHATFile(path=os.path.abspath(file), special_mor_=True).doc

    def writer(doc, output):
        # Wrap the processed document in a CHAT file and write it to `output`.
        chat_file = CHATFile(doc=doc)
        chat_file.write(output)

    # Run the "translate" command over .cha inputs, with "eng" and 1 passed as
    # the language and speaker-count arguments.
    _dispatch(
        "translate", "eng", 1, ["cha"], ctx,
        in_dir, out_dir,
        loader, writer, C,
    )
220
+
199
221
  #################### MORPHOTAG ################################
200
222
 
201
223
  @batchalign.command()
@@ -48,6 +48,7 @@ Cmd2Task = {
48
48
  "benchmark": "asr,eval",
49
49
  "utseg": "utterance",
50
50
  "coref": "coref",
51
+ "translate": "translate",
51
52
  }
52
53
 
53
54
  # this is the main runner used by all functions
@@ -31,6 +31,7 @@ class Task(IntEnum):
31
31
  MORPHOSYNTAX = 11
32
32
  COREF = 12
33
33
  WER = 13
34
+ TRANSLATE = 14
34
35
 
35
36
 
36
37
  DEBUG__G = 0
@@ -54,6 +55,7 @@ TypeMap = {
54
55
  Task.DISFLUENCY_ANALYSIS: TaskType.PROCESSING,
55
56
  Task.COREF: TaskType.PROCESSING,
56
57
  Task.WER: TaskType.ANALYSIS,
58
+ Task.TRANSLATE: TaskType.PROCESSING,
57
59
 
58
60
  Task.DEBUG__G: TaskType.GENERATION,
59
61
  Task.DEBUG__P: TaskType.PROCESSING,
@@ -73,6 +75,7 @@ TaskFriendlyName = {
73
75
  Task.DISFLUENCY_ANALYSIS: "Disfluncy Analysis",
74
76
  Task.COREF: "Coreference Resolution",
75
77
  Task.WER: "Word Error Rate",
78
+ Task.TRANSLATE: "Translation",
76
79
  Task.DEBUG__G: "TEST_GENERATION",
77
80
  Task.DEBUG__P: "TEST_PROCESSING",
78
81
  Task.DEBUG__A: "TEST_ANALYSIS",
@@ -150,6 +153,7 @@ class Utterance(BaseModel):
150
153
  tier: Tier = Field(default=Tier())
151
154
  content: Sentence
152
155
  text: Optional[str] = Field(default=None)
156
+ translation: Optional[str] = Field(default=None)
153
157
  time: Optional[Tuple[int,int]] = Field(default=None)
154
158
  custom_dependencies: List[CustomLine] = Field(default=[])
155
159
 
@@ -95,7 +95,8 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
95
95
  result.append("%wor:\t"+" ".join(wor_elems))
96
96
  if has_coref:
97
97
  result.append("%coref:\t"+" ".join(coref_elems))
98
-
98
+ if utterance.translation != None:
99
+ result.append("%xtra:\t"+utterance.translation)
99
100
 
100
101
 
101
102
  #### EXTRA LINE GENERATION ####
@@ -280,6 +280,7 @@ def chat_parse_doc(lines, special_mor=False):
280
280
  mor = None
281
281
  gra = None
282
282
  wor = None
283
+ translation = None
283
284
  additional = []
284
285
 
285
286
  while raw[0][0] == "%":
@@ -291,6 +292,8 @@ def chat_parse_doc(lines, special_mor=False):
291
292
  gra = line
292
293
  elif beg.strip() == "wor" or beg.strip() == "xwor":
293
294
  wor = line
295
+ elif beg.strip() == "xtra":
296
+ translation = line
294
297
  else:
295
298
  additional.append(CustomLine(id=beg.strip(),
296
299
  type=CustomLineType.DEPENDENT,
@@ -309,7 +312,8 @@ def chat_parse_doc(lines, special_mor=False):
309
312
  "content": parsed,
310
313
  "text": text,
311
314
  "delim": delim,
312
- "custom_dependencies": additional
315
+ "custom_dependencies": additional,
316
+ "translation": translation
313
317
  })
314
318
 
315
319
  timing = re.findall(rf"\x15(\d+)_(\d+)\x15", text)
@@ -1,4 +1,4 @@
1
- from .utterance import BertUtteranceModel
1
+ from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
2
2
  from .whisper import WhisperASRModel, WhisperFAModel
3
3
  from .speaker import NemoSpeakerModel
4
4
  from .utils import ASRAudioFile
@@ -8,7 +8,7 @@ resolver = {
8
8
  "utterance": {
9
9
  'eng': "talkbank/CHATUtterance-en",
10
10
  "zho": "talkbank/CHATUtterance-zh_CN",
11
- "yue": "talkbank/CHATUtterance-zh_CN",
11
+ "yue": "PolyU-AngelChanLab/Cantonese-Utterance-Segmentation",
12
12
  },
13
13
  "whisper": {
14
14
  'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
@@ -0,0 +1,4 @@
1
+ from .infer import BertUtteranceModel
2
+ from .cantonese_infer import BertCantoneseUtteranceModel
3
+
4
+
@@ -0,0 +1,164 @@
1
+ import re
2
+ import string
3
+ import random
4
+
5
+ # tokenization utilities
6
+ import nltk
7
+ from nltk import word_tokenize, sent_tokenize
8
+
9
+ # torch
10
+ import torch
11
+ from torch.utils.data import dataset
12
+ from torch.utils.data.dataloader import DataLoader
13
+ from torch.optim import AdamW
14
+
15
+ # import huggingface utils
16
+ from transformers import AutoTokenizer, BertForTokenClassification
17
+ from transformers import DataCollatorForTokenClassification
18
+
19
+ # tqdm
20
+ from tqdm import tqdm
21
+
22
+ # seed device and tokens
23
+ DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
24
+
25
+ # seed model
26
class BertCantoneseUtteranceModel(object):
    """Cantonese utterance segmentation with a BERT token-classification model.

    The passage is stripped of punctuation, cut after sentence-final
    particles, and every piece is run character-by-character through the
    classifier, whose per-character label decides which punctuation mark (if
    any) is re-attached. The punctuated text is finally split into sentences.

    Args:
        model: name or path handed to ``from_pretrained`` for both the
            tokenizer and the classifier.
    """

    # Characters deleted before inference: ASCII punctuation, the ideographic
    # full stop, and the full-width forms of , ! ? ( ) : that are common in
    # Cantonese text.
    # NOTE(review): the trailing 'l' is carried over from the original
    # cleanup, which removed every lowercase "l"; this mangles embedded Latin
    # words and may have been meant to be another character -- confirm.
    _STRIP_CHARS = ".,!?():*" + "\u3002" + "\uff0c\uff01\uff1f\uff08\uff09\uff1a" + "l"
    _STRIP_TABLE = str.maketrans("", "", _STRIP_CHARS)

    # Particles after which the passage is cut. Order matters only when two
    # keywords match at the same position: the earlier entry wins, so the
    # two-character particle is listed before its one-character prefix.
    _KEYWORDS = ('呀', '啦', '喎', '嘞', '㗎喇', '囉', '㗎', '啊', '嗯')

    # Classifier labels that append a punctuation mark (label 1 capitalizes).
    _SUFFIXES = {2: '.', 3: '?', 4: '!', 5: ','}

    def __init__(self, model):
        import logging
        log = logging.getLogger("batchalign")

        # seed tokenizers and model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
        self.max_length = 512
        # kept for compatibility; not used by the inference path
        self.overlap = 20

        # eval mode
        self.model.eval()
        log.debug("Model and tokenizer initialized on device: %s", DEVICE)
        log.debug("Max length set to %s with overlap of %s",
                  self.max_length, self.overlap)

    @classmethod
    def _clean(cls, passage):
        """Lowercase *passage* and delete every character in ``_STRIP_CHARS``."""
        return passage.lower().translate(cls._STRIP_TABLE)

    @staticmethod
    def _split_on_keywords(passage, keywords):
        """Cut *passage* right after each earliest-occurring keyword.

        Returns the list of pieces in order; text after the last keyword (or
        the whole passage when no keyword occurs) forms the final piece. An
        empty passage yields an empty list.
        """
        chunks = []
        start = 0
        while start < len(passage):
            hits = [(passage.find(kw, start), kw) for kw in keywords]
            hits = [hit for hit in hits if hit[0] != -1]
            if not hits:
                # no keyword left: the remainder is the last chunk
                chunks.append(passage[start:])
                break
            # min() keeps the first entry among equal positions
            pos, kw = min(hits, key=lambda hit: hit[0])
            end = pos + len(kw)
            chunks.append(passage[start:end])
            start = end
        return chunks

    @staticmethod
    def _windows(chunk, size):
        """Yield consecutive slices of *chunk* that are at most *size* long."""
        for begin in range(0, len(chunk), size):
            yield chunk[begin:begin + size]

    @classmethod
    def _apply_action(cls, w, action):
        """Return *w* edited according to the classifier label *action*.

        Label 1 upper-cases the first character; labels 2-5 append ``.``,
        ``?``, ``!`` and ``,`` respectively; any other label leaves *w* as is.
        """
        if action == 1:
            return w[0].upper() + w[1:]
        return w + cls._SUFFIXES.get(action, '')

    def __call__(self, passage):
        """Segment *passage* into a list of punctuated sentences.

        Returns:
            The sentences produced by ``sent_tokenize`` on the re-punctuated
            text, or an empty list if model inference raises.
        """
        import logging
        log = logging.getLogger("batchalign")

        # Step 1: clean up the passage
        passage = self._clean(passage)

        # Step 2: split after sentence-final particles
        chunks = self._split_on_keywords(passage, self._KEYWORDS)
        log.debug("Created %d chunks based on keywords.", len(chunks))

        # Each character is fed as its own word, so a piece of at most
        # max_length - 2 characters (room for the two special tokens a BERT
        # tokenizer adds) is not cut by the tokenizer's truncation; longer
        # chunks used to lose their tail silently.
        window = max(1, self.max_length - 2)
        pieces = [piece
                  for chunk in chunks
                  for piece in self._windows(chunk, window)]

        # Step 3: classify each piece and restore punctuation
        final_passage = []
        for piece_index, piece in enumerate(pieces):
            log.debug("Processing chunk %d/%d...", piece_index + 1, len(pieces))

            # split by characters (Chinese tokenization)
            chars = list(piece)

            tokd = self.tokenizer.batch_encode_plus([chars],
                                                    return_tensors='pt',
                                                    truncation=True,
                                                    padding=True,
                                                    max_length=self.max_length,
                                                    is_split_into_words=True).to(DEVICE)

            try:
                # inference only: no autograd bookkeeping needed
                with torch.no_grad():
                    logits = self.model(**tokd).logits
            except Exception:
                # best effort, as before: report and give up on the passage
                log.exception("Error during model inference")
                return []

            # argmax over the label dimension, converted once to plain ints
            labels = torch.argmax(logits, dim=2).cpu()[0].tolist()

            res_toks = []
            prev_word_idx = None
            wids = tokd.word_ids(0)
            for indx, elem in enumerate(wids):
                # skip special tokens and continuation pieces of the same word
                if elem is None or elem == prev_word_idx:
                    continue
                prev_word_idx = elem

                w = chars[elem]

                # "one word hanging" guard: when the next token also carries
                # an action, this token's own action is suppressed
                hanging = indx < len(wids) - 2 and labels[indx + 1] > 0
                if not hanging:
                    w = self._apply_action(w, labels[indx])

                res_toks.append(w)

            final_passage.append(self.tokenizer.convert_tokens_to_string(res_toks))

        # Step 4: join processed pieces into the final passage
        final_text = ' '.join(final_passage)
        log.debug("Text processing completed. Generating final output...")

        # split into sentences on the restored punctuation; fetch the punkt
        # data on first use if it is missing
        try:
            split_passage = sent_tokenize(final_text)
        except LookupError:
            nltk.download('punkt')
            split_passage = sent_tokenize(final_text)

        return split_passage
@@ -33,6 +33,7 @@ import pycountry
33
33
  import logging
34
34
  L = logging.getLogger("batchalign")
35
35
 
36
+ # DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
36
37
  # DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
37
38
  DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
38
39
  # PYTORCH_ENABLE_MPS_FALLBACK=1
@@ -12,3 +12,4 @@ from .utr import WhisperUTREngine, RevUTREngine
12
12
  from .analysis import EvaluationEngine
13
13
  from .utterance import StanzaUtteranceEngine
14
14
 
15
+ from .translate import SeamlessTranslationModel
@@ -10,7 +10,7 @@ from batchalign.utils.config import config_read
10
10
 
11
11
  from batchalign.errors import *
12
12
 
13
- from batchalign.models import BertUtteranceModel, resolve
13
+ from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve
14
14
 
15
15
  import time
16
16
  import pathlib
@@ -49,7 +49,11 @@ class RevEngine(BatchalignEngine):
49
49
  self.__client = apiclient.RevAiAPIClient(key)
50
50
  if resolve("utterance", lang) != None:
51
51
  L.debug("Initializing utterance model...")
52
- self.__engine = BertUtteranceModel(resolve("utterance", lang))
52
+ if lang != "yue":
53
+ self.__engine = BertUtteranceModel(resolve("utterance", lang))
54
+ else:
55
+ # we have special inference procedure for cantonese
56
+ self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
53
57
  L.debug("Done.")
54
58
  else:
55
59
  self.__engine = None
@@ -94,7 +94,10 @@ def retokenize_with_engine(intermediate_output, engine):
94
94
  tmp = []
95
95
 
96
96
  for s in new_ut:
97
- tmp.append((s, utterance.pop(0)[1]))
97
+ try:
98
+ tmp.append((s, utterance.pop(0)[1]))
99
+ except IndexError:
100
+ continue
98
101
 
99
102
  final_outputs.append((speaker, tmp+[[delim, [None, None]]]))
100
103
 
@@ -159,7 +162,7 @@ def process_generation(output, lang="eng", utterance_engine=None):
159
162
  final_words.append([part.strip(), [cur, cur+div]])
160
163
  cur += div
161
164
 
162
- lang_2 = pycountry.languages.get(alpha_3=lang).alpha_2
165
+ lang_2 = "yue" if lang == "yue" else pycountry.languages.get(alpha_3=lang).alpha_2
163
166
  def catched_num2words(i):
164
167
  if not i.isdigit():
165
168
  return i
@@ -1,7 +1,7 @@
1
1
  from batchalign.document import *
2
2
  from batchalign.pipelines.base import *
3
3
  from batchalign.pipelines.asr.utils import *
4
- from batchalign.models import WhisperASRModel, BertUtteranceModel
4
+ from batchalign.models import WhisperASRModel, BertUtteranceModel, BertCantoneseUtteranceModel
5
5
 
6
6
  import pycountry
7
7
 
@@ -44,7 +44,11 @@ class WhisperEngine(BatchalignEngine):
44
44
 
45
45
  if resolve("utterance", self.__lang) != None:
46
46
  L.debug("Initializing utterance model...")
47
- self.__engine = BertUtteranceModel(resolve("utterance", self.__lang))
47
+ if lang != "yue":
48
+ self.__engine = BertUtteranceModel(resolve("utterance", lang))
49
+ else:
50
+ # we have special inference procedure for cantonese
51
+ self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
48
52
  L.debug("Done.")
49
53
  else:
50
54
  self.__engine = None
@@ -6,7 +6,7 @@ Tabulate default packages and options.
6
6
  from batchalign import (WhisperEngine, WhisperFAEngine, StanzaEngine, RevEngine,
7
7
  NgramRetraceEngine, DisfluencyReplacementEngine, WhisperUTREngine,
8
8
  RevUTREngine, EvaluationEngine, WhisperXEngine, NemoSpeakerEngine,
9
- StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine)
9
+ StanzaUtteranceEngine, CorefEngine, Wave2VecFAEngine, SeamlessTranslationModel)
10
10
  from batchalign import BatchalignPipeline
11
11
  from batchalign.models import resolve
12
12
 
@@ -28,6 +28,7 @@ DEFAULT_PACKAGES = {
28
28
  "eval": "evaluation",
29
29
  "utterance": "stanza_utt",
30
30
  "coref": "stanza_coref",
31
+ "translate": "seamless_translate",
31
32
  }
32
33
 
33
34
  LANGUAGE_OVERRIDE_PACKAGES = {
@@ -129,6 +130,8 @@ def dispatch_pipeline(pkg_str, lang, num_speakers=None, **arg_overrides):
129
130
  engines.append(CorefEngine())
130
131
  elif engine == "wav2vec_fa":
131
132
  engines.append(Wave2VecFAEngine())
133
+ elif engine == "seamless_translate":
134
+ engines.append(SeamlessTranslationModel())
132
135
 
133
136
  L.debug(f"Done initalizing packages.")
134
137
  return BatchalignPipeline(*engines)
@@ -154,9 +154,9 @@ class Wave2VecFAEngine(BatchalignEngine):
154
154
  if '\x15' not in ut.text:
155
155
  ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
156
156
  else:
157
- ut.text = re.sub("\x15\d+_\d+\x15",
157
+ ut.text = re.sub(r"\x15\d+_\d+\x15",
158
158
  f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
159
159
  elif ut.text != None:
160
- ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
160
+ ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
161
161
 
162
162
  return doc
@@ -179,9 +179,9 @@ class WhisperFAEngine(BatchalignEngine):
179
179
  if '\x15' not in ut.text:
180
180
  ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
181
181
  else:
182
- ut.text = re.sub("\x15\d+_\d+\x15",
182
+ ut.text = re.sub(r"\x15\d+_\d+\x15",
183
183
  f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
184
184
  elif ut.text != None:
185
- ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
185
+ ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
186
186
 
187
187
  return doc
@@ -0,0 +1 @@
1
+ from .seamless import SeamlessTranslationModel
@@ -0,0 +1,53 @@
1
+ from batchalign.models import WhisperFAModel
2
+ from batchalign.document import *
3
+ from batchalign.pipelines.base import *
4
+ from batchalign.utils import *
5
+ from batchalign.utils.dp import *
6
+ from batchalign.constants import *
7
+
8
+ from transformers import AutoProcessor, SeamlessM4TModel
9
+
10
+ import logging
11
+ L = logging.getLogger("batchalign")
12
+
13
+ import re
14
+
15
+ # !uv pip install sentencepiece
16
+
17
+ import pycountry
18
+ import warnings
19
+
20
class SeamlessTranslationModel(BatchalignEngine):
    """Engine for the TRANSLATE task that fills in ``Utterance.translation``.

    Utterance text is translated to English with the
    "facebook/hf-seamless-m4t-medium" checkpoint; utterances that already
    carry a translation are left alone.
    """

    tasks = [ Task.TRANSLATE ]

    def __init__(self):
        checkpoint = "facebook/hf-seamless-m4t-medium"
        self.status_hook = None
        self.processor = AutoProcessor.from_pretrained(checkpoint)
        self.model = SeamlessM4TModel.from_pretrained(checkpoint)

    def _hook_status(self, status_hook):
        # Remember the progress callback; process() calls it as
        # status_hook(done, total).
        self.status_hook = status_hook

    def process(self, doc:Document, **kwargs):
        """Translate every untranslated utterance of *doc* in place.

        Returns the same document object that was passed in.
        """
        for position, item in enumerate(doc.content, start=1):
            # only utterances without an existing translation are handled
            if not isinstance(item, Utterance) or item.translation:
                continue

            # the document's first language is the source; "zho" is passed
            # to the processor as "cmn"
            source_lang = "cmn" if doc.langs[0] == "zho" else doc.langs[0]

            text = item.strip(join_with_spaces=False,
                              include_retrace=True,
                              include_fp=True)
            encoded = self.processor(text=text,
                                     src_lang=source_lang,
                                     return_tensors="pt")
            generated = self.model.generate(**encoded,
                                            tgt_lang="eng",
                                            generate_speech=False)
            translated = self.processor.decode(generated[0].tolist()[0],
                                               skip_special_tokens=True)

            # put a space in front of every punctuation mark
            for mark in MOR_PUNCT + ENDING_PUNCT:
                translated = translated.replace(mark, " "+mark)
            item.translation = translated

            if self.status_hook is not None:
                self.status_hook(position, len(doc.content))

        return doc
52
+
53
+
@@ -0,0 +1,3 @@
1
+ 0.7.15
2
+ February 23rd, 2025
3
+ Whisper ASR with Cantonese and tokenization!
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.13.post1
3
+ Version: 0.7.15
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -33,6 +33,9 @@ Requires-Dist: soundfile~=0.12.0
33
33
  Requires-Dist: rich-click>=1.7.0
34
34
  Requires-Dist: typing-extensions
35
35
  Requires-Dist: num2words
36
+ Requires-Dist: tiktoken
37
+ Requires-Dist: blobfile
38
+ Requires-Dist: sentencepiece
36
39
  Provides-Extra: dev
37
40
  Requires-Dist: pytest; extra == "dev"
38
41
  Provides-Extra: train
@@ -40,6 +40,7 @@ batchalign/models/training/__init__.py
40
40
  batchalign/models/training/run.py
41
41
  batchalign/models/training/utils.py
42
42
  batchalign/models/utterance/__init__.py
43
+ batchalign/models/utterance/cantonese_infer.py
43
44
  batchalign/models/utterance/dataset.py
44
45
  batchalign/models/utterance/execute.py
45
46
  batchalign/models/utterance/infer.py
@@ -83,6 +84,8 @@ batchalign/pipelines/morphosyntax/fr/case.py
83
84
  batchalign/pipelines/morphosyntax/ja/verbforms.py
84
85
  batchalign/pipelines/speaker/__init__.py
85
86
  batchalign/pipelines/speaker/nemo_speaker.py
87
+ batchalign/pipelines/translate/__init__.py
88
+ batchalign/pipelines/translate/seamless.py
86
89
  batchalign/pipelines/utr/__init__.py
87
90
  batchalign/pipelines/utr/rev_utr.py
88
91
  batchalign/pipelines/utr/utils.py
@@ -23,6 +23,9 @@ soundfile~=0.12.0
23
23
  rich-click>=1.7.0
24
24
  typing-extensions
25
25
  num2words
26
+ tiktoken
27
+ blobfile
28
+ sentencepiece
26
29
 
27
30
  [dev]
28
31
  pytest
@@ -52,6 +52,9 @@ setup(
52
52
  "rich-click>=1.7.0",
53
53
  "typing-extensions",
54
54
  "num2words",
55
+ "tiktoken",
56
+ "blobfile",
57
+ "sentencepiece"
55
58
  ],
56
59
  extras_require={
57
60
  'dev': [
@@ -1,2 +0,0 @@
1
- from .infer import BertUtteranceModel
2
-
@@ -1,3 +0,0 @@
1
- 0.7.13-post.1
2
- Feburary 14nd, 2025
3
- Remove hash sign.
File without changes
File without changes
File without changes