batchalign 0.7.14__tar.gz → 0.7.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. {batchalign-0.7.14/batchalign.egg-info → batchalign-0.7.15}/PKG-INFO +1 -1
  2. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/__init__.py +1 -1
  3. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/resolve.py +1 -1
  4. batchalign-0.7.15/batchalign/models/utterance/__init__.py +4 -0
  5. batchalign-0.7.15/batchalign/models/utterance/cantonese_infer.py +164 -0
  6. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/whisper/infer_asr.py +1 -0
  7. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/rev.py +6 -2
  8. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/utils.py +5 -2
  9. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/whisper.py +6 -2
  10. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/fa/wave2vec_fa.py +2 -2
  11. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/fa/whisper_fa.py +2 -2
  12. batchalign-0.7.15/batchalign/version +3 -0
  13. {batchalign-0.7.14 → batchalign-0.7.15/batchalign.egg-info}/PKG-INFO +1 -1
  14. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/SOURCES.txt +1 -0
  15. batchalign-0.7.14/batchalign/models/utterance/__init__.py +0 -2
  16. batchalign-0.7.14/batchalign/version +0 -3
  17. {batchalign-0.7.14 → batchalign-0.7.15}/LICENSE +0 -0
  18. {batchalign-0.7.14 → batchalign-0.7.15}/MANIFEST.in +0 -0
  19. {batchalign-0.7.14 → batchalign-0.7.15}/README.md +0 -0
  20. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/__init__.py +0 -0
  21. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/__main__.py +0 -0
  22. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/cli/__init__.py +0 -0
  23. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/cli/cli.py +0 -0
  24. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/cli/dispatch.py +0 -0
  25. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/constants.py +0 -0
  26. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/document.py +0 -0
  27. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/errors.py +0 -0
  28. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/__init__.py +0 -0
  29. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/base.py +0 -0
  30. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/__init__.py +0 -0
  31. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/file.py +0 -0
  32. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/generator.py +0 -0
  33. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/lexer.py +0 -0
  34. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/parser.py +0 -0
  35. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/utils.py +0 -0
  36. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/__init__.py +0 -0
  37. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/file.py +0 -0
  38. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/generator.py +0 -0
  39. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/parser.py +0 -0
  40. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/__init__.py +0 -0
  41. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/config.yaml +0 -0
  42. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/infer.py +0 -0
  43. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/utils.py +0 -0
  44. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/training/__init__.py +0 -0
  45. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/training/run.py +0 -0
  46. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/training/utils.py +0 -0
  47. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utils.py +0 -0
  48. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/dataset.py +0 -0
  49. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/execute.py +0 -0
  50. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/infer.py +0 -0
  51. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/prep.py +0 -0
  52. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/train.py +0 -0
  53. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/wave2vec/__init__.py +0 -0
  54. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/wave2vec/infer_fa.py +0 -0
  55. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/whisper/__init__.py +0 -0
  56. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/whisper/infer_fa.py +0 -0
  57. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/__init__.py +0 -0
  58. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/analysis/__init__.py +0 -0
  59. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/analysis/eval.py +0 -0
  60. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/__init__.py +0 -0
  61. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/num2chinese.py +0 -0
  62. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/whisperx.py +0 -0
  63. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/base.py +0 -0
  64. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/__init__.py +0 -0
  65. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  66. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  67. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  68. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/retrace.py +0 -0
  69. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  70. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  71. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/test.test +0 -0
  72. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/dispatch.py +0 -0
  73. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/fa/__init__.py +0 -0
  74. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  75. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  76. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  77. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  78. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  79. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  80. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  81. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  82. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/pipeline.py +0 -0
  83. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/speaker/__init__.py +0 -0
  84. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  85. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/translate/__init__.py +0 -0
  86. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/translate/seamless.py +0 -0
  87. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/__init__.py +0 -0
  88. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/rev_utr.py +0 -0
  89. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/utils.py +0 -0
  90. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  91. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utterance/__init__.py +0 -0
  92. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  93. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/__init__.py +0 -0
  94. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/conftest.py +0 -0
  95. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  96. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  97. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  98. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  99. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  100. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  101. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  102. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  103. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  104. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  105. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  106. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  107. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/fixures.py +0 -0
  108. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  109. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  110. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/test_document.py +0 -0
  111. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/__init__.py +0 -0
  112. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/config.py +0 -0
  113. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/dp.py +0 -0
  114. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/utils.py +0 -0
  115. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/dependency_links.txt +0 -0
  116. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/entry_points.txt +0 -0
  117. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/requires.txt +0 -0
  118. {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/top_level.txt +0 -0
  119. {batchalign-0.7.14 → batchalign-0.7.15}/setup.cfg +0 -0
  120. {batchalign-0.7.14 → batchalign-0.7.15}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.14
3
+ Version: 0.7.15
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,4 +1,4 @@
1
- from .utterance import BertUtteranceModel
1
+ from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
2
2
  from .whisper import WhisperASRModel, WhisperFAModel
3
3
  from .speaker import NemoSpeakerModel
4
4
  from .utils import ASRAudioFile
@@ -8,7 +8,7 @@ resolver = {
8
8
  "utterance": {
9
9
  'eng': "talkbank/CHATUtterance-en",
10
10
  "zho": "talkbank/CHATUtterance-zh_CN",
11
- "yue": "talkbank/CHATUtterance-zh_CN",
11
+ "yue": "PolyU-AngelChanLab/Cantonese-Utterance-Segmentation",
12
12
  },
13
13
  "whisper": {
14
14
  'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
@@ -0,0 +1,4 @@
1
+ from .infer import BertUtteranceModel
2
+ from .cantonese_infer import BertCantoneseUtteranceModel
3
+
4
+
@@ -0,0 +1,164 @@
1
+ import re
2
+ import string
3
+ import random
4
+
5
+ # tokenization utilities
6
+ import nltk
7
+ from nltk import word_tokenize, sent_tokenize
8
+
9
+ # torch
10
+ import torch
11
+ from torch.utils.data import dataset
12
+ from torch.utils.data.dataloader import DataLoader
13
+ from torch.optim import AdamW
14
+
15
+ # import huggingface utils
16
+ from transformers import AutoTokenizer, BertForTokenClassification
17
+ from transformers import DataCollatorForTokenClassification
18
+
19
+ # tqdm
20
+ from tqdm import tqdm
21
+
22
+ # seed device and tokens
23
+ DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
24
+
25
+ # seed model
26
+ class BertCantoneseUtteranceModel(object):
27
+
28
+ def __init__(self, model):
29
+ # seed tokenizers and model
30
+ self.tokenizer = AutoTokenizer.from_pretrained(model)
31
+ self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
32
+ self.max_length = 512
33
+ self.overlap = 20
34
+
35
+ # eval mode
36
+ self.model.eval()
37
+ print(f"Model and tokenizer initialized on device: {DEVICE}")
38
+ print(f"Max length set to {self.max_length} with overlap of {self.overlap}")
39
+
40
+ def __call__(self, passage):
41
+ # Step 1: Clean up passage
42
+ passage = passage.lower()
43
+ passage = passage.replace('.','')
44
+ passage = passage.replace(',','')
45
+ passage = passage.replace('!','')
46
+ passage = passage.replace('!','')
47
+ passage = passage.replace('?','')
48
+ passage = passage.replace('。','')
49
+ passage = passage.replace(',','')
50
+ passage = passage.replace('?','')
51
+ passage = passage.replace('(','')
52
+ passage = passage.replace(')','')
53
+ passage = passage.replace(':','')
54
+ passage = passage.replace('*','')
55
+ passage = passage.replace('l','')
56
+
57
+
58
+ # Step 2: Define keywords and split the passage based on them
59
+ keywords = ['呀', '啦', '喎', '嘞', '㗎喇', '囉', '㗎', '啊', '嗯'] # Replace with your desired keywords
60
+
61
+ chunks = []
62
+ start = 0
63
+
64
+ while start < len(passage):
65
+ # Find the position of each keyword in the passage starting from the current `start`
66
+ keyword_positions = [(keyword, passage.find(keyword, start)) for keyword in keywords]
67
+ # Filter out keywords that are not found (find() returns -1 if not found)
68
+ keyword_positions = [kp for kp in keyword_positions if kp[1] != -1]
69
+
70
+ if keyword_positions:
71
+ # Find the keyword that appears first in the passage from current start
72
+ first_keyword, keyword_pos = min(keyword_positions, key=lambda x: x[1])
73
+ chunk = passage[start:keyword_pos + len(first_keyword)]
74
+ chunks.append(chunk)
75
+ start = keyword_pos + len(first_keyword)
76
+ else:
77
+ # No more keywords found, add the rest of the passage as the last chunk
78
+ chunks.append(passage[start:])
79
+ break
80
+
81
+ # Debugging: Print number of chunks and their content
82
+ print(f"Created {len(chunks)} chunks based on keywords.")
83
+ for i, chunk in enumerate(chunks):
84
+ print(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
85
+
86
+ # Step 3: Process each chunk and restore punctuation
87
+ final_passage = []
88
+ for chunk_index, chunk in enumerate(chunks):
89
+ print(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
90
+
91
+ # Step 3.1: Split chunk by characters (Chinese tokenization)
92
+ tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
93
+
94
+ # Step 3.2: Pass chunk through the tokenizer and model
95
+ tokd = self.tokenizer.batch_encode_plus([tokenized_chunk],
96
+ return_tensors='pt',
97
+ truncation=True,
98
+ padding=True,
99
+ max_length=self.max_length,
100
+ is_split_into_words=True).to(DEVICE)
101
+
102
+ try:
103
+ # Pass it through the model
104
+ res = self.model(**tokd).logits
105
+ except Exception as e:
106
+ print(f"Error during model inference: {e}")
107
+ return []
108
+
109
+ # Argmax for classification
110
+ classified_targets = torch.argmax(res, dim=2).cpu()
111
+
112
+ # Initialize result tokens list for the current chunk
113
+ res_toks = []
114
+ prev_word_idx = None
115
+
116
+ # Iterate over tokenized words
117
+ wids = tokd.word_ids(0)
118
+ for indx, elem in enumerate(wids):
119
+ if elem is None or elem == prev_word_idx:
120
+ continue
121
+
122
+ prev_word_idx = elem
123
+ action = classified_targets[0][indx]
124
+
125
+ # Get the word corresponding to the token
126
+ w = tokenized_chunk[elem] # Use tokenized chunk here
127
+
128
+ # Fix one word hanging issue (if needed)
129
+ will_action = False
130
+ if indx < len(wids) - 2 and classified_targets[0][indx + 1] > 0:
131
+ will_action = True
132
+
133
+ if not will_action:
134
+ # Perform the edits based on model predictions
135
+ if action == 1: # First capital letter
136
+ w = w[0].upper() + w[1:]
137
+ elif action == 2: # Add period
138
+ w = w + '.'
139
+ elif action == 3: # Add question mark
140
+ w = w + '?'
141
+ elif action == 4: # Add exclamation mark
142
+ w = w + '!'
143
+ elif action == 5: # Add comma
144
+ w = w + ','
145
+
146
+ # Append modified word to result list
147
+ res_toks.append(w)
148
+
149
+ # Convert list of tokens back to string and append to final_passage
150
+ final_passage.append(self.tokenizer.convert_tokens_to_string(res_toks))
151
+
152
+ # Step 4: Join processed chunks together into the final passage
153
+ final_text = ' '.join(final_passage)
154
+
155
+ print("Text processing completed. Generating final output...")
156
+
157
+ # Optionally, tokenize the final text into sentences based on punctuation
158
+ try:
159
+ split_passage = sent_tokenize(final_text)
160
+ except LookupError:
161
+ nltk.download('punkt')
162
+ split_passage = sent_tokenize(final_text)
163
+
164
+ return split_passage
@@ -33,6 +33,7 @@ import pycountry
33
33
  import logging
34
34
  L = logging.getLogger("batchalign")
35
35
 
36
+ # DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
36
37
  # DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
37
38
  DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
38
39
  # PYTORCH_ENABLE_MPS_FALLBACK=1
@@ -10,7 +10,7 @@ from batchalign.utils.config import config_read
10
10
 
11
11
  from batchalign.errors import *
12
12
 
13
- from batchalign.models import BertUtteranceModel, resolve
13
+ from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve
14
14
 
15
15
  import time
16
16
  import pathlib
@@ -49,7 +49,11 @@ class RevEngine(BatchalignEngine):
49
49
  self.__client = apiclient.RevAiAPIClient(key)
50
50
  if resolve("utterance", lang) != None:
51
51
  L.debug("Initializing utterance model...")
52
- self.__engine = BertUtteranceModel(resolve("utterance", lang))
52
+ if lang != "yue":
53
+ self.__engine = BertUtteranceModel(resolve("utterance", lang))
54
+ else:
55
+ # we have special inference procedure for cantonese
56
+ self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
53
57
  L.debug("Done.")
54
58
  else:
55
59
  self.__engine = None
@@ -94,7 +94,10 @@ def retokenize_with_engine(intermediate_output, engine):
94
94
  tmp = []
95
95
 
96
96
  for s in new_ut:
97
- tmp.append((s, utterance.pop(0)[1]))
97
+ try:
98
+ tmp.append((s, utterance.pop(0)[1]))
99
+ except IndexError:
100
+ continue
98
101
 
99
102
  final_outputs.append((speaker, tmp+[[delim, [None, None]]]))
100
103
 
@@ -159,7 +162,7 @@ def process_generation(output, lang="eng", utterance_engine=None):
159
162
  final_words.append([part.strip(), [cur, cur+div]])
160
163
  cur += div
161
164
 
162
- lang_2 = pycountry.languages.get(alpha_3=lang).alpha_2
165
+ lang_2 = "yue" if lang == "yue" else pycountry.languages.get(alpha_3=lang).alpha_2
163
166
  def catched_num2words(i):
164
167
  if not i.isdigit():
165
168
  return i
@@ -1,7 +1,7 @@
1
1
  from batchalign.document import *
2
2
  from batchalign.pipelines.base import *
3
3
  from batchalign.pipelines.asr.utils import *
4
- from batchalign.models import WhisperASRModel, BertUtteranceModel
4
+ from batchalign.models import WhisperASRModel, BertUtteranceModel, BertCantoneseUtteranceModel
5
5
 
6
6
  import pycountry
7
7
 
@@ -44,7 +44,11 @@ class WhisperEngine(BatchalignEngine):
44
44
 
45
45
  if resolve("utterance", self.__lang) != None:
46
46
  L.debug("Initializing utterance model...")
47
- self.__engine = BertUtteranceModel(resolve("utterance", self.__lang))
47
+ if lang != "yue":
48
+ self.__engine = BertUtteranceModel(resolve("utterance", lang))
49
+ else:
50
+ # we have special inference procedure for cantonese
51
+ self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
48
52
  L.debug("Done.")
49
53
  else:
50
54
  self.__engine = None
@@ -154,9 +154,9 @@ class Wave2VecFAEngine(BatchalignEngine):
154
154
  if '\x15' not in ut.text:
155
155
  ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
156
156
  else:
157
- ut.text = re.sub("\x15\d+_\d+\x15",
157
+ ut.text = re.sub(r"\x15\d+_\d+\x15",
158
158
  f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
159
159
  elif ut.text != None:
160
- ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
160
+ ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
161
161
 
162
162
  return doc
@@ -179,9 +179,9 @@ class WhisperFAEngine(BatchalignEngine):
179
179
  if '\x15' not in ut.text:
180
180
  ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
181
181
  else:
182
- ut.text = re.sub("\x15\d+_\d+\x15",
182
+ ut.text = re.sub(r"\x15\d+_\d+\x15",
183
183
  f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
184
184
  elif ut.text != None:
185
- ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
185
+ ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()
186
186
 
187
187
  return doc
@@ -0,0 +1,3 @@
1
+ 0.7.15
2
+ Feburary 23rd, 2025
3
+ Whisper ASR with Cantonese and tokenization!
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.14
3
+ Version: 0.7.15
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -40,6 +40,7 @@ batchalign/models/training/__init__.py
40
40
  batchalign/models/training/run.py
41
41
  batchalign/models/training/utils.py
42
42
  batchalign/models/utterance/__init__.py
43
+ batchalign/models/utterance/cantonese_infer.py
43
44
  batchalign/models/utterance/dataset.py
44
45
  batchalign/models/utterance/execute.py
45
46
  batchalign/models/utterance/infer.py
@@ -1,2 +0,0 @@
1
- from .infer import BertUtteranceModel
2
-
@@ -1,3 +0,0 @@
1
- 0.7.14
2
- Feburary 19nd, 2025
3
- machine translation!
File without changes
File without changes
File without changes
File without changes
File without changes