batchalign 0.7.14__tar.gz → 0.7.15__tar.gz
This diff compares the contents of the publicly released batchalign 0.7.14 and 0.7.15 packages as published to their public registry, and is provided for informational purposes only.
- {batchalign-0.7.14/batchalign.egg-info → batchalign-0.7.15}/PKG-INFO +1 -1
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/__init__.py +1 -1
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/resolve.py +1 -1
- batchalign-0.7.15/batchalign/models/utterance/__init__.py +4 -0
- batchalign-0.7.15/batchalign/models/utterance/cantonese_infer.py +164 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/whisper/infer_asr.py +1 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/rev.py +6 -2
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/utils.py +5 -2
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/whisper.py +6 -2
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/fa/wave2vec_fa.py +2 -2
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/fa/whisper_fa.py +2 -2
- batchalign-0.7.15/batchalign/version +3 -0
- {batchalign-0.7.14 → batchalign-0.7.15/batchalign.egg-info}/PKG-INFO +1 -1
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/SOURCES.txt +1 -0
- batchalign-0.7.14/batchalign/models/utterance/__init__.py +0 -2
- batchalign-0.7.14/batchalign/version +0 -3
- {batchalign-0.7.14 → batchalign-0.7.15}/LICENSE +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/MANIFEST.in +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/README.md +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/__main__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/cli/cli.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/cli/dispatch.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/constants.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/document.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/errors.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/base.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/training/run.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/infer.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/wave2vec/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/wave2vec/infer_fa.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/whisper/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/dispatch.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/morphosyntax/ud.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/translate/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/translate/seamless.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/config.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/dp.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign/utils/utils.py +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/requires.txt +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/setup.cfg +0 -0
- {batchalign-0.7.14 → batchalign-0.7.15}/setup.py +0 -0
batchalign/models/resolve.py
@@ -8,7 +8,7 @@ resolver = {
     "utterance": {
         'eng': "talkbank/CHATUtterance-en",
         "zho": "talkbank/CHATUtterance-zh_CN",
-        "yue": "
+        "yue": "PolyU-AngelChanLab/Cantonese-Utterance-Segmentation",
     },
     "whisper": {
         'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),

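For orientation, here is a minimal sketch (not part of the package) of how this resolver entry is consumed by the engines changed later in this diff: resolve("utterance", lang) returns the checkpoint name, and Cantonese is routed to the new model class.

    # Hedged sketch: mirrors the lookup-and-dispatch pattern added to rev.py and whisper.py below.
    from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve

    lang = "yue"
    checkpoint = resolve("utterance", lang)  # expected: "PolyU-AngelChanLab/Cantonese-Utterance-Segmentation"
    if checkpoint is not None:
        if lang != "yue":
            engine = BertUtteranceModel(checkpoint)
        else:
            engine = BertCantoneseUtteranceModel(checkpoint)
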
batchalign/models/utterance/cantonese_infer.py
@@ -0,0 +1,164 @@
+import re
+import string
+import random
+
+# tokenization utilities
+import nltk
+from nltk import word_tokenize, sent_tokenize
+
+# torch
+import torch
+from torch.utils.data import dataset
+from torch.utils.data.dataloader import DataLoader
+from torch.optim import AdamW
+
+# import huggingface utils
+from transformers import AutoTokenizer, BertForTokenClassification
+from transformers import DataCollatorForTokenClassification
+
+# tqdm
+from tqdm import tqdm
+
+# seed device and tokens
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+# seed model
+class BertCantoneseUtteranceModel(object):
+
+    def __init__(self, model):
+        # seed tokenizers and model
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
+        self.max_length = 512
+        self.overlap = 20
+
+        # eval mode
+        self.model.eval()
+        print(f"Model and tokenizer initialized on device: {DEVICE}")
+        print(f"Max length set to {self.max_length} with overlap of {self.overlap}")
+
+    def __call__(self, passage):
+        # Step 1: Clean up passage
+        passage = passage.lower()
+        passage = passage.replace('.','')
+        passage = passage.replace(',','')
+        passage = passage.replace('!','')
+        passage = passage.replace('!','')
+        passage = passage.replace('?','')
+        passage = passage.replace('。','')
+        passage = passage.replace(',','')
+        passage = passage.replace('?','')
+        passage = passage.replace('(','')
+        passage = passage.replace(')','')
+        passage = passage.replace(':','')
+        passage = passage.replace('*','')
+        passage = passage.replace('l','')
+
+
+        # Step 2: Define keywords and split the passage based on them
+        keywords = ['呀', '啦', '喎', '嘞', '㗎喇', '囉', '㗎', '啊', '嗯'] # Replace with your desired keywords
+
+        chunks = []
+        start = 0
+
+        while start < len(passage):
+            # Find the position of each keyword in the passage starting from the current `start`
+            keyword_positions = [(keyword, passage.find(keyword, start)) for keyword in keywords]
+            # Filter out keywords that are not found (find() returns -1 if not found)
+            keyword_positions = [kp for kp in keyword_positions if kp[1] != -1]
+
+            if keyword_positions:
+                # Find the keyword that appears first in the passage from current start
+                first_keyword, keyword_pos = min(keyword_positions, key=lambda x: x[1])
+                chunk = passage[start:keyword_pos + len(first_keyword)]
+                chunks.append(chunk)
+                start = keyword_pos + len(first_keyword)
+            else:
+                # No more keywords found, add the rest of the passage as the last chunk
+                chunks.append(passage[start:])
+                break
+
+        # Debugging: Print number of chunks and their content
+        print(f"Created {len(chunks)} chunks based on keywords.")
+        for i, chunk in enumerate(chunks):
+            print(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
+
+        # Step 3: Process each chunk and restore punctuation
+        final_passage = []
+        for chunk_index, chunk in enumerate(chunks):
+            print(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
+
+            # Step 3.1: Split chunk by characters (Chinese tokenization)
+            tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
+
+            # Step 3.2: Pass chunk through the tokenizer and model
+            tokd = self.tokenizer.batch_encode_plus([tokenized_chunk],
+                                                    return_tensors='pt',
+                                                    truncation=True,
+                                                    padding=True,
+                                                    max_length=self.max_length,
+                                                    is_split_into_words=True).to(DEVICE)
+
+            try:
+                # Pass it through the model
+                res = self.model(**tokd).logits
+            except Exception as e:
+                print(f"Error during model inference: {e}")
+                return []
+
+            # Argmax for classification
+            classified_targets = torch.argmax(res, dim=2).cpu()
+
+            # Initialize result tokens list for the current chunk
+            res_toks = []
+            prev_word_idx = None
+
+            # Iterate over tokenized words
+            wids = tokd.word_ids(0)
+            for indx, elem in enumerate(wids):
+                if elem is None or elem == prev_word_idx:
+                    continue
+
+                prev_word_idx = elem
+                action = classified_targets[0][indx]
+
+                # Get the word corresponding to the token
+                w = tokenized_chunk[elem] # Use tokenized chunk here
+
+                # Fix one word hanging issue (if needed)
+                will_action = False
+                if indx < len(wids) - 2 and classified_targets[0][indx + 1] > 0:
+                    will_action = True
+
+                if not will_action:
+                    # Perform the edits based on model predictions
+                    if action == 1: # First capital letter
+                        w = w[0].upper() + w[1:]
+                    elif action == 2: # Add period
+                        w = w + '.'
+                    elif action == 3: # Add question mark
+                        w = w + '?'
+                    elif action == 4: # Add exclamation mark
+                        w = w + '!'
+                    elif action == 5: # Add comma
+                        w = w + ','
+
+                # Append modified word to result list
+                res_toks.append(w)
+
+            # Convert list of tokens back to string and append to final_passage
+            final_passage.append(self.tokenizer.convert_tokens_to_string(res_toks))
+
+        # Step 4: Join processed chunks together into the final passage
+        final_text = ' '.join(final_passage)
+
+        print("Text processing completed. Generating final output...")
+
+        # Optionally, tokenize the final text into sentences based on punctuation
+        try:
+            split_passage = sent_tokenize(final_text)
+        except LookupError:
+            nltk.download('punkt')
+            split_passage = sent_tokenize(final_text)
+
+        return split_passage

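A minimal usage sketch of the new class, assuming the checkpoint name from the resolver change above; __call__ takes a raw, unpunctuated Cantonese passage and returns a list of segmented utterances. The sample text is made up for illustration.

    # Hedged sketch: standalone use of the new Cantonese utterance segmenter.
    from batchalign.models import BertCantoneseUtteranceModel

    model = BertCantoneseUtteranceModel("PolyU-AngelChanLab/Cantonese-Utterance-Segmentation")
    utterances = model("我哋今日去咗公園玩呀跟住仲食咗好多嘢啦")  # made-up ASR-style passage
    for u in utterances:
        print(u)  # each element is one segmented utterance with restored punctuation
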
batchalign/models/whisper/infer_asr.py
@@ -33,6 +33,7 @@ import pycountry
 import logging
 L = logging.getLogger("batchalign")

+# DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 # DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
 # PYTORCH_ENABLE_MPS_FALLBACK=1

batchalign/pipelines/asr/rev.py
@@ -10,7 +10,7 @@ from batchalign.utils.config import config_read

 from batchalign.errors import *

-from batchalign.models import BertUtteranceModel, resolve
+from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve

 import time
 import pathlib

batchalign/pipelines/asr/rev.py
@@ -49,7 +49,11 @@ class RevEngine(BatchalignEngine):
         self.__client = apiclient.RevAiAPIClient(key)
         if resolve("utterance", lang) != None:
             L.debug("Initializing utterance model...")
-            self.__engine = BertUtteranceModel(resolve("utterance", lang))
+            if lang != "yue":
+                self.__engine = BertUtteranceModel(resolve("utterance", lang))
+            else:
+                # we have special inference procedure for cantonese
+                self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
             L.debug("Done.")
         else:
             self.__engine = None

batchalign/pipelines/asr/utils.py
@@ -94,7 +94,10 @@ def retokenize_with_engine(intermediate_output, engine):
         tmp = []

         for s in new_ut:
-            tmp.append((s, utterance.pop(0)[1]))
+            try:
+                tmp.append((s, utterance.pop(0)[1]))
+            except IndexError:
+                continue

         final_outputs.append((speaker, tmp+[[delim, [None, None]]]))

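The guard above matters when the utterance-segmentation engine emits more tokens than there are timestamped words left to pop; a small illustrative sketch with hypothetical data, not batchalign code:

    # Hedged sketch of the failure mode the new try/except absorbs.
    words_with_times = [("你好", [0, 500])]   # timestamped words remaining for this utterance
    retokenized = ["你", "好", "。"]           # the segmenter may return more units than timestamps
    tmp = []
    for s in retokenized:
        try:
            tmp.append((s, words_with_times.pop(0)[1]))
        except IndexError:
            continue  # extra units simply get no timing instead of crashing
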
batchalign/pipelines/asr/utils.py
@@ -159,7 +162,7 @@ def process_generation(output, lang="eng", utterance_engine=None):
             final_words.append([part.strip(), [cur, cur+div]])
             cur += div

-    lang_2 = pycountry.languages.get(alpha_3=lang).alpha_2
+    lang_2 = "yue" if lang == "yue" else pycountry.languages.get(alpha_3=lang).alpha_2
     def catched_num2words(i):
         if not i.isdigit():
             return i

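The special case above is presumably needed because Cantonese ("yue") has an ISO 639-3 code but no two-letter ISO 639-1 code, so the pycountry record has no alpha_2 attribute and the old line would raise AttributeError. A hedged sketch:

    import pycountry

    lang = "yue"
    lang_2 = "yue" if lang == "yue" else pycountry.languages.get(alpha_3=lang).alpha_2
    print(lang_2)  # -> "yue"
    print(hasattr(pycountry.languages.get(alpha_3="yue"), "alpha_2"))  # -> False (no ISO 639-1 code)
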
batchalign/pipelines/asr/whisper.py
@@ -1,7 +1,7 @@
 from batchalign.document import *
 from batchalign.pipelines.base import *
 from batchalign.pipelines.asr.utils import *
-from batchalign.models import WhisperASRModel, BertUtteranceModel
+from batchalign.models import WhisperASRModel, BertUtteranceModel, BertCantoneseUtteranceModel

 import pycountry

batchalign/pipelines/asr/whisper.py
@@ -44,7 +44,11 @@ class WhisperEngine(BatchalignEngine):

         if resolve("utterance", self.__lang) != None:
             L.debug("Initializing utterance model...")
-            self.__engine = BertUtteranceModel(resolve("utterance", lang))
+            if lang != "yue":
+                self.__engine = BertUtteranceModel(resolve("utterance", lang))
+            else:
+                # we have special inference procedure for cantonese
+                self.__engine = BertCantoneseUtteranceModel(resolve("utterance", lang))
             L.debug("Done.")
         else:
             self.__engine = None

batchalign/pipelines/fa/wave2vec_fa.py
@@ -154,9 +154,9 @@ class Wave2VecFAEngine(BatchalignEngine):
                 if '\x15' not in ut.text:
                     ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
                 else:
-                    ut.text = re.sub("\x15\d+_\d+\x15",
+                    ut.text = re.sub(r"\x15\d+_\d+\x15",
                                      f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
             elif ut.text != None:
-                ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
+                ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()

         return doc

batchalign/pipelines/fa/whisper_fa.py
@@ -179,9 +179,9 @@ class WhisperFAEngine(BatchalignEngine):
                 if '\x15' not in ut.text:
                     ut.text = (ut.text+f" \x15{ut.alignment[0]}_{ut.alignment[1]}\x15").strip()
                 else:
-                    ut.text = re.sub("\x15\d+_\d+\x15",
+                    ut.text = re.sub(r"\x15\d+_\d+\x15",
                                      f"\x15{ut.alignment[0]}_{ut.alignment[1]}\x15", ut.text).strip()
             elif ut.text != None:
-                ut.text = re.sub("\x15\d+_\d+\x15", f"", ut.text).strip()
+                ut.text = re.sub(r"\x15\d+_\d+\x15", f"", ut.text).strip()

         return doc

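The two forced-alignment hunks above only add an r prefix: in a regular string literal "\d" is an invalid escape sequence that newer Python versions warn about, while the regex engine reads "\x15" and "\d" the same way inside a raw string, so the behavior is unchanged. A small sketch:

    # Hedged sketch: both patterns strip the same \x15-delimited time codes; the raw string only
    # avoids Python's invalid-escape warning for "\d" in a normal string literal.
    import re

    text = "hello world \x15100_2350\x15"
    print(re.sub(r"\x15\d+_\d+\x15", "", text).strip())  # -> "hello world"
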
batchalign.egg-info/SOURCES.txt
@@ -40,6 +40,7 @@ batchalign/models/training/__init__.py
 batchalign/models/training/run.py
 batchalign/models/training/utils.py
 batchalign/models/utterance/__init__.py
+batchalign/models/utterance/cantonese_infer.py
 batchalign/models/utterance/dataset.py
 batchalign/models/utterance/execute.py
 batchalign/models/utterance/infer.py