lattifai 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -25
- lattifai/alignment/lattice1_aligner.py +12 -9
- lattifai/alignment/lattice1_worker.py +124 -155
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +23 -179
- lattifai/audio2.py +1 -1
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/diarization.py +3 -1
- lattifai/cli/transcribe.py +3 -8
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +96 -47
- lattifai/config/alignment.py +2 -2
- lattifai/config/client.py +5 -0
- lattifai/mixin.py +17 -8
- lattifai/utils.py +40 -4
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +331 -48
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +24 -23
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,219 @@
+import re
+from typing import List, Optional
+
+from lattifai.caption import Supervision
+from lattifai.utils import _resolve_model_path
+
+END_PUNCTUATION = '.!?"]。!?”】'
+
+
+class SentenceSplitter:
+    """Lazy-initialized sentence splitter using wtpsplit."""
+
+    def __init__(self, device: str = "cpu", model_hub: Optional[str] = None, lazy_init: bool = True):
+        """Initialize sentence splitter with lazy loading.
+
+        Args:
+            device: Device to run the model on (cpu, cuda, mps)
+            model_hub: Model hub to use (None for huggingface, "modelscope" for modelscope)
+        """
+        self.device = device
+        self.model_hub = model_hub
+        if lazy_init:
+            self._splitter = None
+        else:
+            self._init_splitter()
+
+    def _init_splitter(self):
+        """Initialize the sentence splitter model on first use."""
+        if self._splitter is not None:
+            return
+
+        import onnxruntime as ort
+        from wtpsplit import SaT
+
+        providers = []
+        device = self.device
+        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
+            providers.append("CUDAExecutionProvider")
+        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
+            providers.append("MPSExecutionProvider")
+
+        if self.model_hub == "modelscope":
+            downloaded_path = _resolve_model_path("LattifAI/OmniTokenizer", model_hub="modelscope")
+            sat = SaT(
+                f"{downloaded_path}/sat-3l-sm",
+                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
+        else:
+            sat_path = _resolve_model_path("segment-any-text/sat-3l-sm", model_hub="huggingface")
+            sat = SaT(
+                sat_path,
+                tokenizer_name_or_path="facebookAI/xlm-roberta-base",
+                hub_prefix="segment-any-text",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
+        self._splitter = sat
+
+    @staticmethod
+    def _resplit_special_sentence_types(sentence: str) -> List[str]:
+        """
+        Re-split special sentence types.
+
+        Examples:
+            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
+            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
+
+        Special handling patterns:
+        1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
+        2. Use speaker marks (>> or other separators) as split points
+
+        Args:
+            sentence: Input sentence string
+
+        Returns:
+            List of re-split sentences. If no special marks are found, returns the original sentence in a list
+        """
+        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
+        # or other forms like [SOMETHING] SPEAKER:
+
+        # Pattern 1: [mark] HTML-encoded separator speaker:
+        pattern1 = r"^(\[[^\]]+\])\s+(>>|&gt;&gt;)\s+(.+)$"
+        match1 = re.match(pattern1, sentence.strip())
+        if match1:
+            special_mark = match1.group(1)
+            separator = match1.group(2)
+            speaker_part = match1.group(3)
+            return [special_mark, f"{separator} {speaker_part}"]
+
+        # Pattern 2: [mark] speaker:
+        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
+        match2 = re.match(pattern2, sentence.strip())
+        if match2:
+            special_mark = match2.group(1)
+            speaker_label = match2.group(2)
+            remaining = match2.group(3).strip()
+            if remaining:
+                return [special_mark, f"{speaker_label} {remaining}"]
+            else:
+                return [special_mark, speaker_label]
+
+        # If no special pattern matches, return the original sentence
+        return [sentence]
+
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
+        """Split supervisions into sentences using the sentence splitter.
+
+        Careful about speaker changes.
+
+        Args:
+            supervisions: List of Supervision objects to split
+            strip_whitespace: Whether to strip whitespace from split sentences
+
+        Returns:
+            List of Supervision objects with split sentences
+        """
+        self._init_splitter()
+
+        texts, speakers = [], []
+        text_len, sidx = 0, 0
+
+        def flush_segment(end_idx: int, speaker: Optional[str] = None):
+            """Flush accumulated text from sidx to end_idx with given speaker."""
+            nonlocal text_len, sidx
+            if sidx <= end_idx:
+                if len(speakers) < len(texts) + 1:
+                    speakers.append(speaker)
+                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
+                texts.append(text)
+                sidx = end_idx + 1
+                text_len = 0
+
+        for s, supervision in enumerate(supervisions):
+            text_len += len(supervision.text)
+            is_last = s == len(supervisions) - 1
+
+            if supervision.speaker:
+                # Flush previous segment without speaker (if any)
+                if sidx < s:
+                    flush_segment(s - 1, None)
+                    text_len = len(supervision.text)
+
+                # Check if we should flush this speaker's segment now
+                next_has_speaker = not is_last and supervisions[s + 1].speaker
+                if is_last or next_has_speaker:
+                    flush_segment(s, supervision.speaker)
+                else:
+                    speakers.append(supervision.speaker)
+
+            elif text_len >= 2000 or is_last:
+                flush_segment(s, None)
+
+        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
+        sentences = self._splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
+
+        supervisions, remainder = [], ""
+        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
+            # Prepend remainder from previous iteration to the first sentence
+            if _sentences and remainder:
+                _sentences[0] = remainder + _sentences[0]
+                remainder = ""
+
+            if not _sentences:
+                continue
+
+            # Process and re-split special sentence types
+            processed_sentences = []
+            for s, _sentence in enumerate(_sentences):
+                if remainder:
+                    _sentence = remainder + _sentence
+                    remainder = ""
+                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
+                resplit_parts = self._resplit_special_sentence_types(_sentence)
+                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
+                    if s < len(_sentences) - 1:
+                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
+                    else:  # last part
+                        remainder = resplit_parts[-1] + " "
+                    processed_sentences.extend(resplit_parts[:-1])
+                else:
+                    processed_sentences.extend(resplit_parts)
+            _sentences = processed_sentences
+
+            if not _sentences:
+                if remainder:
+                    _sentences, remainder = [remainder.strip()], ""
+                else:
+                    continue
+
+            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
+                supervisions.extend(
+                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
+                )
+                _speaker = None  # reset speaker after use
+            else:
+                supervisions.extend(
+                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
+                    for s, text in enumerate(_sentences[:-1])
+                )
+                remainder = _sentences[-1] + " " + remainder
+                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
+                    supervisions.append(
+                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
+                    )
+                    remainder = ""
+                elif len(_sentences) == 1:
+                    if k == len(speakers) - 1:
+                        pass  # keep _speaker for the last supervision
+                    else:
+                        assert speakers[k + 1] is None
+                        speakers[k + 1] = _speaker
+                else:
+                    assert len(_sentences) > 1
+                    _speaker = None  # reset speaker if sentence not ended
+
+        if remainder.strip():
+            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
+
+        return supervisions
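For orientation, a minimal usage sketch of the new class (the texts are made up; it assumes wtpsplit and onnxruntime are installed and that the SaT checkpoint can be resolved or downloaded on first use, as the code above implies):

```python
from lattifai.alignment.sentence_splitter import SentenceSplitter
from lattifai.caption import Supervision

splitter = SentenceSplitter(device="cpu")  # model is loaded lazily on the first call
supervisions = [
    Supervision(text="[APPLAUSE] >> MIRA MURATI: thank you all for coming today we", speaker=None),
    Supervision(text="are excited to share what we have been working on.", speaker=None),
]
for sup in splitter.split_sentences(supervisions):
    print(repr(sup.speaker), sup.text)
```

The splitter first merges consecutive supervisions into speaker-consistent chunks (flushing at speaker changes or roughly every 2000 characters), runs wtpsplit's SaT model over those chunks, then re-attaches speaker labels and carries unterminated fragments forward as a remainder.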
lattifai/alignment/tokenizer.py
CHANGED
@@ -4,9 +4,8 @@ import re
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
 
-import
+import numpy as np
 
-from lattifai.alignment.phonemizer import G2Phonemizer
 from lattifai.caption import Supervision
 from lattifai.caption import normalize_text as normalize_html_text
 from lattifai.errors import (
@@ -16,8 +15,10 @@ from lattifai.errors import (
     QuotaExceededError,
 )
 
+from .phonemizer import G2Phonemizer
+from .sentence_splitter import SentenceSplitter
+
 PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
-END_PUNCTUATION = '.!?"]。!?”】'
 PUNCTUATION_SPACE = PUNCTUATION + " "
 STAR_TOKEN = "※"
 
@@ -121,78 +122,17 @@ class LatticeTokenizer:
     def __init__(self, client_wrapper: Any):
         self.client_wrapper = client_wrapper
         self.model_name = ""
+        self.model_hub: Optional[str] = None
         self.words: List[str] = []
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
         self.oov_word = "<unk>"
-        self.sentence_splitter = None
+        self.sentence_splitter: Optional[SentenceSplitter] = None
         self.device = "cpu"
 
     def init_sentence_splitter(self):
-        if self.sentence_splitter is
-
-
-        import onnxruntime as ort
-        from wtpsplit import SaT
-
-        providers = []
-        device = self.device
-        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
-            providers.append("CUDAExecutionProvider")
-        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
-            providers.append("MPSExecutionProvider")
-
-        sat = SaT(
-            "sat-3l-sm",
-            ort_providers=providers + ["CPUExecutionProvider"],
-        )
-        self.sentence_splitter = sat
-
-    @staticmethod
-    def _resplit_special_sentence_types(sentence: str) -> List[str]:
-        """
-        Re-split special sentence types.
-
-        Examples:
-            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
-            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
-
-        Special handling patterns:
-        1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
-        2. Use speaker marks (>> or other separators) as split points
-
-        Args:
-            sentence: Input sentence string
-
-        Returns:
-            List of re-split sentences. If no special marks are found, returns the original sentence in a list
-        """
-        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
-        # or other forms like [SOMETHING] SPEAKER:
-
-        # Pattern 1: [mark] HTML-encoded separator speaker:
-        pattern1 = r"^(\[[^\]]+\])\s+(>>|&gt;&gt;)\s+(.+)$"
-        match1 = re.match(pattern1, sentence.strip())
-        if match1:
-            special_mark = match1.group(1)
-            separator = match1.group(2)
-            speaker_part = match1.group(3)
-            return [special_mark, f"{separator} {speaker_part}"]
-
-        # Pattern 2: [mark] speaker:
-        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
-        match2 = re.match(pattern2, sentence.strip())
-        if match2:
-            special_mark = match2.group(1)
-            speaker_label = match2.group(2)
-            remaining = match2.group(3).strip()
-            if remaining:
-                return [special_mark, f"{speaker_label} {remaining}"]
-            else:
-                return [special_mark, speaker_label]
-
-        # If no special pattern matches, return the original sentence
-        return [sentence]
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(device=self.device, model_hub=self.model_hub)
 
     @classmethod
     def from_pretrained(
@@ -200,6 +140,7 @@ class LatticeTokenizer:
         client_wrapper: Any,
         model_path: str,
         model_name: str,
+        model_hub: Optional[str] = None,
         device: str = "cpu",
         compressed: bool = True,
     ) -> TokenizerT:
@@ -227,6 +168,7 @@ class LatticeTokenizer:
 
         tokenizer = cls(client_wrapper=client_wrapper)
         tokenizer.model_name = model_name
+        tokenizer.model_hub = model_hub
         tokenizer.words = data["words"]
         tokenizer.dictionaries = defaultdict(list, data["dictionaries"])
         tokenizer.oov_word = data["oov_word"]
@@ -295,116 +237,16 @@ class LatticeTokenizer:
 
         return {}
 
-    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
         """Split supervisions into sentences using the sentence splitter.
 
-
+        Careful about speaker changes.
         """
-
-
-
-        def flush_segment(end_idx: int, speaker: Optional[str] = None):
-            """Flush accumulated text from sidx to end_idx with given speaker."""
-            nonlocal text_len, sidx
-            if sidx <= end_idx:
-                if len(speakers) < len(texts) + 1:
-                    speakers.append(speaker)
-                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
-                texts.append(text)
-                sidx = end_idx + 1
-                text_len = 0
-
-        for s, supervision in enumerate(supervisions):
-            text_len += len(supervision.text)
-            is_last = s == len(supervisions) - 1
-
-            if supervision.speaker:
-                # Flush previous segment without speaker (if any)
-                if sidx < s:
-                    flush_segment(s - 1, None)
-                    text_len = len(supervision.text)
-
-                # Check if we should flush this speaker's segment now
-                next_has_speaker = not is_last and supervisions[s + 1].speaker
-                if is_last or next_has_speaker:
-                    flush_segment(s, supervision.speaker)
-                else:
-                    speakers.append(supervision.speaker)
-
-            elif text_len >= 2000 or is_last:
-                flush_segment(s, None)
-
-        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
-        sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
-
-        supervisions, remainder = [], ""
-        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
-            # Prepend remainder from previous iteration to the first sentence
-            if _sentences and remainder:
-                _sentences[0] = remainder + _sentences[0]
-                remainder = ""
-
-            if not _sentences:
-                continue
-
-            # Process and re-split special sentence types
-            processed_sentences = []
-            for s, _sentence in enumerate(_sentences):
-                if remainder:
-                    _sentence = remainder + _sentence
-                    remainder = ""
-                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
-                resplit_parts = self._resplit_special_sentence_types(_sentence)
-                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
-                    if s < len(_sentences) - 1:
-                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
-                    else:  # last part
-                        remainder = resplit_parts[-1] + " "
-                    processed_sentences.extend(resplit_parts[:-1])
-                else:
-                    processed_sentences.extend(resplit_parts)
-            _sentences = processed_sentences
-
-            if not _sentences:
-                if remainder:
-                    _sentences, remainder = [remainder.strip()], ""
-                else:
-                    continue
-
-            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
-                )
-                _speaker = None  # reset speaker after use
-            else:
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
-                    for s, text in enumerate(_sentences[:-1])
-                )
-                remainder = _sentences[-1] + " " + remainder
-                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
-                    supervisions.append(
-                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
-                    )
-                    remainder = ""
-                elif len(_sentences) == 1:
-                    if k == len(speakers) - 1:
-                        pass  # keep _speaker for the last supervision
-                    else:
-                        assert speakers[k + 1] is None
-                        speakers[k + 1] = _speaker
-                else:
-                    assert len(_sentences) > 1
-                    _speaker = None  # reset speaker if sentence not ended
-
-        if remainder.strip():
-            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
-
-        return supervisions
+        self.init_sentence_splitter()
+        return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
 
     def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
         if split_sentence:
-            self.init_sentence_splitter()
             supervisions = self.split_sentences(supervisions)
 
         pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
@@ -431,7 +273,7 @@ class LatticeTokenizer:
     def detokenize(
         self,
         lattice_id: str,
-        lattice_results: Tuple[
+        lattice_results: Tuple[np.ndarray, Any, Any, float, float],
         supervisions: List[Supervision],
         return_details: bool = False,
         start_margin: float = 0.08,
@@ -481,7 +323,7 @@
 
 def _add_confidence_scores(
     supervisions: List[Supervision],
-    emission:
+    emission: np.ndarray,
     labels: List[int],
     frame_shift: float,
     offset: float = 0.0,
@@ -499,17 +341,17 @@ def _add_confidence_scores(
         labels: Token labels corresponding to aligned tokens
         frame_shift: Frame shift in seconds for converting frames to time
     """
-    tokens =
+    tokens = np.array(labels, dtype=np.int64)
 
     for supervision in supervisions:
         start_frame = int((supervision.start - offset) / frame_shift)
         end_frame = int((supervision.end - offset) / frame_shift)
 
         # Compute segment-level confidence
-        probabilities = emission[0, start_frame:end_frame]
+        probabilities = np.exp(emission[0, start_frame:end_frame])
         aligned = probabilities[range(0, end_frame - start_frame), tokens[start_frame:end_frame]]
-        diffprobs =
-        supervision.score = round(1.0 - diffprobs.mean()
+        diffprobs = np.max(probabilities, axis=-1) - aligned
+        supervision.score = round(1.0 - diffprobs.mean(), ndigits=4)
 
         # Compute word-level confidence if alignment exists
         if hasattr(supervision, "alignment") and supervision.alignment:
@@ -517,7 +359,7 @@ def _add_confidence_scores(
             for w, item in enumerate(words):
                 start = int((item.start - offset) / frame_shift) - start_frame
                 end = int((item.end - offset) / frame_shift) - start_frame
-                words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean()
+                words[w] = item._replace(score=round(1.0 - diffprobs[start:end].mean(), ndigits=4))
 
 
 def _update_alignments_speaker(supervisions: List[Supervision], alignments: List[Supervision]) -> List[Supervision]:
@@ -539,6 +381,7 @@ def _load_tokenizer(
     model_name: str,
     device: str,
     *,
+    model_hub: Optional[str] = None,
     tokenizer_cls: Type[LatticeTokenizer] = LatticeTokenizer,
 ) -> LatticeTokenizer:
     """Instantiate tokenizer with consistent error handling."""
@@ -546,5 +389,6 @@ def _load_tokenizer(
         client_wrapper=client_wrapper,
         model_path=model_path,
         model_name=model_name,
+        model_hub=model_hub,
         device=device,
     )
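The alignment confidence computation now runs on NumPy (log-probability emissions, `np.exp`, `np.max`). As a rough standalone illustration of the per-segment score introduced above (toy shapes and random data, not the package's actual call path; it assumes `emission` holds log-probabilities of shape (1, frames, vocab)):

```python
import numpy as np

# toy log-probability emission: 1 utterance, 6 frames, 5 token classes
rng = np.random.default_rng(0)
emission = np.log(rng.dirichlet(np.ones(5), size=(1, 6)))
labels = [1, 1, 3, 3, 4, 0]  # aligned token id per frame
tokens = np.array(labels, dtype=np.int64)

start_frame, end_frame = 0, 6
probabilities = np.exp(emission[0, start_frame:end_frame])
aligned = probabilities[range(0, end_frame - start_frame), tokens[start_frame:end_frame]]
# per-frame gap between the most probable token and the aligned token
diffprobs = np.max(probabilities, axis=-1) - aligned
score = round(1.0 - diffprobs.mean(), ndigits=4)  # 1.0 when the aligned token is always the argmax
print(score)
```

The same `diffprobs` array is sliced per word to produce the word-level scores.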
lattifai/audio2.py
CHANGED
@@ -36,7 +36,7 @@ class AudioData(namedtuple("AudioData", ["sampling_rate", "ndarray", "path", "st
     @property
     def streaming_mode(self) -> bool:
         """Indicates whether streaming mode is enabled based on streaming_chunk_secs."""
-        if self.streaming_chunk_secs
+        if self.streaming_chunk_secs:
             return self.duration > self.streaming_chunk_secs * 1.1
         return False
 
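The repaired guard treats a falsy `streaming_chunk_secs` (None or 0) as "streaming disabled"; a toy restatement of the property's logic (standalone function, not the real `AudioData`):

```python
def streaming_mode(duration: float, streaming_chunk_secs) -> bool:
    # mirrors AudioData.streaming_mode: a falsy chunk size disables streaming
    if streaming_chunk_secs:
        return duration > streaming_chunk_secs * 1.1
    return False

print(streaming_mode(600.0, 30))    # True: file is much longer than one chunk
print(streaming_mode(600.0, None))  # False: chunking disabled
print(streaming_mode(20.0, 30))     # False: shorter than ~1.1x the chunk size
```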
lattifai/caption/caption.py
CHANGED
@@ -467,7 +467,6 @@ class Caption:
                 sup_dict = sup.to_dict()
                 json_data.append(sup_dict)
             json.dump(json_data, f, ensure_ascii=False, indent=4)
-
         elif str(output_path).lower().endswith(".textgrid"):
             from tgt import Interval, IntervalTier, TextGrid, write_to_file
 
@@ -506,7 +505,6 @@ class Caption:
             tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
 
             write_to_file(tg, output_path, format="long")
-
         elif str(output_path)[-4:].lower() == ".tsv":
             cls._write_tsv(alignments, output_path, include_speaker_in_text)
         elif str(output_path)[-4:].lower() == ".csv":