lattifai-1.2.0-py3-none-any.whl → lattifai-1.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/lattice1_aligner.py +1 -1
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +10 -181
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/transcribe.py +3 -8
- lattifai/client.py +91 -47
- lattifai/config/alignment.py +2 -2
- lattifai/mixin.py +10 -4
- lattifai/utils.py +74 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +2 -1
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +19 -18
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
lattifai/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-import sys
 import warnings
 from importlib.metadata import version
 
@@ -52,29 +51,6 @@ except Exception:
     __version__ = "0.1.0"  # fallback version
 
 
-# Check and auto-install k2py if not present
-def _check_and_install_k2py():
-    """Check if k2py is installed and attempt to install it if not."""
-    try:
-        import k2py
-    except ImportError:
-        import subprocess
-
-        print("k2py is not installed. Attempting to install k2py...")
-        try:
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "k2py"])
-            import k2py  # Try importing again after installation
-
-            print("k2py installed successfully.")
-        except Exception as e:
-            warnings.warn(f"Failed to install k2py automatically. Please install it manually. Error: {e}")
-    return True
-
-
-# Auto-install k2py on first import
-_check_and_install_k2py()
-
-
 __all__ = [
     # Client classes
     "LattifAI",
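With the auto-install hook removed, importing lattifai no longer attempts `pip install k2py`; as the lattice1_worker.py hunk further down shows, the alignment worker now imports k2py directly, so a missing dependency surfaces as a plain ImportError at alignment time. A minimal, hypothetical pre-flight check for calling code (not part of the package; the install command is taken from the removed DependencyError):

# Hypothetical guard in user code: lattifai 1.2.1 no longer installs k2py
# on import, so fail early with a clear message before running alignment.
try:
    import k2py  # noqa: F401
except ImportError as exc:
    raise SystemExit("k2py is required for alignment: pip install k2py") from exc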
lattifai/alignment/lattice1_aligner.py
CHANGED
@@ -117,7 +117,7 @@ class Lattice1Aligner(object):
 
         if verbose:
             safe_print(colorful.cyan(f"🔍 Step 3: Searching lattice graph with media: {audio}"))
-            if audio.
+            if audio.streaming_mode:
                 safe_print(
                     colorful.yellow(
                         f" ⚡Using streaming mode with {audio.streaming_chunk_secs}s (chunk duration)"
lattifai/alignment/lattice1_worker.py
CHANGED
@@ -7,8 +7,6 @@ from typing import Any, Dict, Optional, Tuple
 import colorful
 import numpy as np
 import onnxruntime as ort
-from lhotse import FbankConfig
-from lhotse.features.kaldi.layers import Wav2LogFilterBank
 from lhotse.utils import Pathlike
 from tqdm import tqdm
 
@@ -159,10 +157,7 @@ class Lattice1Worker:
             DependencyError: If required dependencies are missing
             AlignmentError: If alignment process fails
         """
-        try:
-            import k2py as k2
-        except ImportError:
-            raise DependencyError("k2py", install_command="pip install k2py")
+        import k2py as k2
 
         lattice_graph_str, final_state, acoustic_scale = lattice_graph
 
lattifai/alignment/sentence_splitter.py
ADDED
@@ -0,0 +1,219 @@
+import re
+from typing import List, Optional
+
+from lattifai.caption import Supervision
+from lattifai.utils import _resolve_model_path
+
+END_PUNCTUATION = '.!?"]。!?"】'
+
+
+class SentenceSplitter:
+    """Lazy-initialized sentence splitter using wtpsplit."""
+
+    def __init__(self, device: str = "cpu", model_hub: Optional[str] = None, lazy_init: bool = True):
+        """Initialize sentence splitter with lazy loading.
+
+        Args:
+            device: Device to run the model on (cpu, cuda, mps)
+            model_hub: Model hub to use (None for huggingface, "modelscope" for modelscope)
+        """
+        self.device = device
+        self.model_hub = model_hub
+        if lazy_init:
+            self._splitter = None
+        else:
+            self._init_splitter()
+
+    def _init_splitter(self):
+        """Initialize the sentence splitter model on first use."""
+        if self._splitter is not None:
+            return
+
+        import onnxruntime as ort
+        from wtpsplit import SaT
+
+        providers = []
+        device = self.device
+        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
+            providers.append("CUDAExecutionProvider")
+        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
+            providers.append("MPSExecutionProvider")
+
+        if self.model_hub == "modelscope":
+            downloaded_path = _resolve_model_path("LattifAI/OmniTokenizer", model_hub="modelscope")
+            sat = SaT(
+                f"{downloaded_path}/sat-3l-sm",
+                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
+        else:
+            sat_path = _resolve_model_path("segment-any-text/sat-3l-sm", model_hub="huggingface")
+            sat = SaT(
+                sat_path,
+                tokenizer_name_or_path="facebookAI/xlm-roberta-base",
+                hub_prefix="segment-any-text",
+                ort_providers=providers + ["CPUExecutionProvider"],
+            )
+        self._splitter = sat
+
+    @staticmethod
+    def _resplit_special_sentence_types(sentence: str) -> List[str]:
+        """
+        Re-split special sentence types.
+
+        Examples:
+            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
+            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
+
+        Special handling patterns:
+            1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
+            2. Use speaker marks (>> or other separators) as split points
+
+        Args:
+            sentence: Input sentence string
+
+        Returns:
+            List of re-split sentences. If no special marks are found, returns the original sentence in a list
+        """
+        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
+        # or other forms like [SOMETHING] SPEAKER:
+
+        # Pattern 1: [mark] HTML-encoded separator speaker:
+        pattern1 = r"^(\[[^\]]+\])\s+(>>|>>)\s+(.+)$"
+        match1 = re.match(pattern1, sentence.strip())
+        if match1:
+            special_mark = match1.group(1)
+            separator = match1.group(2)
+            speaker_part = match1.group(3)
+            return [special_mark, f"{separator} {speaker_part}"]
+
+        # Pattern 2: [mark] speaker:
+        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
+        match2 = re.match(pattern2, sentence.strip())
+        if match2:
+            special_mark = match2.group(1)
+            speaker_label = match2.group(2)
+            remaining = match2.group(3).strip()
+            if remaining:
+                return [special_mark, f"{speaker_label} {remaining}"]
+            else:
+                return [special_mark, speaker_label]
+
+        # If no special pattern matches, return the original sentence
+        return [sentence]
+
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
+        """Split supervisions into sentences using the sentence splitter.
+
+        Careful about speaker changes.
+
+        Args:
+            supervisions: List of Supervision objects to split
+            strip_whitespace: Whether to strip whitespace from split sentences
+
+        Returns:
+            List of Supervision objects with split sentences
+        """
+        self._init_splitter()
+
+        texts, speakers = [], []
+        text_len, sidx = 0, 0
+
+        def flush_segment(end_idx: int, speaker: Optional[str] = None):
+            """Flush accumulated text from sidx to end_idx with given speaker."""
+            nonlocal text_len, sidx
+            if sidx <= end_idx:
+                if len(speakers) < len(texts) + 1:
+                    speakers.append(speaker)
+                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
+                texts.append(text)
+                sidx = end_idx + 1
+            text_len = 0
+
+        for s, supervision in enumerate(supervisions):
+            text_len += len(supervision.text)
+            is_last = s == len(supervisions) - 1
+
+            if supervision.speaker:
+                # Flush previous segment without speaker (if any)
+                if sidx < s:
+                    flush_segment(s - 1, None)
+                    text_len = len(supervision.text)
+
+                # Check if we should flush this speaker's segment now
+                next_has_speaker = not is_last and supervisions[s + 1].speaker
+                if is_last or next_has_speaker:
+                    flush_segment(s, supervision.speaker)
+                else:
+                    speakers.append(supervision.speaker)
+
+            elif text_len >= 2000 or is_last:
+                flush_segment(s, None)
+
+        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
+        sentences = self._splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
+
+        supervisions, remainder = [], ""
+        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
+            # Prepend remainder from previous iteration to the first sentence
+            if _sentences and remainder:
+                _sentences[0] = remainder + _sentences[0]
+                remainder = ""
+
+            if not _sentences:
+                continue
+
+            # Process and re-split special sentence types
+            processed_sentences = []
+            for s, _sentence in enumerate(_sentences):
+                if remainder:
+                    _sentence = remainder + _sentence
+                    remainder = ""
+                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
+                resplit_parts = self._resplit_special_sentence_types(_sentence)
+                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
+                    if s < len(_sentences) - 1:
+                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
+                    else:  # last part
+                        remainder = resplit_parts[-1] + " "
+                    processed_sentences.extend(resplit_parts[:-1])
+                else:
+                    processed_sentences.extend(resplit_parts)
+            _sentences = processed_sentences
+
+            if not _sentences:
+                if remainder:
+                    _sentences, remainder = [remainder.strip()], ""
+                else:
+                    continue
+
+            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
+                supervisions.extend(
+                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
+                )
+                _speaker = None  # reset speaker after use
+            else:
+                supervisions.extend(
+                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
+                    for s, text in enumerate(_sentences[:-1])
+                )
+                remainder = _sentences[-1] + " " + remainder
+                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
+                    supervisions.append(
+                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
+                    )
+                    remainder = ""
+                elif len(_sentences) == 1:
+                    if k == len(speakers) - 1:
+                        pass  # keep _speaker for the last supervision
+                    else:
+                        assert speakers[k + 1] is None
+                        speakers[k + 1] = _speaker
+                else:
+                    assert len(_sentences) > 1
+                    _speaker = None  # reset speaker if sentence not ended
+
+        if remainder.strip():
+            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
+
+        return supervisions
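For orientation, a minimal usage sketch of the new module, using only the constructor and split_sentences signatures visible in this diff; the Supervision texts below are invented examples, and the first call downloads the wtpsplit SaT model:

# Hypothetical example, not from the package docs: split raw caption chunks
# into sentence-level Supervisions while preserving the speaker tag.
from lattifai.alignment.sentence_splitter import SentenceSplitter
from lattifai.caption import Supervision

splitter = SentenceSplitter(device="cpu")  # lazy_init=True: SaT loads on first use
chunks = [
    Supervision(text=">> ALICE: Hello everyone. Thanks for joining", speaker="ALICE"),
    Supervision(text="the call today. Let's get started.", speaker=None),
]
for sup in splitter.split_sentences(chunks, strip_whitespace=True):
    print(sup.speaker, "|", sup.text)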
lattifai/alignment/tokenizer.py
CHANGED
@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
 
 import numpy as np
 
-from lattifai.alignment.phonemizer import G2Phonemizer
 from lattifai.caption import Supervision
 from lattifai.caption import normalize_text as normalize_html_text
 from lattifai.errors import (
@@ -16,8 +15,10 @@ from lattifai.errors import (
     QuotaExceededError,
 )
 
+from .phonemizer import G2Phonemizer
+from .sentence_splitter import SentenceSplitter
+
 PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
-END_PUNCTUATION = '.!?"]。!?”】'
 PUNCTUATION_SPACE = PUNCTUATION + " "
 STAR_TOKEN = "※"
 
@@ -126,84 +127,12 @@ class LatticeTokenizer:
         self.g2p_model: Any = None  # Placeholder for G2P model
         self.dictionaries = defaultdict(lambda: [])
         self.oov_word = "<unk>"
-        self.sentence_splitter = None
+        self.sentence_splitter: Optional[SentenceSplitter] = None
         self.device = "cpu"
 
     def init_sentence_splitter(self):
-        if self.sentence_splitter is
-
-
-        import onnxruntime as ort
-        from wtpsplit import SaT
-
-        providers = []
-        device = self.device
-        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
-            providers.append("CUDAExecutionProvider")
-        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
-            providers.append("MPSExecutionProvider")
-
-        if self.model_hub == "modelscope":
-            from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
-
-            downloaded_path = ms_snapshot("LattifAI/OmniTokenizer")
-            sat = SaT(
-                f"{downloaded_path}/sat-3l-sm",
-                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        else:
-            sat = SaT(
-                "sat-3l-sm",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        self.sentence_splitter = sat
-
-    @staticmethod
-    def _resplit_special_sentence_types(sentence: str) -> List[str]:
-        """
-        Re-split special sentence types.
-
-        Examples:
-            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
-            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
-
-        Special handling patterns:
-            1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
-            2. Use speaker marks (>> or other separators) as split points
-
-        Args:
-            sentence: Input sentence string
-
-        Returns:
-            List of re-split sentences. If no special marks are found, returns the original sentence in a list
-        """
-        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
-        # or other forms like [SOMETHING] SPEAKER:
-
-        # Pattern 1: [mark] HTML-encoded separator speaker:
-        pattern1 = r"^(\[[^\]]+\])\s+(>>|>>)\s+(.+)$"
-        match1 = re.match(pattern1, sentence.strip())
-        if match1:
-            special_mark = match1.group(1)
-            separator = match1.group(2)
-            speaker_part = match1.group(3)
-            return [special_mark, f"{separator} {speaker_part}"]
-
-        # Pattern 2: [mark] speaker:
-        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
-        match2 = re.match(pattern2, sentence.strip())
-        if match2:
-            special_mark = match2.group(1)
-            speaker_label = match2.group(2)
-            remaining = match2.group(3).strip()
-            if remaining:
-                return [special_mark, f"{speaker_label} {remaining}"]
-            else:
-                return [special_mark, speaker_label]
-
-        # If no special pattern matches, return the original sentence
-        return [sentence]
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(device=self.device, model_hub=self.model_hub)
 
     @classmethod
     def from_pretrained(
@@ -308,116 +237,16 @@ class LatticeTokenizer:
 
         return {}
 
-    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
         """Split supervisions into sentences using the sentence splitter.
 
-
+        Careful about speaker changes.
         """
-        texts, speakers = [], []
-        text_len, sidx = 0, 0
-
-        def flush_segment(end_idx: int, speaker: Optional[str] = None):
-            """Flush accumulated text from sidx to end_idx with given speaker."""
-            nonlocal text_len, sidx
-            if sidx <= end_idx:
-                if len(speakers) < len(texts) + 1:
-                    speakers.append(speaker)
-                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
-                texts.append(text)
-                sidx = end_idx + 1
-            text_len = 0
-
-        for s, supervision in enumerate(supervisions):
-            text_len += len(supervision.text)
-            is_last = s == len(supervisions) - 1
-
-            if supervision.speaker:
-                # Flush previous segment without speaker (if any)
-                if sidx < s:
-                    flush_segment(s - 1, None)
-                    text_len = len(supervision.text)
-
-                # Check if we should flush this speaker's segment now
-                next_has_speaker = not is_last and supervisions[s + 1].speaker
-                if is_last or next_has_speaker:
-                    flush_segment(s, supervision.speaker)
-                else:
-                    speakers.append(supervision.speaker)
-
-            elif text_len >= 2000 or is_last:
-                flush_segment(s, None)
-
-        assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
-        sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
-
-        supervisions, remainder = [], ""
-        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
-            # Prepend remainder from previous iteration to the first sentence
-            if _sentences and remainder:
-                _sentences[0] = remainder + _sentences[0]
-                remainder = ""
-
-            if not _sentences:
-                continue
-
-            # Process and re-split special sentence types
-            processed_sentences = []
-            for s, _sentence in enumerate(_sentences):
-                if remainder:
-                    _sentence = remainder + _sentence
-                    remainder = ""
-                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
-                resplit_parts = self._resplit_special_sentence_types(_sentence)
-                if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
-                    if s < len(_sentences) - 1:
-                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
-                    else:  # last part
-                        remainder = resplit_parts[-1] + " "
-                    processed_sentences.extend(resplit_parts[:-1])
-                else:
-                    processed_sentences.extend(resplit_parts)
-            _sentences = processed_sentences
-
-            if not _sentences:
-                if remainder:
-                    _sentences, remainder = [remainder.strip()], ""
-                else:
-                    continue
-
-            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
-                )
-                _speaker = None  # reset speaker after use
-            else:
-                supervisions.extend(
-                    Supervision(text=text, speaker=(_speaker if s == 0 else None))
-                    for s, text in enumerate(_sentences[:-1])
-                )
-                remainder = _sentences[-1] + " " + remainder
-                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
-                    supervisions.append(
-                        Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
-                    )
-                    remainder = ""
-                elif len(_sentences) == 1:
-                    if k == len(speakers) - 1:
-                        pass  # keep _speaker for the last supervision
-                    else:
-                        assert speakers[k + 1] is None
-                        speakers[k + 1] = _speaker
-                else:
-                    assert len(_sentences) > 1
-                    _speaker = None  # reset speaker if sentence not ended
-
-        if remainder.strip():
-            supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
-
-        return supervisions
+        self.init_sentence_splitter()
+        return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
 
     def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
         if split_sentence:
-            self.init_sentence_splitter()
             supervisions = self.split_sentences(supervisions)
 
         pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
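The speaker-mark re-split helper removed here lives on in sentence_splitter.py above. As a standalone illustration of what those two regexes do (hypothetical helper name; patterns adapted from the diff, keeping only the plain ">>" separator), matching the docstring examples:

# Standalone sketch of the re-split behaviour documented above.
import re

def resplit(sentence: str) -> list:
    s = sentence.strip()
    # Pattern 1: "[MARK] >> SPEAKER: ..." -> split the bracketed mark off
    m = re.match(r"^(\[[^\]]+\])\s+(>>)\s+(.+)$", s)
    if m:
        return [m.group(1), f"{m.group(2)} {m.group(3)}"]
    # Pattern 2: "[MARK] SPEAKER: ..." -> same idea without the ">>" separator
    m = re.match(r"^(\[[^\]]+\])\s+([^:]+:)(.*)$", s)
    if m:
        rest = m.group(3).strip()
        return [m.group(1), f"{m.group(2)} {rest}" if rest else m.group(2)]
    return [sentence]

print(resplit("[APPLAUSE] >> MIRA MURATI: Welcome."))
# -> ['[APPLAUSE]', '>> MIRA MURATI: Welcome.']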
lattifai/caption/caption.py
CHANGED
@@ -467,7 +467,6 @@ class Caption:
                 sup_dict = sup.to_dict()
                 json_data.append(sup_dict)
             json.dump(json_data, f, ensure_ascii=False, indent=4)
-
         elif str(output_path).lower().endswith(".textgrid"):
             from tgt import Interval, IntervalTier, TextGrid, write_to_file
 
@@ -506,7 +505,6 @@ class Caption:
             tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
 
             write_to_file(tg, output_path, format="long")
-
         elif str(output_path)[-4:].lower() == ".tsv":
             cls._write_tsv(alignments, output_path, include_speaker_in_text)
         elif str(output_path)[-4:].lower() == ".csv":