lattifai 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lattifai/__init__.py CHANGED
@@ -1,5 +1,4 @@
  import os
- import sys
  import warnings
  from importlib.metadata import version
 
@@ -52,29 +51,6 @@ except Exception:
      __version__ = "0.1.0"  # fallback version
 
 
- # Check and auto-install k2py if not present
- def _check_and_install_k2py():
-     """Check if k2py is installed and attempt to install it if not."""
-     try:
-         import k2py
-     except ImportError:
-         import subprocess
-
-         print("k2py is not installed. Attempting to install k2py...")
-         try:
-             subprocess.check_call([sys.executable, "-m", "pip", "install", "k2py"])
-             import k2py  # Try importing again after installation
-
-             print("k2py installed successfully.")
-         except Exception as e:
-             warnings.warn(f"Failed to install k2py automatically. Please install it manually. Error: {e}")
-     return True
-
-
- # Auto-install k2py on first import
- _check_and_install_k2py()
-
-
  __all__ = [
      # Client classes
      "LattifAI",
@@ -117,7 +117,7 @@ class Lattice1Aligner(object):
 
          if verbose:
              safe_print(colorful.cyan(f"🔍 Step 3: Searching lattice graph with media: {audio}"))
-             if audio.streaming_chunk_secs:
+             if audio.streaming_mode:
                  safe_print(
                      colorful.yellow(
                          f" ⚡Using streaming mode with {audio.streaming_chunk_secs}s (chunk duration)"
@@ -7,8 +7,6 @@ from typing import Any, Dict, Optional, Tuple
  import colorful
  import numpy as np
  import onnxruntime as ort
- from lhotse import FbankConfig
- from lhotse.features.kaldi.layers import Wav2LogFilterBank
  from lhotse.utils import Pathlike
  from tqdm import tqdm
 
@@ -159,10 +157,7 @@ class Lattice1Worker:
              DependencyError: If required dependencies are missing
              AlignmentError: If alignment process fails
          """
-         try:
-             import k2py as k2
-         except ImportError:
-             raise DependencyError("k2py", install_command="pip install k2py")
+         import k2py as k2
 
          lattice_graph_str, final_state, acoustic_scale = lattice_graph
 
@@ -9,7 +9,7 @@ from lattifai.caption import Caption, Supervision
  from lattifai.config import AlignmentConfig
  from lattifai.utils import safe_print
 
- from .tokenizer import END_PUNCTUATION
+ from .sentence_splitter import END_PUNCTUATION
 
 
  class Segmenter:
@@ -0,0 +1,219 @@
+ import re
+ from typing import List, Optional
+
+ from lattifai.caption import Supervision
+ from lattifai.utils import _resolve_model_path
+
+ END_PUNCTUATION = '.!?"]。!?”】'
+
+
+ class SentenceSplitter:
+     """Lazy-initialized sentence splitter using wtpsplit."""
+
+     def __init__(self, device: str = "cpu", model_hub: Optional[str] = None, lazy_init: bool = True):
+         """Initialize sentence splitter with lazy loading.
+
+         Args:
+             device: Device to run the model on (cpu, cuda, mps)
+             model_hub: Model hub to use (None for huggingface, "modelscope" for modelscope)
+         """
+         self.device = device
+         self.model_hub = model_hub
+         self._splitter = None
+         # Eagerly load the model unless lazy initialization was requested
+         if not lazy_init:
+             self._init_splitter()
+
+     def _init_splitter(self):
+         """Initialize the sentence splitter model on first use."""
+         if self._splitter is not None:
+             return
+
+         import onnxruntime as ort
+         from wtpsplit import SaT
+
+         providers = []
+         device = self.device
+         if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
+             providers.append("CUDAExecutionProvider")
+         elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
+             providers.append("MPSExecutionProvider")
+
+         if self.model_hub == "modelscope":
+             downloaded_path = _resolve_model_path("LattifAI/OmniTokenizer", model_hub="modelscope")
+             sat = SaT(
+                 f"{downloaded_path}/sat-3l-sm",
+                 tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
+                 ort_providers=providers + ["CPUExecutionProvider"],
+             )
+         else:
+             sat_path = _resolve_model_path("segment-any-text/sat-3l-sm", model_hub="huggingface")
+             sat = SaT(
+                 sat_path,
+                 tokenizer_name_or_path="facebookAI/xlm-roberta-base",
+                 hub_prefix="segment-any-text",
+                 ort_providers=providers + ["CPUExecutionProvider"],
+             )
+         self._splitter = sat
+
+     @staticmethod
+     def _resplit_special_sentence_types(sentence: str) -> List[str]:
+         """
+         Re-split special sentence types.
+
+         Examples:
+             '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']
+             '[MUSIC] &gt;&gt; SPEAKER:' -> ['[MUSIC]', '&gt;&gt; SPEAKER:']
+
+         Special handling patterns:
+             1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
+             2. Use speaker marks (&gt;&gt; or other separators) as split points
+
+         Args:
+             sentence: Input sentence string
+
+         Returns:
+             List of re-split sentences. If no special marks are found, returns the original sentence in a list
+         """
+         # Detect special mark patterns: [SOMETHING] &gt;&gt; SPEAKER:
+         # or other forms like [SOMETHING] SPEAKER:
+
+         # Pattern 1: [mark] HTML-encoded separator speaker:
+         pattern1 = r"^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$"
+         match1 = re.match(pattern1, sentence.strip())
+         if match1:
+             special_mark = match1.group(1)
+             separator = match1.group(2)
+             speaker_part = match1.group(3)
+             return [special_mark, f"{separator} {speaker_part}"]
+
+         # Pattern 2: [mark] speaker:
+         pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
+         match2 = re.match(pattern2, sentence.strip())
+         if match2:
+             special_mark = match2.group(1)
+             speaker_label = match2.group(2)
+             remaining = match2.group(3).strip()
+             if remaining:
+                 return [special_mark, f"{speaker_label} {remaining}"]
+             else:
+                 return [special_mark, speaker_label]
+
+         # If no special pattern matches, return the original sentence
+         return [sentence]
+
+     def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
+         """Split supervisions into sentences using the sentence splitter.
+
+         Careful about speaker changes.
+
+         Args:
+             supervisions: List of Supervision objects to split
+             strip_whitespace: Whether to strip whitespace from split sentences
+
+         Returns:
+             List of Supervision objects with split sentences
+         """
+         self._init_splitter()
+
+         texts, speakers = [], []
+         text_len, sidx = 0, 0
+
+         def flush_segment(end_idx: int, speaker: Optional[str] = None):
+             """Flush accumulated text from sidx to end_idx with given speaker."""
+             nonlocal text_len, sidx
+             if sidx <= end_idx:
+                 if len(speakers) < len(texts) + 1:
+                     speakers.append(speaker)
+                 text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
+                 texts.append(text)
+                 sidx = end_idx + 1
+                 text_len = 0
+
+         for s, supervision in enumerate(supervisions):
+             text_len += len(supervision.text)
+             is_last = s == len(supervisions) - 1
+
+             if supervision.speaker:
+                 # Flush previous segment without speaker (if any)
+                 if sidx < s:
+                     flush_segment(s - 1, None)
+                     text_len = len(supervision.text)
+
+                 # Check if we should flush this speaker's segment now
+                 next_has_speaker = not is_last and supervisions[s + 1].speaker
+                 if is_last or next_has_speaker:
+                     flush_segment(s, supervision.speaker)
+                 else:
+                     speakers.append(supervision.speaker)
+
+             elif text_len >= 2000 or is_last:
+                 flush_segment(s, None)
+
+         assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
+         sentences = self._splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
+
+         supervisions, remainder = [], ""
+         for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
+             # Prepend remainder from previous iteration to the first sentence
+             if _sentences and remainder:
+                 _sentences[0] = remainder + _sentences[0]
+                 remainder = ""
+
+             if not _sentences:
+                 continue
+
+             # Process and re-split special sentence types
+             processed_sentences = []
+             for s, _sentence in enumerate(_sentences):
+                 if remainder:
+                     _sentence = remainder + _sentence
+                     remainder = ""
+                 # Detect and split special sentence types: e.g., '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']  # noqa: E501
+                 resplit_parts = self._resplit_special_sentence_types(_sentence)
+                 if any(resplit_parts[-1].endswith(sp) for sp in [":", "："]):
+                     if s < len(_sentences) - 1:
+                         _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
+                     else:  # last part
+                         remainder = resplit_parts[-1] + " "
+                     processed_sentences.extend(resplit_parts[:-1])
+                 else:
+                     processed_sentences.extend(resplit_parts)
+             _sentences = processed_sentences
+
+             if not _sentences:
+                 if remainder:
+                     _sentences, remainder = [remainder.strip()], ""
+                 else:
+                     continue
+
+             if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
+                 supervisions.extend(
+                     Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
+                 )
+                 _speaker = None  # reset speaker after use
+             else:
+                 supervisions.extend(
+                     Supervision(text=text, speaker=(_speaker if s == 0 else None))
+                     for s, text in enumerate(_sentences[:-1])
+                 )
+                 remainder = _sentences[-1] + " " + remainder
+                 if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
+                     supervisions.append(
+                         Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
+                     )
+                     remainder = ""
+                 elif len(_sentences) == 1:
+                     if k == len(speakers) - 1:
+                         pass  # keep _speaker for the last supervision
+                     else:
+                         assert speakers[k + 1] is None
+                         speakers[k + 1] = _speaker
+                 else:
+                     assert len(_sentences) > 1
+                     _speaker = None  # reset speaker if sentence not ended
+
+         if remainder.strip():
+             supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
+
+         return supervisions
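
The splitter is now a standalone module, reusable outside LatticeTokenizer. A minimal usage sketch, assuming the module lives under the alignment package (the diff does not show the new file's path) and that Supervision accepts `text` and `speaker` keyword arguments as the hunk above suggests:

```python
from lattifai.caption import Supervision
from lattifai.alignment.sentence_splitter import SentenceSplitter  # path assumed

supervisions = [
    Supervision(text="[APPLAUSE] >> MIRA MURATI: Welcome everyone.", speaker=None),
    Supervision(text="Today we ship a new release. Thank you", speaker="HOST"),
]

splitter = SentenceSplitter(device="cpu")  # the wtpsplit model loads on first call
for sup in splitter.split_sentences(supervisions):
    print(repr(sup.speaker), "->", sup.text)
```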
@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar
 
  import numpy as np
 
- from lattifai.alignment.phonemizer import G2Phonemizer
  from lattifai.caption import Supervision
  from lattifai.caption import normalize_text as normalize_html_text
  from lattifai.errors import (
@@ -16,8 +15,10 @@ from lattifai.errors import (
      QuotaExceededError,
  )
 
+ from .phonemizer import G2Phonemizer
+ from .sentence_splitter import SentenceSplitter
+
  PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
- END_PUNCTUATION = '.!?"]。!?”】'
  PUNCTUATION_SPACE = PUNCTUATION + " "
  STAR_TOKEN = "※"
 
@@ -126,84 +127,12 @@ class LatticeTokenizer:
          self.g2p_model: Any = None  # Placeholder for G2P model
          self.dictionaries = defaultdict(lambda: [])
          self.oov_word = "<unk>"
-         self.sentence_splitter = None
+         self.sentence_splitter: Optional[SentenceSplitter] = None
          self.device = "cpu"
 
      def init_sentence_splitter(self):
-         if self.sentence_splitter is not None:
-             return
-
-         import onnxruntime as ort
-         from wtpsplit import SaT
-
-         providers = []
-         device = self.device
-         if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
-             providers.append("CUDAExecutionProvider")
-         elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
-             providers.append("MPSExecutionProvider")
-
-         if self.model_hub == "modelscope":
-             from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot
-
-             downloaded_path = ms_snapshot("LattifAI/OmniTokenizer")
-             sat = SaT(
-                 f"{downloaded_path}/sat-3l-sm",
-                 tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
-                 ort_providers=providers + ["CPUExecutionProvider"],
-             )
-         else:
-             sat = SaT(
-                 "sat-3l-sm",
-                 ort_providers=providers + ["CPUExecutionProvider"],
-             )
-         self.sentence_splitter = sat
-
-     @staticmethod
-     def _resplit_special_sentence_types(sentence: str) -> List[str]:
-         """
-         Re-split special sentence types.
-
-         Examples:
-             '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']
-             '[MUSIC] &gt;&gt; SPEAKER:' -> ['[MUSIC]', '&gt;&gt; SPEAKER:']
-
-         Special handling patterns:
-             1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
-             2. Use speaker marks (&gt;&gt; or other separators) as split points
-
-         Args:
-             sentence: Input sentence string
-
-         Returns:
-             List of re-split sentences. If no special marks are found, returns the original sentence in a list
-         """
-         # Detect special mark patterns: [SOMETHING] &gt;&gt; SPEAKER:
-         # or other forms like [SOMETHING] SPEAKER:
-
-         # Pattern 1: [mark] HTML-encoded separator speaker:
-         pattern1 = r"^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$"
-         match1 = re.match(pattern1, sentence.strip())
-         if match1:
-             special_mark = match1.group(1)
-             separator = match1.group(2)
-             speaker_part = match1.group(3)
-             return [special_mark, f"{separator} {speaker_part}"]
-
-         # Pattern 2: [mark] speaker:
-         pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
-         match2 = re.match(pattern2, sentence.strip())
-         if match2:
-             special_mark = match2.group(1)
-             speaker_label = match2.group(2)
-             remaining = match2.group(3).strip()
-             if remaining:
-                 return [special_mark, f"{speaker_label} {remaining}"]
-             else:
-                 return [special_mark, speaker_label]
-
-         # If no special pattern matches, return the original sentence
-         return [sentence]
+         if self.sentence_splitter is None:
+             self.sentence_splitter = SentenceSplitter(device=self.device, model_hub=self.model_hub)
 
      @classmethod
      def from_pretrained(
@@ -308,116 +237,16 @@ class LatticeTokenizer:
 
          return {}
 
-     def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[str]:
+     def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
          """Split supervisions into sentences using the sentence splitter.
 
-         Carefull about speaker changes.
+         Careful about speaker changes.
          """
-         texts, speakers = [], []
-         text_len, sidx = 0, 0
-
-         def flush_segment(end_idx: int, speaker: Optional[str] = None):
-             """Flush accumulated text from sidx to end_idx with given speaker."""
-             nonlocal text_len, sidx
-             if sidx <= end_idx:
-                 if len(speakers) < len(texts) + 1:
-                     speakers.append(speaker)
-                 text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
-                 texts.append(text)
-                 sidx = end_idx + 1
-                 text_len = 0
-
-         for s, supervision in enumerate(supervisions):
-             text_len += len(supervision.text)
-             is_last = s == len(supervisions) - 1
-
-             if supervision.speaker:
-                 # Flush previous segment without speaker (if any)
-                 if sidx < s:
-                     flush_segment(s - 1, None)
-                     text_len = len(supervision.text)
-
-                 # Check if we should flush this speaker's segment now
-                 next_has_speaker = not is_last and supervisions[s + 1].speaker
-                 if is_last or next_has_speaker:
-                     flush_segment(s, supervision.speaker)
-                 else:
-                     speakers.append(supervision.speaker)
-
-             elif text_len >= 2000 or is_last:
-                 flush_segment(s, None)
-
-         assert len(speakers) == len(texts), f"len(speakers)={len(speakers)} != len(texts)={len(texts)}"
-         sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
-
-         supervisions, remainder = [], ""
-         for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
-             # Prepend remainder from previous iteration to the first sentence
-             if _sentences and remainder:
-                 _sentences[0] = remainder + _sentences[0]
-                 remainder = ""
-
-             if not _sentences:
-                 continue
-
-             # Process and re-split special sentence types
-             processed_sentences = []
-             for s, _sentence in enumerate(_sentences):
-                 if remainder:
-                     _sentence = remainder + _sentence
-                     remainder = ""
-                 # Detect and split special sentence types: e.g., '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']  # noqa: E501
-                 resplit_parts = self._resplit_special_sentence_types(_sentence)
-                 if any(resplit_parts[-1].endswith(sp) for sp in [":", "："]):
-                     if s < len(_sentences) - 1:
-                         _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
-                     else:  # last part
-                         remainder = resplit_parts[-1] + " "
-                     processed_sentences.extend(resplit_parts[:-1])
-                 else:
-                     processed_sentences.extend(resplit_parts)
-             _sentences = processed_sentences
-
-             if not _sentences:
-                 if remainder:
-                     _sentences, remainder = [remainder.strip()], ""
-                 else:
-                     continue
-
-             if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
-                 supervisions.extend(
-                     Supervision(text=text, speaker=(_speaker if s == 0 else None)) for s, text in enumerate(_sentences)
-                 )
-                 _speaker = None  # reset speaker after use
-             else:
-                 supervisions.extend(
-                     Supervision(text=text, speaker=(_speaker if s == 0 else None))
-                     for s, text in enumerate(_sentences[:-1])
-                 )
-                 remainder = _sentences[-1] + " " + remainder
-                 if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
-                     supervisions.append(
-                         Supervision(text=remainder.strip(), speaker=_speaker if len(_sentences) == 1 else None)
-                     )
-                     remainder = ""
-                 elif len(_sentences) == 1:
-                     if k == len(speakers) - 1:
-                         pass  # keep _speaker for the last supervision
-                     else:
-                         assert speakers[k + 1] is None
-                         speakers[k + 1] = _speaker
-                 else:
-                     assert len(_sentences) > 1
-                     _speaker = None  # reset speaker if sentence not ended
-
-         if remainder.strip():
-             supervisions.append(Supervision(text=remainder.strip(), speaker=_speaker))
-
-         return supervisions
+         self.init_sentence_splitter()
+         return self.sentence_splitter.split_sentences(supervisions, strip_whitespace=strip_whitespace)
 
      def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
          if split_sentence:
-             self.init_sentence_splitter()
              supervisions = self.split_sentences(supervisions)
 
          pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
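
Because split_sentences now calls init_sentence_splitter() itself, tokenize drops its explicit init and the wtpsplit model is only loaded when sentence splitting is actually requested. A self-contained sketch of the init-on-first-use pattern this relies on (illustrative stand-in, not the library's API):

```python
from typing import Optional


class LazyHolder:
    """Illustrative stand-in for the lazy-initialization pattern."""

    def __init__(self) -> None:
        self._model: Optional[object] = None

    def _init(self) -> None:
        if self._model is None:
            self._model = object()  # stands in for an expensive one-time load

    def use(self) -> object:
        self._init()  # each entry point self-initializes, so callers stay simple
        return self._model
```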
@@ -467,7 +467,6 @@ class Caption:
              sup_dict = sup.to_dict()
              json_data.append(sup_dict)
              json.dump(json_data, f, ensure_ascii=False, indent=4)
-
          elif str(output_path).lower().endswith(".textgrid"):
              from tgt import Interval, IntervalTier, TextGrid, write_to_file
 
@@ -506,7 +505,6 @@ class Caption:
              tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
 
              write_to_file(tg, output_path, format="long")
-
          elif str(output_path)[-4:].lower() == ".tsv":
              cls._write_tsv(alignments, output_path, include_speaker_in_text)
          elif str(output_path)[-4:].lower() == ".csv":