lattifai 0.2.0__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lattifai/bin/align.py CHANGED
@@ -23,6 +23,13 @@ from lattifai.bin.cli_base import cli
  @click.option(
      '-M', '--model_name_or_path', type=str, default='Lattifai/Lattice-1-Alpha', help='Lattifai model name or path'
  )
+ @click.option(
+     '-S',
+     '--split_sentence',
+     is_flag=True,
+     default=False,
+     help='Re-segment subtitles by semantics.',
+ )
  @click.argument(
      'input_audio_path',
      type=click.Path(exists=True, dir_okay=False),
@@ -42,6 +49,7 @@ def align(
      input_format: str = 'auto',
      device: str = 'cpu',
      model_name_or_path: str = 'Lattifai/Lattice-1-Alpha',
+     split_sentence: bool = False,
  ):
      """
      Command used to align audio with subtitles
@@ -50,5 +58,9 @@ def align(

      client = LattifAI(model_name_or_path=model_name_or_path, device=device)
      client.alignment(
-         input_audio_path, input_subtitle_path, format=input_format.lower(), output_subtitle_path=output_subtitle_path
+         input_audio_path,
+         input_subtitle_path,
+         format=input_format.lower(),
+         split_sentence=split_sentence,
+         output_subtitle_path=output_subtitle_path,
      )
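Not part of the diff: a small hedged sketch of driving the new flag from a script, assuming the wheel is installed so the `lattifai` console script is on PATH (the audio, subtitle, and output paths are placeholders):

```python
import subprocess

# Same as running: lattifai align -S audio.wav subtitle.srt output.srt
# -S / --split_sentence is the flag added above; it defaults to off.
subprocess.run(
    ['lattifai', 'align', '-S', 'audio.wav', 'subtitle.srt', 'output.srt'],
    check=True,
)
```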
lattifai/client.py CHANGED
@@ -11,7 +11,7 @@ from lhotse.utils import Pathlike

  from lattifai.base_client import AsyncAPIClient, LattifAIError, SyncAPIClient
  from lattifai.io import SubtitleFormat, SubtitleIO
- from lattifai.tokenizers import LatticeTokenizer
+ from lattifai.tokenizer import LatticeTokenizer
  from lattifai.workers import Lattice1AlphaWorker

  load_dotenv()
@@ -87,6 +87,7 @@ class LattifAI(SyncAPIClient):
          audio: Pathlike,
          subtitle: Pathlike,
          format: Optional[SubtitleFormat] = None,
+         split_sentence: bool = False,
          output_subtitle_path: Optional[Pathlike] = None,
      ) -> str:
          """Perform alignment on audio and subtitle/text.
@@ -102,11 +103,11 @@ class LattifAI(SyncAPIClient):
          # step1: parse text or subtitles
          print(colorful.cyan(f'📖 Step 1: Reading subtitle file from {subtitle}'))
          supervisions = SubtitleIO.read(subtitle, format=format)
-         print(colorful.green(f' ✓ Parsed {len(supervisions)} supervision segments'))
+         print(colorful.green(f' ✓ Parsed {len(supervisions)} subtitle segments'))

          # step2: make lattice by call Lattifai API
          print(colorful.cyan('🔗 Step 2: Creating lattice graph from text'))
-         lattice_id, lattice_graph = self.tokenizer.tokenize(supervisions)
+         lattice_id, lattice_graph = self.tokenizer.tokenize(supervisions, split_sentence=split_sentence)
          print(colorful.green(f' ✓ Generated lattice graph with ID: {lattice_id}'))

          # step3: align audio with text
@@ -138,4 +139,4 @@ if __name__ == '__main__':
      subtitle = 'tests/data/SA1.TXT'
      output = None

-     alignments = client.alignment(audio, subtitle, output_subtitle_path=output)
+     alignments = client.alignment(audio, subtitle, output_subtitle_path=output, split_sentence=True)
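For reference, a minimal hedged sketch of the Python-level call that the `__main__` block above exercises (paths are placeholders; `LattifAI` is imported from the package root as in the README):

```python
from lattifai import LattifAI

client = LattifAI(model_name_or_path='Lattifai/Lattice-1-Alpha', device='cpu')

# split_sentence=True re-segments the subtitle text semantically before alignment.
client.alignment(
    'audio.wav',
    'subtitle.srt',
    split_sentence=True,
    output_subtitle_path='aligned.srt',
)
```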
lattifai/io/reader.py CHANGED
@@ -58,13 +58,12 @@ class SubtitleReader(ABCMeta):
          subs: pysubs2.SSAFile = pysubs2.load(subtitle, encoding='utf-8')  # auto detect format

          supervisions = []
-
          for event in subs.events:
              supervisions.append(
                  Supervision(
                      text=event.text,
                      # "start": event.start / 1000.0 if event.start is not None else None,
-                     # "duration": event.end / 1000.0 if event.end is not None else None,
+                     # "duration": (event.end - event.start) / 1000.0 if event.end is not None else None,
                      # }
                  )
              )
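As a side note, a minimal standalone sketch (not package code) of how pysubs2 exposes event timing, which is what the corrected duration comment above refers to: `event.start` and `event.end` are milliseconds, so a duration is `(end - start) / 1000.0`. The file name below is a placeholder.

```python
import pysubs2

subs = pysubs2.load('subtitle.srt', encoding='utf-8')  # auto-detects SRT/VTT/ASS

for event in subs.events:
    start_s = event.start / 1000.0                   # start time in seconds
    duration_s = (event.end - event.start) / 1000.0  # duration, not the absolute end time
    print(f'{start_s:8.3f}s  +{duration_s:.3f}s  {event.text}')
```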
lattifai/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,284 @@
+ import gzip
+ import pickle
+ import re
+ from collections import defaultdict
+ from itertools import chain
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import torch
+
+ from lattifai.base_client import SyncAPIClient
+ from lattifai.io import Supervision
+ from lattifai.tokenizer.phonemizer import G2Phonemizer
+
+ PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
+ END_PUNCTUATION = '.!?"]。!?”】'
+ PUNCTUATION_SPACE = PUNCTUATION + ' '
+ STAR_TOKEN = '※'
+
+ GROUPING_SEPARATOR = '✹'
+
+ MAXIMUM_WORD_LENGTH = 40
+
+
+ class LatticeTokenizer:
+     """Tokenizer for converting Lhotse Cut to LatticeGraph."""
+
+     def __init__(self, client_wrapper: SyncAPIClient):
+         self.client_wrapper = client_wrapper
+         self.words: List[str] = []
+         self.g2p_model: Any = None  # Placeholder for G2P model
+         self.dictionaries = defaultdict(lambda: [])
+         self.oov_word = '<unk>'
+         self.sentence_splitter = None
+         self.device = 'cpu'
+
+     def init_sentence_splitter(self):
+         if self.sentence_splitter is not None:
+             return
+
+         import onnxruntime as ort
+         from wtpsplit import SaT
+
+         providers = []
+         device = self.device
+         if device.startswith('cuda') and ort.get_all_providers().count('CUDAExecutionProvider') > 0:
+             providers.append('CUDAExecutionProvider')
+         elif device.startswith('mps') and ort.get_all_providers().count('MPSExecutionProvider') > 0:
+             providers.append('MPSExecutionProvider')
+
+         sat = SaT(
+             'sat-3l-sm',
+             ort_providers=providers + ['CPUExecutionProvider'],
+         )
+         self.sentence_splitter = sat
+
+     @staticmethod
+     def _resplit_special_sentence_types(sentence: str) -> List[str]:
+         """
+         Re-split special sentence types.
+
+         Examples:
+             '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']
+             '[MUSIC] &gt;&gt; SPEAKER:' -> ['[MUSIC]', '&gt;&gt; SPEAKER:']
+
+         Special handling patterns:
+             1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
+             2. Use speaker marks (&gt;&gt; or other separators) as split points
+
+         Args:
+             sentence: Input sentence string
+
+         Returns:
+             List of re-split sentences. If no special marks are found, returns the original sentence in a list
+         """
+         # Detect special mark patterns: [SOMETHING] &gt;&gt; SPEAKER:
+         # or other forms like [SOMETHING] SPEAKER:
+
+         # Pattern 1: [mark] HTML-encoded separator speaker:
+         pattern1 = r'^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$'
+         match1 = re.match(pattern1, sentence.strip())
+         if match1:
+             special_mark = match1.group(1)
+             separator = match1.group(2)
+             speaker_part = match1.group(3)
+             return [special_mark, f'{separator} {speaker_part}']
+
+         # Pattern 2: [mark] speaker:
+         pattern2 = r'^(\[[^\]]+\])\s+([^:]+:)(.*)$'
+         match2 = re.match(pattern2, sentence.strip())
+         if match2:
+             special_mark = match2.group(1)
+             speaker_label = match2.group(2)
+             remaining = match2.group(3).strip()
+             if remaining:
+                 return [special_mark, f'{speaker_label} {remaining}']
+             else:
+                 return [special_mark, speaker_label]
+
+         # If no special pattern matches, return the original sentence
+         return [sentence]
+
+     @staticmethod
+     def from_pretrained(
+         client_wrapper: SyncAPIClient,
+         model_path: str,
+         device: str = 'cpu',
+         compressed: bool = True,
+     ):
+         """Load tokenizer from exported binary file"""
+         from pathlib import Path
+
+         words_model_path = f'{model_path}/words.bin'
+         if compressed:
+             with gzip.open(words_model_path, 'rb') as f:
+                 data = pickle.load(f)
+         else:
+             with open(words_model_path, 'rb') as f:
+                 data = pickle.load(f)
+
+         tokenizer = LatticeTokenizer(client_wrapper=client_wrapper)
+         tokenizer.words = data['words']
+         tokenizer.dictionaries = defaultdict(list, data['dictionaries'])
+         tokenizer.oov_word = data['oov_word']
+
+         g2p_model_path = f'{model_path}/g2p.bin' if Path(f'{model_path}/g2p.bin').exists() else None
+         if g2p_model_path:
+             tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
+
+         tokenizer.device = device
+         tokenizer.add_special_tokens()
+         return tokenizer
+
+     def add_special_tokens(self):
+         tokenizer = self
+         for special_token in ['&gt;&gt;', '&gt;']:
+             if special_token not in tokenizer.dictionaries:
+                 tokenizer.dictionaries[special_token] = tokenizer.dictionaries[tokenizer.oov_word]
+         return self
+
+     def prenormalize(self, texts: List[str], language: Optional[str] = None) -> List[str]:
+         if not self.g2p_model:
+             raise ValueError('G2P model is not loaded, cannot prenormalize texts')
+
+         oov_words = []
+         for text in texts:
+             words = text.lower().replace('-', ' ').replace('—', ' ').replace('–', ' ').split()
+             oovs = [w for w in words if w not in self.words]
+             if oovs:
+                 oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])
+
+         oov_words = list(set(oov_words))
+         if oov_words:
+             indexs = []
+             for k, _word in enumerate(oov_words):
+                 if any(_word.startswith(p) and _word.endswith(q) for (p, q) in [('(', ')'), ('[', ']')]):
+                     self.dictionaries[_word] = self.dictionaries[self.oov_word]
+                 else:
+                     _word = _word.strip(PUNCTUATION_SPACE)
+                     if not _word or _word in self.words:
+                         indexs.append(k)
+             for idx in sorted(indexs, reverse=True):
+                 del oov_words[idx]
+
+             g2p_words = [w for w in oov_words if w not in self.dictionaries]
+             if g2p_words:
+                 predictions = self.g2p_model(words=g2p_words, lang=language, batch_size=len(g2p_words), num_prons=4)
+                 for _word, _predictions in zip(g2p_words, predictions):
+                     for pronuncation in _predictions:
+                         if pronuncation and pronuncation not in self.dictionaries[_word]:
+                             self.dictionaries[_word].append(pronuncation)
+                     if not self.dictionaries[_word]:
+                         self.dictionaries[_word] = self.dictionaries[self.oov_word]
+
+             pronunciation_dictionaries: Dict[str, List[List[str]]] = {
+                 w: self.dictionaries[w] for w in oov_words if self.dictionaries[w]
+             }
+             return pronunciation_dictionaries
+
+         return {}
+
+     def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[str]:
+         texts, text_len, sidx = [], 0, 0
+         for s, supervision in enumerate(supervisions):
+             text_len += len(supervision.text)
+             if text_len >= 2000 or s == len(supervisions) - 1:
+                 text = ' '.join([sup.text for sup in supervisions[sidx : s + 1]])
+                 texts.append(text)
+                 sidx = s + 1
+                 text_len = 0
+         if sidx < len(supervisions):
+             text = ' '.join([sup.text for sup in supervisions[sidx:]])
+             texts.append(text)
+         sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace)
+
+         supervisions, remainder = [], ''
+         for _sentences in sentences:
+             # Process and re-split special sentence types
+             processed_sentences = []
+             for s, _sentence in enumerate(_sentences):
+                 if remainder:
+                     _sentence = remainder + _sentence
+                     remainder = ''
+
+                 # Detect and split special sentence types: e.g., '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']  # noqa: E501
+                 resplit_parts = self._resplit_special_sentence_types(_sentence)
+                 if any(resplit_parts[-1].endswith(sp) for sp in [':', ':']):
+                     if s < len(_sentences) - 1:
+                         _sentences[s + 1] = resplit_parts[-1] + ' ' + _sentences[s + 1]
+                     else:  # last part
+                         remainder = resplit_parts[-1] + ' ' + remainder
+                     processed_sentences.extend(resplit_parts[:-1])
+                 else:
+                     processed_sentences.extend(resplit_parts)
+
+             _sentences = processed_sentences
+
+             if remainder:
+                 _sentences[0] = remainder + _sentences[0]
+                 remainder = ''
+
+             if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
+                 supervisions.extend(Supervision(text=s) for s in _sentences)
+             else:
+                 supervisions.extend(Supervision(text=s) for s in _sentences[:-1])
+                 remainder += _sentences[-1] + ' '
+
+         if remainder.strip():
+             supervisions.append(Supervision(text=remainder.strip()))
+
+         return supervisions
+
+     def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
+         if split_sentence:
+             self.init_sentence_splitter()
+             supervisions = self.split_sentences(supervisions)
+
+         pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
+         response = self.client_wrapper.post(
+             'tokenize',
+             json={
+                 'supervisions': [s.to_dict() for s in supervisions],
+                 'pronunciation_dictionaries': pronunciation_dictionaries,
+             },
+         )
+         if response.status_code != 200:
+             raise Exception(f'Failed to tokenize texts: {response.text}')
+         result = response.json()
+         lattice_id = result['id']
+         return lattice_id, (result['lattice_graph'], result['final_state'], result.get('acoustic_scale', 1.0))
+
+     def detokenize(
+         self,
+         lattice_id: str,
+         lattice_results: Tuple[torch.Tensor, Any, Any, float, float],
+         # return_supervisions: bool = True,
+         # return_details: bool = False,
+     ) -> List[Supervision]:
+         emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
+         response = self.client_wrapper.post(
+             'detokenize',
+             json={
+                 'lattice_id': lattice_id,
+                 'frame_shift': frame_shift,
+                 'results': [t.to_dict() for t in results[0]],
+                 'labels': labels[0],
+                 'offset': offset,
+                 'channel': channel,
+                 'destroy_lattice': True,
+             },
+         )
+         if response.status_code != 200:
+             raise Exception(f'Failed to detokenize lattice: {response.text}')
+         result = response.json()
+         # if return_details:
+         #     raise NotImplementedError("return_details is not implemented yet")
+         return [Supervision.from_dict(s) for s in result['supervisions']]
+
+
+ # Compute average score weighted by the span length
+ def _score(spans):
+     if not spans:
+         return 0.0
+     # TokenSpan(token=token, start=start, end=end, score=scores[start:end].mean().item())
+     return round(sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans), ndigits=4)
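For context, a standalone sketch (not package code) of the pattern-1 regex used by `_resplit_special_sentence_types`, reproducing the behaviour described in its docstring:

```python
import re

# Pattern 1 from the tokenizer: "[MARK] >> SPEAKER: ..." (also matches the HTML-escaped ">>").
pattern1 = r'^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$'

for sentence in ['[APPLAUSE] &gt;&gt; MIRA MURATI: Thank you all', '[MUSIC] >> SPEAKER: Welcome back']:
    m = re.match(pattern1, sentence.strip())
    if m:
        # Split the leading non-speech mark away from the speaker turn.
        print([m.group(1), f'{m.group(2)} {m.group(3)}'])
# ['[APPLAUSE]', '&gt;&gt; MIRA MURATI: Thank you all']
# ['[MUSIC]', '>> SPEAKER: Welcome back']
```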
lattifai/workers/lattice1_alpha.py CHANGED
@@ -9,6 +9,7 @@ import resampy
  import soundfile as sf
  import torch
  from lhotse import FbankConfig
+ from lhotse.audio import read_audio
  from lhotse.features.kaldi.layers import Wav2LogFilterBank
  from lhotse.utils import Pathlike

@@ -76,13 +77,55 @@ class Lattice1AlphaWorker:

      def load_audio(self, audio: Union[Pathlike, BinaryIO]) -> Tuple[torch.Tensor, int]:
          # load audio
-         waveform, sample_rate = sf.read(audio, always_2d=True, dtype='float32')
-         if waveform.shape[1] > 1:  # TODO: support choose channel
-             waveform = np.mean(waveform, axis=1, keepdims=True)
+         try:
+             waveform, sample_rate = read_audio(audio)  # numpy array
+             if len(waveform.shape) == 1:
+                 waveform = waveform.reshape([1, -1])  # (1, L)
+             else:  # make sure channel first
+                 if waveform.shape[0] > waveform.shape[1]:
+                     waveform = waveform.transpose(0, 1)
+                 # average multiple channels
+                 waveform = np.mean(waveform, axis=0, keepdims=True)  # (1, L)
+         except Exception:
+             # Fallback to PyAV for formats not supported by soundfile
+             import av
+
+             container = av.open(audio)
+             audio_stream = next((s for s in container.streams if s.type == 'audio'), None)
+
+             if audio_stream is None:
+                 raise ValueError(f'No audio stream found in {audio}')
+
+             # Resample to target sample rate during decoding
+             audio_stream.codec_context.format = av.AudioFormat('flt')  # 32-bit float
+
+             frames = []
+             for frame in container.decode(audio_stream):
+                 # Convert frame to numpy array
+                 array = frame.to_ndarray()
+                 # Ensure shape is (channels, samples)
+                 if array.ndim == 1:
+                     array = array.reshape(1, -1)
+                 elif array.ndim == 2 and array.shape[0] > array.shape[1]:
+                     array = array.T
+                 frames.append(array)
+
+             container.close()
+
+             if not frames:
+                 raise ValueError(f'No audio data found in {audio}')
+
+             # Concatenate all frames
+             waveform = np.concatenate(frames, axis=1)
+             # Average multiple channels to mono
+             if waveform.shape[0] > 1:
+                 waveform = np.mean(waveform, axis=0, keepdims=True)
+
+             sample_rate = audio_stream.codec_context.sample_rate
+
          if sample_rate != self.config['sample_rate']:
-             waveform = resampy.resample(waveform, sample_rate, self.config['sample_rate'], axis=0)
-         waveform = torch.from_numpy(waveform.T).to(self.device)  # (1, L)
-         return waveform
+             waveform = resampy.resample(waveform, sample_rate, self.config['sample_rate'], axis=1)
+         return torch.from_numpy(waveform).to(self.device)  # (1, L)

      def alignment(
          self, audio: Union[Union[Pathlike, BinaryIO], torch.tensor], lattice_graph: Tuple[str, int, float]
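As an aside, a small illustrative snippet (not package code) of the channel-first layout the reworked loader normalises to: with a (channels, samples) array, the mono downmix averages over axis 0 and resampling runs along axis 1, which is why the `resampy.resample` axis changes from 0 to 1 above. The stereo input here is synthetic.

```python
import numpy as np
import resampy

waveform = np.random.randn(2, 16000).astype('float32')   # fake stereo: (channels, samples)

mono = np.mean(waveform, axis=0, keepdims=True)           # (1, 16000): downmix over channels
resampled = resampy.resample(mono, 16000, 8000, axis=1)   # resample along the sample axis
print(mono.shape, resampled.shape)                        # (1, 16000) (1, 8000)
```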
{lattifai-0.2.0.dist-info → lattifai-0.2.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lattifai
- Version: 0.2.0
+ Version: 0.2.4
  Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
  Author-email: Lattifai Technologies <tech@lattifai.com>
  Maintainer-email: Lattice <tech@lattifai.com>
@@ -60,6 +60,8 @@ Requires-Dist: tgt
  Requires-Dist: onnxruntime
  Requires-Dist: resampy
  Requires-Dist: g2p-phonemizer==0.1.1
+ Requires-Dist: wtpsplit>=2.1.6
+ Requires-Dist: av
  Provides-Extra: numpy
  Requires-Dist: numpy; extra == "numpy"
  Provides-Extra: test
@@ -117,9 +119,37 @@ Usage: lattifai align [OPTIONS] INPUT_AUDIO_PATH INPUT_SUBTITLE_PATH OUTPUT_SUBT
  Options:
    -F, --input_format [srt|vtt|ass|txt|auto] Input Subtitle format.
    -D, --device [cpu|cuda|mps] Device to use for inference.
+   --split_sentence Smart sentence splitting based on punctuation semantics.
    --help Show this message and exit.
  ```

+ #### Understanding --split_sentence
+
+ The `--split_sentence` option performs intelligent sentence re-splitting based on punctuation and semantic boundaries. This is especially useful when processing subtitles that combine multiple semantic units in a single segment, such as:
+
+ - **Mixed content**: Non-speech elements (e.g., `[APPLAUSE]`, `[MUSIC]`) followed by actual dialogue
+ - **Natural punctuation boundaries**: Colons, periods, and other punctuation marks that indicate semantic breaks
+ - **Concatenated phrases**: Multiple distinct utterances joined together without proper separation
+
+ **Example transformations**:
+ ```
+ Input: "[APPLAUSE] >> MIRA MURATI: Thank you all"
+ Output: ["[APPLAUSE]", ">> MIRA MURATI: Thank you all"]
+
+ Input: "[MUSIC] Welcome back. Today we discuss AI."
+ Output: ["[MUSIC]", "Welcome back.", "Today we discuss AI."]
+ ```
+
+ This feature helps improve alignment accuracy by:
+ 1. Respecting punctuation-based semantic boundaries
+ 2. Separating distinct utterances for more precise timing
+ 3. Maintaining semantic context for each independent phrase
+
+ **Usage**:
+ ```bash
+ lattifai align --split_sentence audio.wav subtitle.srt output.srt
+ ```
+
  ### Python API

  ```python
@@ -136,6 +166,7 @@ client = LattifAI(
  result = client.alignment(
      audio="audio.wav",
      subtitle="subtitle.srt",
+     split_sentence=False,
      output_subtitle_path="output.srt"
  )
  ```
@@ -161,13 +192,21 @@ LattifAI(

  ```python
  client.alignment(
-     audio: str, # Path to audio file
-     subtitle: str, # Path to subtitle/text file
-     format: Optional[str] = None, # 'srt', 'vtt', 'ass', 'txt' (auto-detect if None)
+     audio: str, # Path to audio file
+     subtitle: str, # Path to subtitle/text file
+     format: Optional[str] = None, # 'srt', 'vtt', 'ass', 'txt' (auto-detect if None)
+     split_sentence: bool = False, # Smart sentence splitting based on punctuation semantics
      output_subtitle_path: Optional[str] = None
  ) -> str
  ```

+ **Parameters**:
+ - `audio`: Path to the audio file to be aligned
+ - `subtitle`: Path to the subtitle or text file
+ - `format`: Subtitle format ('srt', 'vtt', 'ass', 'txt'). Auto-detected if None
+ - `split_sentence`: Enable intelligent sentence re-splitting (default: False). Set to True when subtitles combine multiple semantic units (non-speech elements + dialogue, or multiple sentences) that would benefit from separate timing alignment
+ - `output_subtitle_path`: Output path for aligned subtitle (optional)
+
  ## Examples

  ### Basic Text Alignment
@@ -178,6 +217,7 @@ client.alignment(
      audio="speech.wav",
      subtitle="transcript.txt",
      format="txt",
+     split_sentence=False,
      output_subtitle_path="output.srt"
  )
  ```
lattifai-0.2.4.dist-info/RECORD ADDED
@@ -0,0 +1,22 @@
+ lattifai/__init__.py,sha256=JXUg0dT74UyAtKOjewRs9ijr5sl9SYsc6oU_WItY314,1497
+ lattifai/base_client.py,sha256=ktFtATjL9pLSJUD-VqeJKA1FHkrsGHX7Uq_x00H7gO8,3322
+ lattifai/client.py,sha256=QXbdTuDA5Aap2udu4iig7CVxlgwOIrydpuLlVASs0aA,5145
+ lattifai/bin/__init__.py,sha256=7YhmtEM8kbxJtz2-KIskvpLKBZAvkMSceVx8z4fkgQ4,61
+ lattifai/bin/align.py,sha256=nQs901SDYmxyH2AXBtjgZGzrpwLaxANQRYP49Bd1AWo,1669
+ lattifai/bin/cli_base.py,sha256=y535WXDRX8StloFn9icpfw7nQt0JxuWBIuPMnRxAYy8,392
+ lattifai/bin/subtitle.py,sha256=bUWImAHpvyY59Vskqb5loQiD5ytQOxR8lTQRiQ4LyNA,647
+ lattifai/io/__init__.py,sha256=vHWRN7MvAch-GUeFqqO-gM57SM-4YOpGUjIxFJdjfPA,671
+ lattifai/io/reader.py,sha256=mtgxT5c_BiHbqqJvPE3nf7TIe_OcWgGu1zr6iXasfrk,2591
+ lattifai/io/supervision.py,sha256=5UfSsgBhXoDU3-6drDtoD7y8HIiA4xRKZnbOKgeejwM,354
+ lattifai/io/writer.py,sha256=1eAEFLlL8kricxRDPFBtVmeC4IiFyFnjbWXvw0VU-q4,2036
+ lattifai/tokenizer/__init__.py,sha256=aqv44PDtq6g3oFFKW_l4HSR5ywT5W8eP1dHHywIvBfs,72
+ lattifai/tokenizer/phonemizer.py,sha256=SfRi1KIMpmaao6OVmR1h_I_3QU-vrE6D5bh72Afg5XM,1759
+ lattifai/tokenizer/tokenizer.py,sha256=Yuo0pLPQnF2uX0Fm5g8i5vtcADn7GeLpSqdGpMJgTww,11492
+ lattifai/workers/__init__.py,sha256=s6YfkIq4FDIAzY9sPjRpXnJfszj2repqnMTqydRM5Zw,83
+ lattifai/workers/lattice1_alpha.py,sha256=fnimZqhPQ1fBCUjcDVblnFGTWP0vbNLRM7E7lOdHJu8,7428
+ lattifai-0.2.4.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
+ lattifai-0.2.4.dist-info/METADATA,sha256=7SNTA_Egpv3F5rd0F20-4MigC7muz5x6kyr_xxj4CIk,10989
+ lattifai-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lattifai-0.2.4.dist-info/entry_points.txt,sha256=CwTI2NbJvF9msIHboAfTA99cmDr_HOWoODjS8R64JOw,131
+ lattifai-0.2.4.dist-info/top_level.txt,sha256=-OVWZ68YYFcTN13ARkLasp2OUappe9wEVq-CKes7jM4,17
+ lattifai-0.2.4.dist-info/RECORD,,
lattifai/tokenizers/tokenizer.py DELETED
@@ -1,147 +0,0 @@
- import gzip
- import pickle
- from collections import defaultdict
- from itertools import chain
- from typing import Any, Dict, List, Optional, Tuple
-
- import torch
-
- from lattifai.base_client import SyncAPIClient
- from lattifai.io import Supervision
- from lattifai.tokenizers.phonemizer import G2Phonemizer
-
- PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
- PUNCTUATION_SPACE = PUNCTUATION + ' '
- STAR_TOKEN = '※'
-
- GROUPING_SEPARATOR = '✹'
-
- MAXIMUM_WORD_LENGTH = 40
-
-
- class LatticeTokenizer:
-     """Tokenizer for converting Lhotse Cut to LatticeGraph."""
-
-     def __init__(self, client_wrapper: SyncAPIClient):
-         self.client_wrapper = client_wrapper
-         self.words: List[str] = []
-         self.g2p_model: Any = None  # Placeholder for G2P model
-         self.dictionaries = defaultdict(lambda: [])
-         self.oov_word = '<unk>'
-
-     @staticmethod
-     def from_pretrained(
-         client_wrapper: SyncAPIClient,
-         model_path: str,
-         device: str = 'cpu',
-         compressed: bool = True,
-     ):
-         """Load tokenizer from exported binary file"""
-         from pathlib import Path
-
-         words_model_path = f'{model_path}/words.bin'
-         if compressed:
-             with gzip.open(words_model_path, 'rb') as f:
-                 data = pickle.load(f)
-         else:
-             with open(words_model_path, 'rb') as f:
-                 data = pickle.load(f)
-
-         tokenizer = LatticeTokenizer(client_wrapper=client_wrapper)
-         tokenizer.words = data['words']
-         tokenizer.dictionaries = defaultdict(list, data['dictionaries'])
-         tokenizer.oov_word = data['oov_word']
-
-         g2p_model_path = f'{model_path}/g2p.bin' if Path(f'{model_path}/g2p.bin').exists() else None
-         if g2p_model_path:
-             tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
-         return tokenizer
-
-     def prenormalize(self, texts: List[str], language: Optional[str] = None) -> List[str]:
-         if not self.g2p_model:
-             raise ValueError('G2P model is not loaded, cannot prenormalize texts')
-
-         oov_words = []
-         for text in texts:
-             words = text.lower().replace('-', ' ').replace('—', ' ').replace('–', ' ').split()
-             oovs = [w for w in words if w not in self.words]
-             if oovs:
-                 oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])
-
-         oov_words = list(set(oov_words))
-         if oov_words:
-             indexs = []
-             for k, _word in enumerate(oov_words):
-                 if any(_word.startswith(p) and _word.endswith(q) for (p, q) in [('(', ')'), ('[', ']')]):
-                     self.dictionaries[_word] = self.dictionaries[self.oov_word]
-                 else:
-                     _word = _word.strip(PUNCTUATION_SPACE)
-                     if not _word or _word in self.words:
-                         indexs.append(k)
-             for idx in sorted(indexs, reverse=True):
-                 del oov_words[idx]
-
-             g2p_words = [w for w in oov_words if w not in self.dictionaries]
-             if g2p_words:
-                 predictions = self.g2p_model(words=g2p_words, lang=language, batch_size=len(g2p_words), num_prons=4)
-                 for _word, _predictions in zip(g2p_words, predictions):
-                     for pronuncation in _predictions:
-                         if pronuncation and pronuncation not in self.dictionaries[_word]:
-                             self.dictionaries[_word].append(pronuncation)
-
-             pronunciation_dictionaries: Dict[str, List[List[str]]] = {
-                 w: self.dictionaries[w] for w in oov_words if self.dictionaries[w]
-             }
-             return pronunciation_dictionaries
-
-         return {}
-
-     def tokenize(self, supervisions: List[Supervision]) -> Tuple[str, Dict[str, Any]]:
-         pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
-         response = self.client_wrapper.post(
-             'tokenize',
-             json={
-                 'supervisions': [s.to_dict() for s in supervisions],
-                 'pronunciation_dictionaries': pronunciation_dictionaries,
-             },
-         )
-         if response.status_code != 200:
-             raise Exception(f'Failed to tokenize texts: {response.text}')
-         result = response.json()
-         lattice_id = result['id']
-         return lattice_id, (result['lattice_graph'], result['final_state'], result.get('acoustic_scale', 1.0))
-
-     def detokenize(
-         self,
-         lattice_id: str,
-         lattice_results: Tuple[torch.Tensor, Any, Any, float, float],
-         # return_supervisions: bool = True,
-         # return_details: bool = False,
-     ) -> List[Supervision]:
-         emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
-         response = self.client_wrapper.post(
-             'detokenize',
-             json={
-                 'lattice_id': lattice_id,
-                 'frame_shift': frame_shift,
-                 'results': [t.to_dict() for t in results[0]],
-                 'labels': labels[0],
-                 'offset': offset,
-                 'channel': channel,
-                 'destroy_lattice': True,
-             },
-         )
-         if response.status_code != 200:
-             raise Exception(f'Failed to detokenize lattice: {response.text}')
-         result = response.json()
-         # if return_details:
-         #     raise NotImplementedError("return_details is not implemented yet")
-         return [Supervision.from_dict(s) for s in result['supervisions']]
-
-
- # Compute average score weighted by the span length
- def _score(spans):
-     if not spans:
-         return 0.0
-     # TokenSpan(token=token, start=start, end=end, score=scores[start:end].mean().item())
-     return round(sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans), ndigits=4)
lattifai-0.2.0.dist-info/RECORD DELETED
@@ -1,22 +0,0 @@
- lattifai/__init__.py,sha256=JXUg0dT74UyAtKOjewRs9ijr5sl9SYsc6oU_WItY314,1497
- lattifai/base_client.py,sha256=ktFtATjL9pLSJUD-VqeJKA1FHkrsGHX7Uq_x00H7gO8,3322
- lattifai/client.py,sha256=NjHUqMZFoRxuyxdzBNEcn5kU57gJl31FSb6i9DDcoCw,5059
- lattifai/bin/__init__.py,sha256=7YhmtEM8kbxJtz2-KIskvpLKBZAvkMSceVx8z4fkgQ4,61
- lattifai/bin/align.py,sha256=nw-wABsNyxC8zN3siiqgNi1Foou3cYhVzIAomuVrFOY,1436
- lattifai/bin/cli_base.py,sha256=y535WXDRX8StloFn9icpfw7nQt0JxuWBIuPMnRxAYy8,392
- lattifai/bin/subtitle.py,sha256=bUWImAHpvyY59Vskqb5loQiD5ytQOxR8lTQRiQ4LyNA,647
- lattifai/io/__init__.py,sha256=vHWRN7MvAch-GUeFqqO-gM57SM-4YOpGUjIxFJdjfPA,671
- lattifai/io/reader.py,sha256=ErPnPMUvYQpjZ7Vd86EsHUkOcEfKdoI8iM3yKHRzSOQ,2576
- lattifai/io/supervision.py,sha256=5UfSsgBhXoDU3-6drDtoD7y8HIiA4xRKZnbOKgeejwM,354
- lattifai/io/writer.py,sha256=1eAEFLlL8kricxRDPFBtVmeC4IiFyFnjbWXvw0VU-q4,2036
- lattifai/tokenizers/__init__.py,sha256=aqv44PDtq6g3oFFKW_l4HSR5ywT5W8eP1dHHywIvBfs,72
- lattifai/tokenizers/phonemizer.py,sha256=SfRi1KIMpmaao6OVmR1h_I_3QU-vrE6D5bh72Afg5XM,1759
- lattifai/tokenizers/tokenizer.py,sha256=u4lgS6-jN9cLuMNIojA4Swfsqb1EcyI7Bh_iw7tuL-s,5818
- lattifai/workers/__init__.py,sha256=s6YfkIq4FDIAzY9sPjRpXnJfszj2repqnMTqydRM5Zw,83
- lattifai/workers/lattice1_alpha.py,sha256=1VFo59EcygEctTHOhkcII8v3_mrj8JEJ8Fcaqk_7LVo,5762
- lattifai-0.2.0.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
- lattifai-0.2.0.dist-info/METADATA,sha256=bXb6z5D1C-9YwHeycSFs8SAhUp8VNJbE9u-J9lvYMZ8,8997
- lattifai-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- lattifai-0.2.0.dist-info/entry_points.txt,sha256=CwTI2NbJvF9msIHboAfTA99cmDr_HOWoODjS8R64JOw,131
- lattifai-0.2.0.dist-info/top_level.txt,sha256=-OVWZ68YYFcTN13ARkLasp2OUappe9wEVq-CKes7jM4,17
- lattifai-0.2.0.dist-info/RECORD,,