lattifai 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
lattifai/bin/align.py CHANGED
@@ -23,6 +23,13 @@ from lattifai.bin.cli_base import cli
 @click.option(
     '-M', '--model_name_or_path', type=str, default='Lattifai/Lattice-1-Alpha', help='Lattifai model name or path'
 )
+@click.option(
+    '-S',
+    '--split_sentence',
+    is_flag=True,
+    default=False,
+    help='Re-segment subtitles by semantics.',
+)
 @click.argument(
     'input_audio_path',
     type=click.Path(exists=True, dir_okay=False),
@@ -42,6 +49,7 @@ def align(
     input_format: str = 'auto',
     device: str = 'cpu',
     model_name_or_path: str = 'Lattifai/Lattice-1-Alpha',
+    split_sentence: bool = False,
 ):
     """
     Command used to align audio with subtitles
@@ -50,5 +58,9 @@ def align(
 
     client = LattifAI(model_name_or_path=model_name_or_path, device=device)
     client.alignment(
-        input_audio_path, input_subtitle_path, format=input_format.lower(), output_subtitle_path=output_subtitle_path
+        input_audio_path,
+        input_subtitle_path,
+        format=input_format.lower(),
+        split_sentence=split_sentence,
+        output_subtitle_path=output_subtitle_path,
     )
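One way to exercise the new `-S/--split_sentence` flag without installing the console script is click's built-in test runner. This is a hedged sketch, not part of the package: it assumes the decorated `align` command object is importable from this module, and the placeholder paths must point at real files because the arguments are declared with `exists=True`.

```python
# Hedged sketch using click's standard testing utility (click.testing.CliRunner).
from click.testing import CliRunner

from lattifai.bin.align import align  # the command defined in this file (assumed importable)

runner = CliRunner()
# Replace the placeholder paths with real files; click validates them via exists=True.
result = runner.invoke(align, ['-S', 'audio.wav', 'subtitle.srt', 'output.srt'])
print(result.exit_code)
print(result.output)
```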
lattifai/client.py CHANGED
@@ -11,7 +11,7 @@ from lhotse.utils import Pathlike
 
 from lattifai.base_client import AsyncAPIClient, LattifAIError, SyncAPIClient
 from lattifai.io import SubtitleFormat, SubtitleIO
-from lattifai.tokenizers import LatticeTokenizer
+from lattifai.tokenizer import LatticeTokenizer
 from lattifai.workers import Lattice1AlphaWorker
 
 load_dotenv()
@@ -87,6 +87,7 @@ class LattifAI(SyncAPIClient):
         audio: Pathlike,
         subtitle: Pathlike,
         format: Optional[SubtitleFormat] = None,
+        split_sentence: bool = False,
         output_subtitle_path: Optional[Pathlike] = None,
     ) -> str:
         """Perform alignment on audio and subtitle/text.
@@ -102,11 +103,11 @@ class LattifAI(SyncAPIClient):
         # step1: parse text or subtitles
         print(colorful.cyan(f'📖 Step 1: Reading subtitle file from {subtitle}'))
         supervisions = SubtitleIO.read(subtitle, format=format)
-        print(colorful.green(f'  ✓ Parsed {len(supervisions)} supervision segments'))
+        print(colorful.green(f'  ✓ Parsed {len(supervisions)} subtitle segments'))
 
         # step2: make lattice by call Lattifai API
         print(colorful.cyan('🔗 Step 2: Creating lattice graph from text'))
-        lattice_id, lattice_graph = self.tokenizer.tokenize(supervisions)
+        lattice_id, lattice_graph = self.tokenizer.tokenize(supervisions, split_sentence=split_sentence)
         print(colorful.green(f'  ✓ Generated lattice graph with ID: {lattice_id}'))
 
         # step3: align audio with text
@@ -138,4 +139,4 @@ if __name__ == '__main__':
     subtitle = 'tests/data/SA1.TXT'
     output = None
 
-    alignments = client.alignment(audio, subtitle, output_subtitle_path=output)
+    alignments = client.alignment(audio, subtitle, output_subtitle_path=output, split_sentence=True)
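Taken together with the `align.py` change above, the new keyword flows straight through `LattifAI.alignment()` into the tokenizer. A minimal sketch of the updated call, using only the signatures visible in this diff; the file paths are placeholders and the root-level `LattifAI` import is assumed from the README snippets further down.

```python
# Hedged sketch of the 0.2.2 API surface; paths are placeholders, not shipped test data.
from lattifai import LattifAI  # assumed top-level export, as in the README examples

client = LattifAI(model_name_or_path='Lattifai/Lattice-1-Alpha', device='cpu')
client.alignment(
    'audio.wav',                        # input audio (placeholder path)
    'subtitle.srt',                     # input subtitle or text file (placeholder path)
    split_sentence=True,                # new in 0.2.2: re-segment subtitles by semantics
    output_subtitle_path='output.srt',  # where the aligned subtitle is written
)
```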
lattifai/io/reader.py CHANGED
@@ -58,13 +58,12 @@ class SubtitleReader(ABCMeta):
         subs: pysubs2.SSAFile = pysubs2.load(subtitle, encoding='utf-8')  # auto detect format
 
         supervisions = []
-
         for event in subs.events:
             supervisions.append(
                 Supervision(
                     text=event.text,
                     # "start": event.start / 1000.0 if event.start is not None else None,
-                    # "duration": event.end / 1000.0 if event.end is not None else None,
+                    # "duration": (event.end - event.start) / 1000.0 if event.end is not None else None,
                     # }
                 )
             )
lattifai/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,284 @@
+import gzip
+import pickle
+import re
+from collections import defaultdict
+from itertools import chain
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from lattifai.base_client import SyncAPIClient
+from lattifai.io import Supervision
+from lattifai.tokenizer.phonemizer import G2Phonemizer
+
+PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
+END_PUNCTUATION = '.!?"]。!?”】'
+PUNCTUATION_SPACE = PUNCTUATION + ' '
+STAR_TOKEN = '※'
+
+GROUPING_SEPARATOR = '✹'
+
+MAXIMUM_WORD_LENGTH = 40
+
+
+class LatticeTokenizer:
+    """Tokenizer for converting Lhotse Cut to LatticeGraph."""
+
+    def __init__(self, client_wrapper: SyncAPIClient):
+        self.client_wrapper = client_wrapper
+        self.words: List[str] = []
+        self.g2p_model: Any = None  # Placeholder for G2P model
+        self.dictionaries = defaultdict(lambda: [])
+        self.oov_word = '<unk>'
+        self.sentence_splitter = None
+        self.device = 'cpu'
+
+    def init_sentence_splitter(self):
+        if self.sentence_splitter is not None:
+            return
+
+        import onnxruntime as ort
+        from wtpsplit import SaT
+
+        providers = []
+        device = self.device
+        if device.startswith('cuda') and ort.get_all_providers().count('CUDAExecutionProvider') > 0:
+            providers.append('CUDAExecutionProvider')
+        elif device.startswith('mps') and ort.get_all_providers().count('MPSExecutionProvider') > 0:
+            providers.append('MPSExecutionProvider')
+
+        sat = SaT(
+            'sat-3l-sm',
+            ort_providers=providers + ['CPUExecutionProvider'],
+        )
+        self.sentence_splitter = sat
+
+    @staticmethod
+    def _resplit_special_sentence_types(sentence: str) -> List[str]:
+        """
+        Re-split special sentence types.
+
+        Examples:
+            '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']
+            '[MUSIC] &gt;&gt; SPEAKER:' -> ['[MUSIC]', '&gt;&gt; SPEAKER:']
+
+        Special handling patterns:
+            1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
+            2. Use speaker marks (&gt;&gt; or other separators) as split points
+
+        Args:
+            sentence: Input sentence string
+
+        Returns:
+            List of re-split sentences. If no special marks are found, returns the original sentence in a list
+        """
+        # Detect special mark patterns: [SOMETHING] &gt;&gt; SPEAKER:
+        # or other forms like [SOMETHING] SPEAKER:
+
+        # Pattern 1: [mark] HTML-encoded separator speaker:
+        pattern1 = r'^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$'
+        match1 = re.match(pattern1, sentence.strip())
+        if match1:
+            special_mark = match1.group(1)
+            separator = match1.group(2)
+            speaker_part = match1.group(3)
+            return [special_mark, f'{separator} {speaker_part}']
+
+        # Pattern 2: [mark] speaker:
+        pattern2 = r'^(\[[^\]]+\])\s+([^:]+:)(.*)$'
+        match2 = re.match(pattern2, sentence.strip())
+        if match2:
+            special_mark = match2.group(1)
+            speaker_label = match2.group(2)
+            remaining = match2.group(3).strip()
+            if remaining:
+                return [special_mark, f'{speaker_label} {remaining}']
+            else:
+                return [special_mark, speaker_label]
+
+        # If no special pattern matches, return the original sentence
+        return [sentence]
+
+    @staticmethod
+    def from_pretrained(
+        client_wrapper: SyncAPIClient,
+        model_path: str,
+        device: str = 'cpu',
+        compressed: bool = True,
+    ):
+        """Load tokenizer from exported binary file"""
+        from pathlib import Path
+
+        words_model_path = f'{model_path}/words.bin'
+        if compressed:
+            with gzip.open(words_model_path, 'rb') as f:
+                data = pickle.load(f)
+        else:
+            with open(words_model_path, 'rb') as f:
+                data = pickle.load(f)
+
+        tokenizer = LatticeTokenizer(client_wrapper=client_wrapper)
+        tokenizer.words = data['words']
+        tokenizer.dictionaries = defaultdict(list, data['dictionaries'])
+        tokenizer.oov_word = data['oov_word']
+
+        g2p_model_path = f'{model_path}/g2p.bin' if Path(f'{model_path}/g2p.bin').exists() else None
+        if g2p_model_path:
+            tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
+
+        tokenizer.device = device
+        tokenizer.add_special_tokens()
+        return tokenizer
+
+    def add_special_tokens(self):
+        tokenizer = self
+        for special_token in ['&gt;&gt;', '&gt;']:
+            if special_token not in tokenizer.dictionaries:
+                tokenizer.dictionaries[special_token] = tokenizer.dictionaries[tokenizer.oov_word]
+        return self
+
+    def prenormalize(self, texts: List[str], language: Optional[str] = None) -> List[str]:
+        if not self.g2p_model:
+            raise ValueError('G2P model is not loaded, cannot prenormalize texts')
+
+        oov_words = []
+        for text in texts:
+            words = text.lower().replace('-', ' ').replace('—', ' ').replace('–', ' ').split()
+            oovs = [w for w in words if w not in self.words]
+            if oovs:
+                oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])
+
+        oov_words = list(set(oov_words))
+        if oov_words:
+            indexs = []
+            for k, _word in enumerate(oov_words):
+                if any(_word.startswith(p) and _word.endswith(q) for (p, q) in [('(', ')'), ('[', ']')]):
+                    self.dictionaries[_word] = self.dictionaries[self.oov_word]
+                else:
+                    _word = _word.strip(PUNCTUATION_SPACE)
+                    if not _word or _word in self.words:
+                        indexs.append(k)
+            for idx in sorted(indexs, reverse=True):
+                del oov_words[idx]
+
+            g2p_words = [w for w in oov_words if w not in self.dictionaries]
+            if g2p_words:
+                predictions = self.g2p_model(words=g2p_words, lang=language, batch_size=len(g2p_words), num_prons=4)
+                for _word, _predictions in zip(g2p_words, predictions):
+                    for pronuncation in _predictions:
+                        if pronuncation and pronuncation not in self.dictionaries[_word]:
+                            self.dictionaries[_word].append(pronuncation)
+                    if not self.dictionaries[_word]:
+                        self.dictionaries[_word] = self.dictionaries[self.oov_word]
+
+            pronunciation_dictionaries: Dict[str, List[List[str]]] = {
+                w: self.dictionaries[w] for w in oov_words if self.dictionaries[w]
+            }
+            return pronunciation_dictionaries
+
+        return {}
+
+    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[str]:
+        texts, text_len, sidx = [], 0, 0
+        for s, supervision in enumerate(supervisions):
+            text_len += len(supervision.text)
+            if text_len >= 2000 or s == len(supervisions) - 1:
+                text = ' '.join([sup.text for sup in supervisions[sidx : s + 1]])
+                texts.append(text)
+                sidx = s + 1
+                text_len = 0
+        if sidx < len(supervisions):
+            text = ' '.join([sup.text for sup in supervisions[sidx:]])
+            texts.append(text)
+        sentences = self.sentence_splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace)
+
+        supervisions, remainder = [], ''
+        for _sentences in sentences:
+            # Process and re-split special sentence types
+            processed_sentences = []
+            for s, _sentence in enumerate(_sentences):
+                if remainder:
+                    _sentence = remainder + _sentence
+                    remainder = ''
+
+                # Detect and split special sentence types: e.g., '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']  # noqa: E501
+                resplit_parts = self._resplit_special_sentence_types(_sentence)
+                if any(resplit_parts[-1].endswith(sp) for sp in [':', ':']):
+                    if s < len(_sentences) - 1:
+                        _sentences[s + 1] = resplit_parts[-1] + ' ' + _sentences[s + 1]
+                    else:  # last part
+                        remainder = resplit_parts[-1] + ' ' + remainder
+                    processed_sentences.extend(resplit_parts[:-1])
+                else:
+                    processed_sentences.extend(resplit_parts)
+
+            _sentences = processed_sentences
+
+            if remainder:
+                _sentences[0] = remainder + _sentences[0]
+                remainder = ''
+
+            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
+                supervisions.extend(Supervision(text=s) for s in _sentences)
+            else:
+                supervisions.extend(Supervision(text=s) for s in _sentences[:-1])
+                remainder += _sentences[-1] + ' '
+
+        if remainder.strip():
+            supervisions.append(Supervision(text=remainder.strip()))
+
+        return supervisions
+
+    def tokenize(self, supervisions: List[Supervision], split_sentence: bool = False) -> Tuple[str, Dict[str, Any]]:
+        if split_sentence:
+            self.init_sentence_splitter()
+            supervisions = self.split_sentences(supervisions)
+
+        pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
+        response = self.client_wrapper.post(
+            'tokenize',
+            json={
+                'supervisions': [s.to_dict() for s in supervisions],
+                'pronunciation_dictionaries': pronunciation_dictionaries,
+            },
+        )
+        if response.status_code != 200:
+            raise Exception(f'Failed to tokenize texts: {response.text}')
+        result = response.json()
+        lattice_id = result['id']
+        return lattice_id, (result['lattice_graph'], result['final_state'], result.get('acoustic_scale', 1.0))
+
+    def detokenize(
+        self,
+        lattice_id: str,
+        lattice_results: Tuple[torch.Tensor, Any, Any, float, float],
+        # return_supervisions: bool = True,
+        # return_details: bool = False,
+    ) -> List[Supervision]:
+        emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
+        response = self.client_wrapper.post(
+            'detokenize',
+            json={
+                'lattice_id': lattice_id,
+                'frame_shift': frame_shift,
+                'results': [t.to_dict() for t in results[0]],
+                'labels': labels[0],
+                'offset': offset,
+                'channel': channel,
+                'destroy_lattice': True,
+            },
+        )
+        if response.status_code != 200:
+            raise Exception(f'Failed to detokenize lattice: {response.text}')
+        result = response.json()
+        # if return_details:
+        #     raise NotImplementedError("return_details is not implemented yet")
+        return [Supervision.from_dict(s) for s in result['supervisions']]
+
+
+# Compute average score weighted by the span length
+def _score(spans):
+    if not spans:
+        return 0.0
+    # TokenSpan(token=token, start=start, end=end, score=scores[start:end].mean().item())
+    return round(sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans), ndigits=4)
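For readers skimming the diff, a small illustration of what the new re-splitting helper does. This is a hedged sketch, not part of the package: it assumes the module path listed in RECORD (`lattifai/tokenizer/tokenizer.py`) is importable and simply exercises the static method on the docstring examples above.

```python
# Hedged usage sketch for the new helper; not shipped with the package.
from lattifai.tokenizer.tokenizer import LatticeTokenizer

# A leading non-speech mark is split off from the speaker turn that follows it.
print(LatticeTokenizer._resplit_special_sentence_types('[APPLAUSE] &gt;&gt; MIRA MURATI: Thank you all'))
# -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI: Thank you all']

# Sentences without a special mark come back unchanged, wrapped in a list.
print(LatticeTokenizer._resplit_special_sentence_types('Welcome back.'))
# -> ['Welcome back.']
```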
{lattifai-0.2.0.dist-info → lattifai-0.2.2.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lattifai
-Version: 0.2.0
+Version: 0.2.2
 Summary: Lattifai Python SDK: Seamless Integration with Lattifai's Speech and Video AI Services
 Author-email: Lattifai Technologies <tech@lattifai.com>
 Maintainer-email: Lattice <tech@lattifai.com>
@@ -60,6 +60,7 @@ Requires-Dist: tgt
 Requires-Dist: onnxruntime
 Requires-Dist: resampy
 Requires-Dist: g2p-phonemizer==0.1.1
+Requires-Dist: wtpsplit>=2.1.6
 Provides-Extra: numpy
 Requires-Dist: numpy; extra == "numpy"
 Provides-Extra: test
@@ -117,9 +118,37 @@ Usage: lattifai align [OPTIONS] INPUT_AUDIO_PATH INPUT_SUBTITLE_PATH OUTPUT_SUBT
 Options:
   -F, --input_format [srt|vtt|ass|txt|auto]  Input Subtitle format.
   -D, --device [cpu|cuda|mps]  Device to use for inference.
+  --split_sentence  Smart sentence splitting based on punctuation semantics.
   --help  Show this message and exit.
 ```
 
+#### Understanding --split_sentence
+
+The `--split_sentence` option performs intelligent sentence re-splitting based on punctuation and semantic boundaries. This is especially useful when processing subtitles that combine multiple semantic units in a single segment, such as:
+
+- **Mixed content**: Non-speech elements (e.g., `[APPLAUSE]`, `[MUSIC]`) followed by actual dialogue
+- **Natural punctuation boundaries**: Colons, periods, and other punctuation marks that indicate semantic breaks
+- **Concatenated phrases**: Multiple distinct utterances joined together without proper separation
+
+**Example transformations**:
+```
+Input:  "[APPLAUSE] >> MIRA MURATI: Thank you all"
+Output: ["[APPLAUSE]", ">> MIRA MURATI: Thank you all"]
+
+Input:  "[MUSIC] Welcome back. Today we discuss AI."
+Output: ["[MUSIC]", "Welcome back.", "Today we discuss AI."]
+```
+
+This feature helps improve alignment accuracy by:
+1. Respecting punctuation-based semantic boundaries
+2. Separating distinct utterances for more precise timing
+3. Maintaining semantic context for each independent phrase
+
+**Usage**:
+```bash
+lattifai align --split_sentence audio.wav subtitle.srt output.srt
+```
+
 ### Python API
 
 ```python
@@ -136,6 +165,7 @@ client = LattifAI(
 result = client.alignment(
     audio="audio.wav",
     subtitle="subtitle.srt",
+    split_sentence=False,
     output_subtitle_path="output.srt"
 )
 ```
@@ -161,13 +191,21 @@ LattifAI(
 
 ```python
 client.alignment(
-    audio: str,                    # Path to audio file
-    subtitle: str,                 # Path to subtitle/text file
-    format: Optional[str] = None,  # 'srt', 'vtt', 'ass', 'txt' (auto-detect if None)
+    audio: str,                     # Path to audio file
+    subtitle: str,                  # Path to subtitle/text file
+    format: Optional[str] = None,   # 'srt', 'vtt', 'ass', 'txt' (auto-detect if None)
+    split_sentence: bool = False,   # Smart sentence splitting based on punctuation semantics
     output_subtitle_path: Optional[str] = None
 ) -> str
 ```
 
+**Parameters**:
+- `audio`: Path to the audio file to be aligned
+- `subtitle`: Path to the subtitle or text file
+- `format`: Subtitle format ('srt', 'vtt', 'ass', 'txt'). Auto-detected if None
+- `split_sentence`: Enable intelligent sentence re-splitting (default: False). Set to True when subtitles combine multiple semantic units (non-speech elements + dialogue, or multiple sentences) that would benefit from separate timing alignment
+- `output_subtitle_path`: Output path for aligned subtitle (optional)
+
 ## Examples
 
 ### Basic Text Alignment
@@ -178,6 +216,7 @@ client.alignment(
     audio="speech.wav",
     subtitle="transcript.txt",
     format="txt",
+    split_sentence=False,
     output_subtitle_path="output.srt"
 )
 ```
{lattifai-0.2.0.dist-info → lattifai-0.2.2.dist-info}/RECORD RENAMED
@@ -1,22 +1,22 @@
 lattifai/__init__.py,sha256=JXUg0dT74UyAtKOjewRs9ijr5sl9SYsc6oU_WItY314,1497
 lattifai/base_client.py,sha256=ktFtATjL9pLSJUD-VqeJKA1FHkrsGHX7Uq_x00H7gO8,3322
-lattifai/client.py,sha256=NjHUqMZFoRxuyxdzBNEcn5kU57gJl31FSb6i9DDcoCw,5059
+lattifai/client.py,sha256=QXbdTuDA5Aap2udu4iig7CVxlgwOIrydpuLlVASs0aA,5145
 lattifai/bin/__init__.py,sha256=7YhmtEM8kbxJtz2-KIskvpLKBZAvkMSceVx8z4fkgQ4,61
-lattifai/bin/align.py,sha256=nw-wABsNyxC8zN3siiqgNi1Foou3cYhVzIAomuVrFOY,1436
+lattifai/bin/align.py,sha256=nQs901SDYmxyH2AXBtjgZGzrpwLaxANQRYP49Bd1AWo,1669
 lattifai/bin/cli_base.py,sha256=y535WXDRX8StloFn9icpfw7nQt0JxuWBIuPMnRxAYy8,392
 lattifai/bin/subtitle.py,sha256=bUWImAHpvyY59Vskqb5loQiD5ytQOxR8lTQRiQ4LyNA,647
 lattifai/io/__init__.py,sha256=vHWRN7MvAch-GUeFqqO-gM57SM-4YOpGUjIxFJdjfPA,671
-lattifai/io/reader.py,sha256=ErPnPMUvYQpjZ7Vd86EsHUkOcEfKdoI8iM3yKHRzSOQ,2576
+lattifai/io/reader.py,sha256=mtgxT5c_BiHbqqJvPE3nf7TIe_OcWgGu1zr6iXasfrk,2591
 lattifai/io/supervision.py,sha256=5UfSsgBhXoDU3-6drDtoD7y8HIiA4xRKZnbOKgeejwM,354
 lattifai/io/writer.py,sha256=1eAEFLlL8kricxRDPFBtVmeC4IiFyFnjbWXvw0VU-q4,2036
-lattifai/tokenizers/__init__.py,sha256=aqv44PDtq6g3oFFKW_l4HSR5ywT5W8eP1dHHywIvBfs,72
-lattifai/tokenizers/phonemizer.py,sha256=SfRi1KIMpmaao6OVmR1h_I_3QU-vrE6D5bh72Afg5XM,1759
-lattifai/tokenizers/tokenizer.py,sha256=u4lgS6-jN9cLuMNIojA4Swfsqb1EcyI7Bh_iw7tuL-s,5818
+lattifai/tokenizer/__init__.py,sha256=aqv44PDtq6g3oFFKW_l4HSR5ywT5W8eP1dHHywIvBfs,72
+lattifai/tokenizer/phonemizer.py,sha256=SfRi1KIMpmaao6OVmR1h_I_3QU-vrE6D5bh72Afg5XM,1759
+lattifai/tokenizer/tokenizer.py,sha256=Yuo0pLPQnF2uX0Fm5g8i5vtcADn7GeLpSqdGpMJgTww,11492
 lattifai/workers/__init__.py,sha256=s6YfkIq4FDIAzY9sPjRpXnJfszj2repqnMTqydRM5Zw,83
 lattifai/workers/lattice1_alpha.py,sha256=1VFo59EcygEctTHOhkcII8v3_mrj8JEJ8Fcaqk_7LVo,5762
-lattifai-0.2.0.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
-lattifai-0.2.0.dist-info/METADATA,sha256=bXb6z5D1C-9YwHeycSFs8SAhUp8VNJbE9u-J9lvYMZ8,8997
-lattifai-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-lattifai-0.2.0.dist-info/entry_points.txt,sha256=CwTI2NbJvF9msIHboAfTA99cmDr_HOWoODjS8R64JOw,131
-lattifai-0.2.0.dist-info/top_level.txt,sha256=-OVWZ68YYFcTN13ARkLasp2OUappe9wEVq-CKes7jM4,17
-lattifai-0.2.0.dist-info/RECORD,,
+lattifai-0.2.2.dist-info/licenses/LICENSE,sha256=LNuoH5jpXXNKgjQ3XLwztFq8D3O7kZI-LSg81o4ym2M,1065
+lattifai-0.2.2.dist-info/METADATA,sha256=4vmPOYKsIlvADiw0zUDQ2dbDpe-vOV-o5A0Hs1p7xfg,10971
+lattifai-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+lattifai-0.2.2.dist-info/entry_points.txt,sha256=CwTI2NbJvF9msIHboAfTA99cmDr_HOWoODjS8R64JOw,131
+lattifai-0.2.2.dist-info/top_level.txt,sha256=-OVWZ68YYFcTN13ARkLasp2OUappe9wEVq-CKes7jM4,17
+lattifai-0.2.2.dist-info/RECORD,,
lattifai/tokenizers/tokenizer.py DELETED
@@ -1,147 +0,0 @@
-import gzip
-import pickle
-from collections import defaultdict
-from itertools import chain
-from typing import Any, Dict, List, Optional, Tuple
-
-import torch
-
-from lattifai.base_client import SyncAPIClient
-from lattifai.io import Supervision
-from lattifai.tokenizers.phonemizer import G2Phonemizer
-
-PUNCTUATION = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
-PUNCTUATION_SPACE = PUNCTUATION + ' '
-STAR_TOKEN = '※'
-
-GROUPING_SEPARATOR = '✹'
-
-MAXIMUM_WORD_LENGTH = 40
-
-
-class LatticeTokenizer:
-    """Tokenizer for converting Lhotse Cut to LatticeGraph."""
-
-    def __init__(self, client_wrapper: SyncAPIClient):
-        self.client_wrapper = client_wrapper
-        self.words: List[str] = []
-        self.g2p_model: Any = None  # Placeholder for G2P model
-        self.dictionaries = defaultdict(lambda: [])
-        self.oov_word = '<unk>'
-
-    @staticmethod
-    def from_pretrained(
-        client_wrapper: SyncAPIClient,
-        model_path: str,
-        device: str = 'cpu',
-        compressed: bool = True,
-    ):
-        """Load tokenizer from exported binary file"""
-        from pathlib import Path
-
-        words_model_path = f'{model_path}/words.bin'
-        if compressed:
-            with gzip.open(words_model_path, 'rb') as f:
-                data = pickle.load(f)
-        else:
-            with open(words_model_path, 'rb') as f:
-                data = pickle.load(f)
-
-        tokenizer = LatticeTokenizer(client_wrapper=client_wrapper)
-        tokenizer.words = data['words']
-        tokenizer.dictionaries = defaultdict(list, data['dictionaries'])
-        tokenizer.oov_word = data['oov_word']
-
-        g2p_model_path = f'{model_path}/g2p.bin' if Path(f'{model_path}/g2p.bin').exists() else None
-        if g2p_model_path:
-            tokenizer.g2p_model = G2Phonemizer(g2p_model_path, device=device)
-        return tokenizer
-
-    def prenormalize(self, texts: List[str], language: Optional[str] = None) -> List[str]:
-        if not self.g2p_model:
-            raise ValueError('G2P model is not loaded, cannot prenormalize texts')
-
-        oov_words = []
-        for text in texts:
-            words = text.lower().replace('-', ' ').replace('—', ' ').replace('–', ' ').split()
-            oovs = [w for w in words if w not in self.words]
-            if oovs:
-                oov_words.extend([w for w in oovs if (w not in self.words and len(w) <= MAXIMUM_WORD_LENGTH)])
-
-        oov_words = list(set(oov_words))
-        if oov_words:
-            indexs = []
-            for k, _word in enumerate(oov_words):
-                if any(_word.startswith(p) and _word.endswith(q) for (p, q) in [('(', ')'), ('[', ']')]):
-                    self.dictionaries[_word] = self.dictionaries[self.oov_word]
-                else:
-                    _word = _word.strip(PUNCTUATION_SPACE)
-                    if not _word or _word in self.words:
-                        indexs.append(k)
-            for idx in sorted(indexs, reverse=True):
-                del oov_words[idx]
-
-            g2p_words = [w for w in oov_words if w not in self.dictionaries]
-            if g2p_words:
-                predictions = self.g2p_model(words=g2p_words, lang=language, batch_size=len(g2p_words), num_prons=4)
-                for _word, _predictions in zip(g2p_words, predictions):
-                    for pronuncation in _predictions:
-                        if pronuncation and pronuncation not in self.dictionaries[_word]:
-                            self.dictionaries[_word].append(pronuncation)
-
-            pronunciation_dictionaries: Dict[str, List[List[str]]] = {
-                w: self.dictionaries[w] for w in oov_words if self.dictionaries[w]
-            }
-            return pronunciation_dictionaries
-
-        return {}
-
-    def tokenize(self, supervisions: List[Supervision]) -> Tuple[str, Dict[str, Any]]:
-        pronunciation_dictionaries = self.prenormalize([s.text for s in supervisions])
-        response = self.client_wrapper.post(
-            'tokenize',
-            json={
-                'supervisions': [s.to_dict() for s in supervisions],
-                'pronunciation_dictionaries': pronunciation_dictionaries,
-            },
-        )
-        if response.status_code != 200:
-            raise Exception(f'Failed to tokenize texts: {response.text}')
-        result = response.json()
-        lattice_id = result['id']
-        return lattice_id, (result['lattice_graph'], result['final_state'], result.get('acoustic_scale', 1.0))
-
-    def detokenize(
-        self,
-        lattice_id: str,
-        lattice_results: Tuple[torch.Tensor, Any, Any, float, float],
-        # return_supervisions: bool = True,
-        # return_details: bool = False,
-    ) -> List[Supervision]:
-        emission, results, labels, frame_shift, offset, channel = lattice_results  # noqa: F841
-        response = self.client_wrapper.post(
-            'detokenize',
-            json={
-                'lattice_id': lattice_id,
-                'frame_shift': frame_shift,
-                'results': [t.to_dict() for t in results[0]],
-                'labels': labels[0],
-                'offset': offset,
-                'channel': channel,
-                'destroy_lattice': True,
-            },
-        )
-        if response.status_code != 200:
-            raise Exception(f'Failed to detokenize lattice: {response.text}')
-        result = response.json()
-        # if return_details:
-        #     raise NotImplementedError("return_details is not implemented yet")
-        return [Supervision.from_dict(s) for s in result['supervisions']]
-
-
-# Compute average score weighted by the span length
-def _score(spans):
-    if not spans:
-        return 0.0
-    # TokenSpan(token=token, start=start, end=end, score=scores[start:end].mean().item())
-    return round(sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans), ndigits=4)
File without changes
File without changes