lattifai 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/METADATA +129 -58
- lattifai-1.3.1.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.1.dist-info}/top_level.txt +0 -0
lattifai/caption/utils.py
DELETED
|
@@ -1,474 +0,0 @@
|
|
|
1
|
-
"""Utility functions for caption processing.
|
|
2
|
-
|
|
3
|
-
This module provides utility functions for:
|
|
4
|
-
- Timecode offset handling (for professional timelines starting at 01:00:00:00)
|
|
5
|
-
- Overlap/collision resolution (merge or trim modes)
|
|
6
|
-
- SRT format optimization (UTF-8 BOM, comma-separated milliseconds)
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from copy import deepcopy
|
|
10
|
-
from dataclasses import dataclass
|
|
11
|
-
from enum import Enum
|
|
12
|
-
from typing import TYPE_CHECKING, List, Optional, Tuple
|
|
13
|
-
|
|
14
|
-
if TYPE_CHECKING:
|
|
15
|
-
from .supervision import Supervision
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class CollisionMode(Enum):
    """Strategy applied when two or more captions overlap in time."""

    # Combine the overlapping captions into one caption, joined by line breaks.
    MERGE = "merge"
    # Shorten the earlier caption so that it ends before the later one starts.
    TRIM = "trim"
    # Leave overlaps untouched (may cause issues in some NLE).
    KEEP = "keep"
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
@dataclass
class TimecodeOffset:
    """Configuration for a timecode offset.

    Professional timelines often start at 01:00:00:00 instead of 00:00:00:00.
    This class stores such an offset and converts it to seconds.

    Attributes:
        hours: Hour offset (default 0)
        minutes: Minute offset (default 0)
        seconds: Second offset, may be fractional (default 0.0)
        frames: Frame offset (default 0)
        fps: Frame rate used to convert ``frames`` into seconds (default 25.0)
    """

    hours: int = 0
    minutes: int = 0
    seconds: float = 0.0
    frames: int = 0
    fps: float = 25.0

    @property
    def total_seconds(self) -> float:
        """Calculate the total offset in seconds.

        The frame count is converted with ``frames / fps``. The division is
        skipped when there are no frames, so a frame-less offset does not
        fail merely because ``fps`` is zero.
        """
        frame_seconds = self.frames / self.fps if self.frames else 0.0
        return self.hours * 3600 + self.minutes * 60 + self.seconds + frame_seconds

    @classmethod
    def from_timecode(cls, timecode: str, fps: float = 25.0) -> "TimecodeOffset":
        """Create an offset from a timecode string.

        Args:
            timecode: Timecode string (HH:MM:SS:FF or HH:MM:SS.mmm). A ``;``
                separator (drop-frame notation) is treated like ``:``.
                Missing trailing fields default to zero; fields after the
                fourth are ignored.
            fps: Frame rate stored on the resulting offset

        Returns:
            TimecodeOffset instance

        Raises:
            ValueError: If a field that is present cannot be parsed as a number.
        """
        # Drop-frame timecodes use ';' before the frame field; normalise it.
        parts = timecode.replace(";", ":").split(":")

        try:
            hours = int(parts[0])
            minutes = int(parts[1]) if len(parts) > 1 else 0
            seconds = float(parts[2]) if len(parts) > 2 else 0.0
            # A fractional seconds field means the millisecond form, which
            # carries no frame field; any fourth field is ignored in that case.
            if len(parts) > 3 and "." not in parts[2]:
                frames = int(parts[3])
            else:
                frames = 0
        except ValueError as err:
            raise ValueError(
                f"Invalid timecode {timecode!r}: expected HH:MM:SS:FF or HH:MM:SS.mmm"
            ) from err

        return cls(hours=hours, minutes=minutes, seconds=seconds, frames=frames, fps=fps)

    @classmethod
    def broadcast_start(cls, fps: float = 25.0) -> "TimecodeOffset":
        """Create the standard broadcast start offset (01:00:00:00).

        Args:
            fps: Frame rate stored on the resulting offset

        Returns:
            TimecodeOffset of exactly one hour
        """
        return cls(hours=1, fps=fps)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def apply_timecode_offset(
    supervisions: List["Supervision"],
    offset: TimecodeOffset,
) -> List["Supervision"]:
    """Return copies of the supervisions shifted by a timecode offset.

    Each input supervision is rebuilt with ``offset.total_seconds`` added to
    its start time; durations are left untouched. The alignment is deep-copied
    and the custom mapping shallow-copied, so the inputs are not modified.
    Items under the ``"word"`` alignment key are shifted by the same amount;
    other alignment keys are copied as they are.

    Args:
        supervisions: List of supervision segments
        offset: Timecode offset to apply

    Returns:
        New list of supervisions with the offset applied
    """
    from .supervision import Supervision

    shift = offset.total_seconds
    shifted = []

    for original in supervisions:
        clone = Supervision(
            text=original.text,
            start=original.start + shift,
            duration=original.duration,
            speaker=original.speaker,
            id=original.id,
            language=original.language,
            alignment=deepcopy(getattr(original, "alignment", None)),
            custom=original.custom.copy() if original.custom else None,
        )

        # Word-level timings are absolute, so they must move with the segment.
        if clone.alignment and "word" in clone.alignment:
            from lhotse.supervision import AlignmentItem

            clone.alignment["word"] = [
                AlignmentItem(
                    symbol=item.symbol,
                    start=item.start + shift,
                    duration=item.duration,
                    score=item.score,
                )
                for item in clone.alignment["word"]
            ]

        shifted.append(clone)

    return shifted
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def resolve_overlaps(
    supervisions: List["Supervision"],
    mode: CollisionMode = CollisionMode.MERGE,
    gap_threshold: float = 0.05,
) -> List["Supervision"]:
    """Resolve overlapping supervisions.

    Supervisions are sorted by start time and gathered into groups in which
    each member starts before the latest end seen so far in the group. A
    group of one is passed through unchanged. Larger groups are either merged
    into a single supervision (MERGE) or each member except the last is
    shortened so it ends before the next member starts (TRIM).

    Args:
        supervisions: List of supervision segments (sorted internally)
        mode: How to handle overlaps (MERGE, TRIM, or KEEP)
        gap_threshold: Gap left before the next caption in TRIM mode, in
            seconds; also the minimum duration a trimmed caption is given

    Returns:
        List of supervisions with overlaps resolved. When the input is empty
        or ``mode`` is KEEP, the input object itself is returned unchanged.
    """
    from .supervision import Supervision

    if not supervisions or mode == CollisionMode.KEEP:
        return supervisions

    # Sort by start time
    sorted_sups = sorted(supervisions, key=lambda x: x.start)
    result = []

    i = 0
    while i < len(sorted_sups):
        current = sorted_sups[i]

        # Collect the run of supervisions that overlap this group. The group's
        # latest end is tracked incrementally instead of being rescanned for
        # every candidate.
        overlapping = [current]
        group_end = current.end
        j = i + 1
        while j < len(sorted_sups) and sorted_sups[j].start < group_end:
            next_sup = sorted_sups[j]
            overlapping.append(next_sup)
            group_end = max(group_end, next_sup.end)
            j += 1

        if len(overlapping) == 1:
            # No overlap
            result.append(current)
            i += 1
        elif mode == CollisionMode.MERGE:
            # Merge all overlapping into one
            result.append(_merge_supervisions(overlapping))
            i = j
        elif mode == CollisionMode.TRIM:
            # Trim each to not overlap with next
            for k, sup in enumerate(overlapping[:-1]):
                next_sup = overlapping[k + 1]
                # End gap_threshold before the next start, but never shrink
                # below gap_threshold and never extend the original duration.
                new_duration = max(gap_threshold, next_sup.start - sup.start - gap_threshold)
                trimmed = Supervision(
                    text=sup.text,
                    start=sup.start,
                    duration=min(sup.duration, new_duration),
                    speaker=sup.speaker,
                    id=sup.id,
                    language=sup.language,
                    # alignment and custom are shared with the input, not copied
                    alignment=sup.alignment,
                    custom=sup.custom,
                )
                result.append(trimmed)
            # Add last one as-is
            result.append(overlapping[-1])
            i = j
        else:
            # Any other mode value: emit the current item and advance by one.
            result.append(current)
            i += 1

    return result
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
def _merge_supervisions(supervisions: List["Supervision"]) -> "Supervision":
    """Collapse a group of overlapping supervisions into a single one.

    The result spans from the earliest start to the latest end. Its text has
    one line per input, each prefixed with ``- `` and, when the input has a
    speaker, with ``<speaker>: ``. The id and language come from the first
    input. The speaker is the first input's speaker when exactly one distinct
    non-empty speaker occurs in the group, otherwise None.

    Args:
        supervisions: List of overlapping supervisions

    Returns:
        Single merged supervision; a one-element list yields that element itself

    Raises:
        ValueError: If ``supervisions`` is empty
    """
    from .supervision import Supervision

    if not supervisions:
        raise ValueError("Cannot merge empty supervision list")
    if len(supervisions) == 1:
        return supervisions[0]

    first = supervisions[0]

    # Timing envelope of the whole group
    merged_start = min(item.start for item in supervisions)
    merged_end = max(item.end for item in supervisions)

    def _as_bullet(item):
        # One dash-prefixed line per caption, naming the speaker when known
        body = item.text.strip() if item.text else ""
        return f"- {item.speaker}: {body}" if item.speaker else f"- {body}"

    merged_text = "\n".join(_as_bullet(item) for item in supervisions)

    distinct_speakers = {item.speaker for item in supervisions if item.speaker}
    merged_speaker = first.speaker if len(distinct_speakers) == 1 else None

    return Supervision(
        text=merged_text,
        start=merged_start,
        duration=merged_end - merged_start,
        speaker=merged_speaker,
        id=first.id,
        language=first.language,
    )
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
def format_srt_timestamp(seconds: float) -> str:
    """Format a timestamp for SRT output.

    SRT uses a comma as the millisecond separator: HH:MM:SS,mmm

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating the fractional part instead loses a millisecond to
    floating-point error (1.001 would render as ``,000``), and rounding on
    the whole value lets a carry propagate into seconds, minutes and hours.

    Args:
        seconds: Time in seconds; negative values are clamped to zero

    Returns:
        SRT-formatted timestamp string
    """
    if seconds < 0:
        seconds = 0

    total_millis = int(round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
def generate_srt_content(
    supervisions: List["Supervision"],
    include_speaker: bool = True,
    use_bom: bool = True,
) -> bytes:
    """Serialise supervisions as SRT and return the encoded bytes.

    Every supervision becomes one cue: a 1-based sequence number, a
    ``start --> end`` timestamp line, the stripped text and a blank line.
    Cues are joined with ``\\n`` and encoded as UTF-8.

    Args:
        supervisions: List of supervision segments
        include_speaker: Prefix the text with ``<speaker>: `` when the
            supervision has a speaker, unless its ``custom`` mapping sets
            ``original_speaker`` to a falsy value
        use_bom: Include UTF-8 BOM for Windows compatibility

    Returns:
        SRT content as bytes
    """
    cue_lines = []

    for number, sup in enumerate(supervisions, 1):
        caption = sup.text.strip() if sup.text else ""

        if include_speaker and sup.speaker:
            custom = getattr(sup, "custom", None)
            # The label is left off only when custom explicitly records that
            # the speaker was not part of the original text.
            label_suppressed = bool(custom) and not custom.get("original_speaker", True)
            if not label_suppressed:
                caption = f"{sup.speaker}: {caption}"

        cue_lines.extend(
            [
                str(number),
                f"{format_srt_timestamp(sup.start)} --> {format_srt_timestamp(sup.end)}",
                caption,
                "",  # blank line between entries
            ]
        )

    payload = "\n".join(cue_lines).encode("utf-8")

    # UTF-8 with BOM for Windows compatibility
    return b"\xef\xbb\xbf" + payload if use_bom else payload
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
def detect_overlaps(supervisions: List["Supervision"]) -> List[Tuple[int, int]]:
    """Find every pair of supervisions that overlap in time.

    The supervisions are ordered by start time, keeping their original
    positions. Each one is then compared with the ones that follow it until
    one starts at or after its end; later ones cannot overlap it either.

    Args:
        supervisions: List of supervision segments

    Returns:
        List of tuples (index1, index2) of original list positions, where
        index1 refers to the supervision that starts first
    """
    ordered = sorted(enumerate(supervisions), key=lambda pair: pair[1].start)
    total = len(ordered)
    pairs = []

    for position, (left_index, left) in enumerate(ordered):
        cursor = position + 1
        while cursor < total:
            right_index, right = ordered[cursor]
            if right.start >= left.end:
                break
            pairs.append((left_index, right_index))
            cursor += 1

    return pairs
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
def split_long_lines(
    supervisions: List["Supervision"],
    max_chars_per_line: int = 42,
    max_lines: int = 2,
) -> List["Supervision"]:
    """Split supervisions with long text into multiple segments.

    Useful for broadcast compliance where line length limits are strict.

    The text is first wrapped into lines of at most ``max_chars_per_line``
    characters with no cap on the number of lines, and the lines are then
    grouped ``max_lines`` at a time. Deciding by total character count alone
    is not enough: text can fit the character budget and still need more
    lines than allowed once broken at word boundaries, and a capped wrap
    would then silently drop the remaining words.

    Args:
        supervisions: List of supervision segments
        max_chars_per_line: Maximum characters per line
        max_lines: Maximum lines per supervision (values below 1 are
            treated as 1)

    Returns:
        New list with long supervisions split. A supervision that fits is
        rebuilt with wrapped text and keeps its alignment and custom data.
        A split supervision shares its duration equally between the pieces;
        only the first piece keeps the speaker, ids get an ``_<index>``
        suffix, and alignment and custom data are not carried over.
    """
    from .supervision import Supervision

    lines_per_cue = max(1, max_lines)
    result = []

    for sup in supervisions:
        text = sup.text.strip() if sup.text else ""

        # A wrap never needs more lines than there are words, so passing the
        # word count as the line cap makes the wrap effectively uncapped.
        word_count = len(text.split())
        if word_count:
            wrapped_lines = _wrap_text(text, max_chars_per_line, word_count).split("\n")
        else:
            wrapped_lines = []

        groups = [
            wrapped_lines[k : k + lines_per_cue]
            for k in range(0, len(wrapped_lines), lines_per_cue)
        ]

        if len(groups) <= 1:
            # Text fits in one caption; only the line wrapping changes.
            result.append(
                Supervision(
                    text="\n".join(wrapped_lines),
                    start=sup.start,
                    duration=sup.duration,
                    speaker=sup.speaker,
                    id=sup.id,
                    language=sup.language,
                    alignment=sup.alignment,
                    custom=sup.custom,
                )
            )
            continue

        # Split into multiple supervisions of equal duration.
        chunk_duration = sup.duration / len(groups)
        for i, group in enumerate(groups):
            result.append(
                Supervision(
                    text="\n".join(group),
                    start=sup.start + i * chunk_duration,
                    duration=chunk_duration,
                    speaker=sup.speaker if i == 0 else None,
                    id=f"{sup.id}_{i}" if sup.id else None,
                    language=sup.language,
                )
            )

    return result
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
def _wrap_text(text: str, max_chars: int, max_lines: int) -> str:
|
|
427
|
-
"""Wrap text to fit within character and line limits."""
|
|
428
|
-
words = text.split()
|
|
429
|
-
lines = []
|
|
430
|
-
current_line = []
|
|
431
|
-
current_length = 0
|
|
432
|
-
|
|
433
|
-
for word in words:
|
|
434
|
-
word_len = len(word)
|
|
435
|
-
if current_length + word_len + (1 if current_line else 0) <= max_chars:
|
|
436
|
-
current_line.append(word)
|
|
437
|
-
current_length += word_len + (1 if len(current_line) > 1 else 0)
|
|
438
|
-
else:
|
|
439
|
-
if current_line:
|
|
440
|
-
lines.append(" ".join(current_line))
|
|
441
|
-
current_line = [word]
|
|
442
|
-
current_length = word_len
|
|
443
|
-
|
|
444
|
-
if len(lines) >= max_lines:
|
|
445
|
-
break
|
|
446
|
-
|
|
447
|
-
if current_line and len(lines) < max_lines:
|
|
448
|
-
lines.append(" ".join(current_line))
|
|
449
|
-
|
|
450
|
-
return "\n".join(lines[:max_lines])
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
def _split_text_chunks(text: str, max_chars: int) -> List[str]:
|
|
454
|
-
"""Split text into chunks that fit within character limit."""
|
|
455
|
-
words = text.split()
|
|
456
|
-
chunks = []
|
|
457
|
-
current_chunk = []
|
|
458
|
-
current_length = 0
|
|
459
|
-
|
|
460
|
-
for word in words:
|
|
461
|
-
word_len = len(word)
|
|
462
|
-
if current_length + word_len + (1 if current_chunk else 0) <= max_chars:
|
|
463
|
-
current_chunk.append(word)
|
|
464
|
-
current_length += word_len + (1 if len(current_chunk) > 1 else 0)
|
|
465
|
-
else:
|
|
466
|
-
if current_chunk:
|
|
467
|
-
chunks.append(" ".join(current_chunk))
|
|
468
|
-
current_chunk = [word]
|
|
469
|
-
current_length = word_len
|
|
470
|
-
|
|
471
|
-
if current_chunk:
|
|
472
|
-
chunks.append(" ".join(current_chunk))
|
|
473
|
-
|
|
474
|
-
return chunks
|
lattifai-1.2.2.dist-info/RECORD
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
lattifai/__init__.py,sha256=RXa1IK8Qt6jsAnLlxecOCZmREqv2naXx6T1Fy0g6pqU,1953
|
|
2
|
-
lattifai/audio2.py,sha256=P3N8_BwiscbetzDbkbj-n8BcMu2vWD6-MvtQvGwWWf0,17448
|
|
3
|
-
lattifai/client.py,sha256=pTtpOZRpc3weXkjKZ_-FZLsbbs1CrzVqM4fVqRjiYTc,17179
|
|
4
|
-
lattifai/errors.py,sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291
|
|
5
|
-
lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
|
|
6
|
-
lattifai/mixin.py,sha256=0I-rwvZumaYt8KFTfiVPT2wpXs-JfTEnLOPTdI5r-bM,26115
|
|
7
|
-
lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659
|
|
8
|
-
lattifai/utils.py,sha256=5LeunAN0OQ1jWoKMIThpXSEOxFYD2dCRTdsglosodUU,7963
|
|
9
|
-
lattifai/alignment/__init__.py,sha256=ggOF4MlbnBD7U9yrcyRb1caBR3se_KGA87cfYlyX8RY,450
|
|
10
|
-
lattifai/alignment/lattice1_aligner.py,sha256=WG3mJM4fGyYkY7FdqhPE10yXwBzhdj2TkS-6LF8F_9k,6463
|
|
11
|
-
lattifai/alignment/lattice1_worker.py,sha256=hQbZTgncPq3n-b_l-gUPDPfm460EwuZTKveErgWLWNk,10891
|
|
12
|
-
lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
|
|
13
|
-
lattifai/alignment/punctuation.py,sha256=qLcvuXhBzoEa6bznWZiAB5TAxR6eLr_ZV-PnnCY90UA,1218
|
|
14
|
-
lattifai/alignment/segmenter.py,sha256=0s0eABe0rLAo7eNfl0l5e_knxmZba_BjabPdqsRD45E,6284
|
|
15
|
-
lattifai/alignment/sentence_splitter.py,sha256=2ORvfAgW9yQaqHjts2zlSFjTiNDZF5Fhd5KZX19QWe0,14781
|
|
16
|
-
lattifai/alignment/text_align.py,sha256=PN7RNL5d6jim96zeCUdfdFEdGw--I8zc0kcgWIFJIXU,14910
|
|
17
|
-
lattifai/alignment/tokenizer.py,sha256=AQzXbJ_AW8cg4CAd5TVl1Qd3zH56uy9whX9LVFQ4AaA,17835
|
|
18
|
-
lattifai/caption/__init__.py,sha256=tyIsUvCbImw_qrhp0Nxxrk4vt9szJIPlRcTBviOQkuI,2641
|
|
19
|
-
lattifai/caption/caption.py,sha256=2PHLRDG0Ks4JMl6jNDeXlrI1kpYinektbZ15GwwTcFI,23479
|
|
20
|
-
lattifai/caption/standardize.py,sha256=1pAB8BmziTqYkgj7abCXUcNmNwSV1EAR0PrmbpAEipU,21491
|
|
21
|
-
lattifai/caption/supervision.py,sha256=DRrM8lfKU_x9aVBcLG6xnT0xIJrnc8jzHpzcSwQOg8c,905
|
|
22
|
-
lattifai/caption/utils.py,sha256=YOdJCXhy-6DdrZUkdrJHuPE9sbEHsE9Z7-Vdo4Z5lLY,14406
|
|
23
|
-
lattifai/caption/formats/__init__.py,sha256=UGl7Y0ybMf_F4hiNMMwoKOrpWNxs5m2tiD5zkbwjokY,5240
|
|
24
|
-
lattifai/caption/formats/base.py,sha256=gGeKLKEAB2Hs05R09QMkq5KlXMIQ7bbkUhLct40IcU8,6314
|
|
25
|
-
lattifai/caption/formats/gemini.py,sha256=zIxK7Vxo2YB1eXFiWnsNrz9WSx69lMN0rL-Sd3r57iI,29389
|
|
26
|
-
lattifai/caption/formats/json.py,sha256=s3tFWMUzkWx_IL46phPJnFbJW476Yh_GsxcwD7Q_Mfw,6416
|
|
27
|
-
lattifai/caption/formats/lrc.py,sha256=CWS9wD3Tp6xuvF5gP1nTlVBsBXYnu59_4m4zNRem-c0,11084
|
|
28
|
-
lattifai/caption/formats/pysubs2.py,sha256=eOTQKRbsFStW9gTHaxuAtD7ha1OnrFdqcNLsjdxpHRY,22591
|
|
29
|
-
lattifai/caption/formats/sbv.py,sha256=QUgm5lfRSc2IGSX955yQ7rPiSlaYrOHvniUigr2sF7Y,4520
|
|
30
|
-
lattifai/caption/formats/tabular.py,sha256=HHoiif2yIkMjO9f1bRNAk5Pc0CfkA1mtCFHk5sdLocM,11701
|
|
31
|
-
lattifai/caption/formats/textgrid.py,sha256=m2jMTwLhQa8gbm0Fs1XyEUdiHJaSfCxB9jrYsdk8j7Q,6659
|
|
32
|
-
lattifai/caption/formats/ttml.py,sha256=pJ_wd9pX-MwOhDFMeAHnCpbDiLtIhs888rkW26T7w9Y,23236
|
|
33
|
-
lattifai/caption/formats/vtt.py,sha256=f5OWqsr-2-ddW3CnMtiiqYKQz-hLYRn2B9WM_eT4-AM,17102
|
|
34
|
-
lattifai/caption/formats/nle/__init__.py,sha256=DPBnWPtxEKCC0J_8DCeTyXULPgkrqFT2jbKvkazAx0s,257
|
|
35
|
-
lattifai/caption/formats/nle/audition.py,sha256=65ipbUPdwgvNcUA--dQuisWCbmlt6nHPRbSdl4UUF2Q,18076
|
|
36
|
-
lattifai/caption/formats/nle/avid.py,sha256=UQwFlN4-Myly-kXZxuJTu-7IunEN2_PtAcK9YGQVpMA,14403
|
|
37
|
-
lattifai/caption/formats/nle/fcpxml.py,sha256=76NL6PeIR3KAG1BZscAZdoFJr5wcNdoS4j3VZsOxFV8,18317
|
|
38
|
-
lattifai/caption/formats/nle/premiere.py,sha256=Y2nXSWxI0J0YhV3iHJ9jDrFs0S_5sX32_fEi9SJyVt0,21319
|
|
39
|
-
lattifai/caption/parsers/__init__.py,sha256=z1JMr47FVl7CGbBDg92PKj9RabKktJIUv9iTmmKfEes,227
|
|
40
|
-
lattifai/caption/parsers/text_parser.py,sha256=rQv-aedTWowBe7crvYEOrHqrgKdpNBPcM8HeU-jElHY,4793
|
|
41
|
-
lattifai/cli/__init__.py,sha256=PdqoCTqRSFSWrqL3FjBTa5VzJy_e6Rq0OzyT7YkyHpc,541
|
|
42
|
-
lattifai/cli/alignment.py,sha256=06em-Uaf6NhSz1ce4dwT2r8n56NrtibR7ZsSkmc18Kc,5954
|
|
43
|
-
lattifai/cli/caption.py,sha256=b2mSVFVgL76b4FB5UoJ7AW5iGzPfKiWiLhbM96z_QoA,10371
|
|
44
|
-
lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
|
|
45
|
-
lattifai/cli/transcribe.py,sha256=YhEalG3TQRK7esAN5SOZUQPwIk3TAI9ZknO8cW8C21Q,8038
|
|
46
|
-
lattifai/cli/youtube.py,sha256=CqAxSC_sErslnrnx2RSwAHc7INKET0wLG9Mf_144O-A,6238
|
|
47
|
-
lattifai/config/__init__.py,sha256=JOOn2WbvWXBN6a_3fSNt24W7xnJY7wn8RyNLa0XIY3s,724
|
|
48
|
-
lattifai/config/alignment.py,sha256=ObWf896GGLfP4jsxJaSk6nUyzeF4MvW-ULoPYa8kd9w,4987
|
|
49
|
-
lattifai/config/caption.py,sha256=D4sKNUestwFessU1nZrUqCTsIzYPgpTg12SZlm0HzbQ,15200
|
|
50
|
-
lattifai/config/client.py,sha256=46b816MiYja3Uan_3wjnhtqDr0M6T-FqEygJ3e50IZc,1664
|
|
51
|
-
lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
|
|
52
|
-
lattifai/config/media.py,sha256=nxvgC7zeLsthCARPPUbnK2eMJY8R1d-1XgiAsy8kroA,15568
|
|
53
|
-
lattifai/config/transcription.py,sha256=_gPJD6cob_jWNdf841nBHhAqJGCxS6PfSyvx2W_vPcM,3082
|
|
54
|
-
lattifai/diarization/__init__.py,sha256=-ZZ_a5hIQgnlHIOehCTtmVmWOWC2H6eOhSs4AcVtRtk,1782
|
|
55
|
-
lattifai/diarization/lattifai.py,sha256=tCnFL6ywITqeKR8YoCsYvyJxNoIwoC6GsnI9zkXNB-Q,3128
|
|
56
|
-
lattifai/transcription/__init__.py,sha256=vMHciyCEPKhhfM3KjMCeDqnyxU1oghF8g5o5SvpnT_4,2669
|
|
57
|
-
lattifai/transcription/base.py,sha256=A2qnocdRCCbvy8mKP0f3K3mx3ZvYyxVXir3aJ2iU19s,4592
|
|
58
|
-
lattifai/transcription/gemini.py,sha256=LJSQt9nGqQdEG6ZFXoHWltumyMEM7-Ezy8ss0iPJb7k,12414
|
|
59
|
-
lattifai/transcription/lattifai.py,sha256=Sik4IyvzdqIMCvgkaxCzqvo-j7u0MfX045z8AJunjhg,3556
|
|
60
|
-
lattifai/transcription/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
|
|
61
|
-
lattifai/transcription/prompts/__init__.py,sha256=G9b42COaCYv3sPPNkHsGDLOMBuVGKt4mXGYal_BYtYQ,1351
|
|
62
|
-
lattifai/transcription/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
|
|
63
|
-
lattifai/transcription/prompts/gemini/transcription_gem.txt,sha256=cljzZ--BDgnnKzqVCakr-fTp2Xk38UOsUquvruNX-LU,4600
|
|
64
|
-
lattifai/workflow/__init__.py,sha256=INpQgc9gZ2Fp-aTHcpR3TEHGtEtPzjOB8T7-jLzVM0E,1547
|
|
65
|
-
lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
|
|
66
|
-
lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
|
|
67
|
-
lattifai/workflow/file_manager.py,sha256=yc29Vb7JNUMJ9rwM_YjkAHfDInl8HMVAl9A7z7XiIOU,32974
|
|
68
|
-
lattifai/youtube/__init__.py,sha256=_uO3KCx-t6I-JaYFpcYLYpvkbmEOOni3xBqGEbExg68,1587
|
|
69
|
-
lattifai/youtube/client.py,sha256=aEOnd8jp4w1ZZkTfxppl7yz2TVdxMTkb8lGCqQxLqxE,47128
|
|
70
|
-
lattifai/youtube/types.py,sha256=80RgBmvM4tRbxqyNv9GU6hr9vPp_yhKrK0RJ_vG2h4E,472
|
|
71
|
-
lattifai-1.2.2.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
|
|
72
|
-
lattifai-1.2.2.dist-info/METADATA,sha256=NncEA5sSiDyj2DfZCt251tLSranIOn2Gd4KD2D0Q118,19757
|
|
73
|
-
lattifai-1.2.2.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
|
74
|
-
lattifai-1.2.2.dist-info/entry_points.txt,sha256=MfoqXNjXrhD7VMApHgaHmAECTcGVUMUiR0uqnTg7Ads,502
|
|
75
|
-
lattifai-1.2.2.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
|
|
76
|
-
lattifai-1.2.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|