lattifai 0.2.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lattifai/io/gemini_reader.py ADDED
@@ -0,0 +1,371 @@
+ """Reader for YouTube transcript files with speaker labels and timestamps."""
+
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import List, Optional, Tuple
+
+ from lhotse.utils import Pathlike
+
+ from .supervision import Supervision
+
+
+ @dataclass
+ class GeminiSegment:
+     """Represents a segment in the Gemini transcript with metadata."""
+
+     text: str
+     timestamp: Optional[float] = None
+     speaker: Optional[str] = None
+     section: Optional[str] = None
+     segment_type: str = 'dialogue' # 'dialogue', 'event', or 'section_header'
+     line_number: int = 0
+
+     @property
+     def start(self) -> float:
+         """Return start time in seconds."""
+         return self.timestamp if self.timestamp is not None else 0.0
+
+
+ class GeminiReader:
+     """Parser for YouTube transcript format with speaker labels and timestamps."""
+
+     # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
+     TIMESTAMP_PATTERN = re.compile(r'\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]')
+     SECTION_HEADER_PATTERN = re.compile(r'^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$')
+     SPEAKER_PATTERN = re.compile(r'^\*\*(.+?[::])\*\*\s*(.+)$')
+     EVENT_PATTERN = re.compile(r'^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$')
+     INLINE_TIMESTAMP_PATTERN = re.compile(r'^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$')
+
+     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
+     YOUTUBE_SECTION_PATTERN = re.compile(r'^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$')
+     YOUTUBE_INLINE_PATTERN = re.compile(r'^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$')
+
+     @classmethod
+     def parse_timestamp(cls, *args) -> float:
+         """Convert timestamp to seconds.
+
+         Supports both HH:MM:SS and MM:SS formats.
+         Args can be (hours, minutes, seconds) or (minutes, seconds).
+         Can also accept a single argument which is seconds.
+         """
+         if len(args) == 3:
+             # HH:MM:SS format
+             hours, minutes, seconds = args
+             return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
+         elif len(args) == 2:
+             # MM:SS format
+             minutes, seconds = args
+             return int(minutes) * 60 + int(seconds)
+         elif len(args) == 1:
+             # Direct seconds (from YouTube &t= parameter)
+             return int(args[0])
+         else:
+             raise ValueError(f'Invalid timestamp args: {args}')
+
+     @classmethod
+     def read(
+         cls,
+         transcript_path: Pathlike,
+         include_events: bool = False,
+         include_sections: bool = False,
+     ) -> List[GeminiSegment]:
+         """Parse YouTube transcript file and return list of transcript segments.
+
+         Args:
+             transcript_path: Path to the transcript file
+             include_events: Whether to include event descriptions like [Applause]
+             include_sections: Whether to include section headers
+
+         Returns:
+             List of GeminiSegment objects with all metadata
+         """
+         transcript_path = Path(transcript_path).expanduser().resolve()
+         if not transcript_path.exists():
+             raise FileNotFoundError(f'Transcript file not found: {transcript_path}')
+
+         segments: List[GeminiSegment] = []
+         current_section = None
+         current_speaker = None
+
+         with open(transcript_path, 'r', encoding='utf-8') as f:
+             lines = f.readlines()
+
+         for line_num, line in enumerate(lines, start=1):
+             line = line.strip()
+             if not line:
+                 continue
+
+             # Skip table of contents
+             if line.startswith('* ['):
+                 continue
+             if line.startswith('## Table of Contents'):
+                 continue
+
+             # Parse section headers
+             section_match = cls.SECTION_HEADER_PATTERN.match(line)
+             if section_match:
+                 hours, minutes, seconds, section_title = section_match.groups()
+                 timestamp = cls.parse_timestamp(hours, minutes, seconds)
+                 current_section = section_title.strip()
+                 if include_sections:
+                     segments.append(
+                         GeminiSegment(
+                             text=section_title.strip(),
+                             timestamp=timestamp,
+                             section=current_section,
+                             segment_type='section_header',
+                             line_number=line_num,
+                         )
+                     )
+                 continue
+
+             # Parse YouTube format section headers: ## [[MM:SS](URL&t=seconds)] Title
+             youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
+             if youtube_section_match:
+                 minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
+                 # Use the URL seconds for more accuracy
+                 timestamp = cls.parse_timestamp(url_seconds)
+                 current_section = section_title.strip()
+                 if include_sections:
+                     segments.append(
+                         GeminiSegment(
+                             text=section_title.strip(),
+                             timestamp=timestamp,
+                             section=current_section,
+                             segment_type='section_header',
+                             line_number=line_num,
+                         )
+                     )
+                 continue
+
+             # Parse event descriptions [event] [HH:MM:SS] or [MM:SS]
+             event_match = cls.EVENT_PATTERN.match(line)
+             if event_match:
+                 groups = event_match.groups()
+                 event_text = groups[0]
+                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
+                 if groups[1] is not None: # HH:MM:SS format
+                     timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
+                 elif groups[4] is not None: # MM:SS format
+                     timestamp = cls.parse_timestamp(groups[4], groups[5])
+                 else:
+                     timestamp = None
+
+                 if include_events and timestamp is not None:
+                     segments.append(
+                         GeminiSegment(
+                             text=event_text.strip(),
+                             timestamp=timestamp,
+                             section=current_section,
+                             segment_type='event',
+                             line_number=line_num,
+                         )
+                     )
+                 continue
+
+             # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS] or [MM:SS]
+             speaker_match = cls.SPEAKER_PATTERN.match(line)
+             if speaker_match:
+                 speaker, text_with_timestamp = speaker_match.groups()
+                 current_speaker = speaker.strip()
+
+                 # Extract timestamp from the end of the text
+                 timestamp_match = cls.INLINE_TIMESTAMP_PATTERN.match(text_with_timestamp.strip())
+                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
+
+                 if timestamp_match:
+                     groups = timestamp_match.groups()
+                     text = groups[0]
+                     # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
+                     if groups[1] is not None: # HH:MM:SS format
+                         timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
+                     elif groups[4] is not None: # MM:SS format
+                         timestamp = cls.parse_timestamp(groups[4], groups[5])
+                     else:
+                         timestamp = None
+                 elif youtube_match:
+                     groups = youtube_match.groups()
+                     text = groups[0]
+                     # Extract seconds from URL parameter
+                     url_seconds = groups[3]
+                     timestamp = cls.parse_timestamp(url_seconds)
+                 else:
+                     text = text_with_timestamp.strip()
+                     timestamp = None
+
+                 segments.append(
+                     GeminiSegment(
+                         text=text.strip(),
+                         timestamp=timestamp,
+                         speaker=current_speaker,
+                         section=current_section,
+                         segment_type='dialogue',
+                         line_number=line_num,
+                     )
+                 )
+                 current_speaker = None # Reset speaker after use
+                 continue
+
+             # Parse plain text with timestamp at the end
+             inline_match = cls.INLINE_TIMESTAMP_PATTERN.match(line)
+             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
+
+             if inline_match:
+                 groups = inline_match.groups()
+                 text = groups[0]
+                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
+                 if groups[1] is not None: # HH:MM:SS format
+                     timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
+                 elif groups[4] is not None: # MM:SS format
+                     timestamp = cls.parse_timestamp(groups[4], groups[5])
+                 else:
+                     timestamp = None
+
+                 segments.append(
+                     GeminiSegment(
+                         text=text.strip(),
+                         timestamp=timestamp,
+                         speaker=current_speaker,
+                         section=current_section,
+                         segment_type='dialogue',
+                         line_number=line_num,
+                     )
+                 )
+                 continue
+             elif youtube_inline_match:
+                 groups = youtube_inline_match.groups()
+                 text = groups[0]
+                 # Extract seconds from URL parameter
+                 url_seconds = groups[3]
+                 timestamp = cls.parse_timestamp(url_seconds)
+
+                 segments.append(
+                     GeminiSegment(
+                         text=text.strip(),
+                         timestamp=timestamp,
+                         speaker=current_speaker,
+                         section=current_section,
+                         segment_type='dialogue',
+                         line_number=line_num,
+                     )
+                 )
+                 continue
+
+             # Skip markdown headers and other formatting
+             if line.startswith('#'):
+                 continue
+
+         return segments
+
+     @classmethod
+     def extract_for_alignment(
+         cls,
+         transcript_path: Pathlike,
+         merge_consecutive: bool = False,
+         min_duration: float = 0.1,
+         merge_max_gap: float = 2.0,
+     ) -> List[Supervision]:
+         """Extract text segments for forced alignment.
+
+         This extracts only dialogue segments (not events or section headers)
+         and converts them to Supervision objects suitable for alignment.
+
+         Args:
+             transcript_path: Path to the transcript file
+             merge_consecutive: Whether to merge consecutive segments from same speaker
+             min_duration: Minimum duration for a segment
+             merge_max_gap: Maximum time gap (seconds) to merge consecutive segments
+
+         Returns:
+             List of Supervision objects ready for alignment
+         """
+         segments = cls.read(transcript_path, include_events=False, include_sections=False)
+
+         # Filter to only dialogue segments with timestamps
+         dialogue_segments = [s for s in segments if s.segment_type == 'dialogue' and s.timestamp is not None]
+
+         if not dialogue_segments:
+             raise ValueError(f'No dialogue segments with timestamps found in {transcript_path}')
+
+         # Sort by timestamp
+         dialogue_segments.sort(key=lambda x: x.timestamp)
+
+         # Convert to Supervision objects
+         supervisions: List[Supervision] = []
+
+         for i, segment in enumerate(dialogue_segments):
+             # Estimate duration based on next segment
+             if i < len(dialogue_segments) - 1:
+                 duration = dialogue_segments[i + 1].timestamp - segment.timestamp
+             else:
+                 # Last segment: estimate based on text length (rough heuristic)
+                 words = len(segment.text.split())
+                 duration = words * 0.3 # ~0.3 seconds per word
+
+             supervisions.append(
+                 Supervision(
+                     text=segment.text,
+                     start=segment.timestamp,
+                     duration=max(duration, min_duration),
+                     id=f'segment_{i:05d}',
+                     speaker=segment.speaker,
+                 )
+             )
+
+         # Optionally merge consecutive segments from same speaker
+         if merge_consecutive:
+             merged = []
+             current_speaker = None
+             current_texts = []
+             current_start = None
+             last_end_time = None
+
+             for i, (segment, sup) in enumerate(zip(dialogue_segments, supervisions)):
+                 # Check if we should merge with previous segment
+                 should_merge = False
+                 if segment.speaker == current_speaker and current_start is not None:
+                     # Same speaker - check time gap
+                     time_gap = sup.start - last_end_time if last_end_time else 0
+                     if time_gap <= merge_max_gap:
+                         should_merge = True
+
+                 if should_merge:
+                     # Same speaker within time threshold, accumulate
+                     current_texts.append(segment.text)
+                     last_end_time = sup.start + sup.duration
+                 else:
+                     # Different speaker or gap too large, save previous segment
+                     if current_texts:
+                         merged_text = ' '.join(current_texts)
+                         merged.append(
+                             Supervision(
+                                 text=merged_text,
+                                 start=current_start,
+                                 duration=last_end_time - current_start,
+                                 id=f'merged_{len(merged):05d}',
+                             )
+                         )
+                     current_speaker = segment.speaker
+                     current_texts = [segment.text]
+                     current_start = sup.start
+                     last_end_time = sup.start + sup.duration
+
+             # Add final segment
+             if current_texts:
+                 merged_text = ' '.join(current_texts)
+                 merged.append(
+                     Supervision(
+                         text=merged_text,
+                         start=current_start,
+                         duration=last_end_time - current_start,
+                         id=f'merged_{len(merged):05d}',
+                     )
+                 )
+
+             supervisions = merged
+
+         return supervisions
+
+
+ __all__ = ['GeminiReader', 'GeminiSegment']
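
The new reader exposes two entry points: `GeminiReader.read()` returns every parsed `GeminiSegment` (optionally including events and section headers), while `GeminiReader.extract_for_alignment()` keeps only timestamped dialogue and converts it into `Supervision` objects. A minimal usage sketch, assuming the module is importable as `lattifai.io.gemini_reader` (the path used by `lattifai/io/reader.py` below); the transcript file name is a placeholder:

```python
from lattifai.io.gemini_reader import GeminiReader

# Parse all segment types for inspection (events and section headers included).
segments = GeminiReader.read(
    'talk.Gemini.md',        # hypothetical transcript path
    include_events=True,
    include_sections=True,
)
for seg in segments[:3]:
    print(seg.segment_type, seg.start, seg.speaker, seg.text)

# Produce alignment-ready supervisions (dialogue only), merging consecutive
# utterances from the same speaker when the gap is at most 2 seconds.
supervisions = GeminiReader.extract_for_alignment(
    'talk.Gemini.md',
    merge_consecutive=True,
    merge_max_gap=2.0,
)
```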
@@ -0,0 +1,173 @@
+ """Writer for YouTube transcript files with corrected timestamps from alignment."""
+
+ import re
+ from pathlib import Path
+ from typing import Dict, List, Optional
+
+ from lhotse.utils import Pathlike
+
+ from .gemini_reader import GeminiReader, GeminiSegment
+ from .supervision import Supervision
+
+
+ class GeminiWriter:
+     """Writer for updating YouTube transcript timestamps based on alignment results."""
+
+     @staticmethod
+     def format_timestamp(seconds: float) -> str:
+         """Convert seconds to [HH:MM:SS] format."""
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         secs = int(seconds % 60)
+         return f'[{hours:02d}:{minutes:02d}:{secs:02d}]'
+
+     @classmethod
+     def update_timestamps(
+         cls,
+         original_transcript: Pathlike,
+         aligned_supervisions: List[Supervision],
+         output_path: Pathlike,
+         timestamp_mapping: Optional[Dict[int, float]] = None,
+     ) -> Pathlike:
+         """Update transcript file with corrected timestamps from alignment.
+
+         Args:
+             original_transcript: Path to the original transcript file
+             aligned_supervisions: List of aligned Supervision objects with corrected timestamps
+             output_path: Path to write the updated transcript
+             timestamp_mapping: Optional manual mapping from line_number to new timestamp
+
+         Returns:
+             Path to the output file
+         """
+         original_path = Path(original_transcript)
+         output_path = Path(output_path)
+
+         # Read original file
+         with open(original_path, 'r', encoding='utf-8') as f:
+             lines = f.readlines()
+
+         # Parse original segments to get line numbers
+         original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
+
+         # Create mapping from line number to new timestamp
+         if timestamp_mapping is None:
+             timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)
+
+         # Update timestamps in lines
+         updated_lines = []
+         for line_num, line in enumerate(lines, start=1):
+             if line_num in timestamp_mapping:
+                 new_timestamp = timestamp_mapping[line_num]
+                 updated_line = cls._replace_timestamp(line, new_timestamp)
+                 updated_lines.append(updated_line)
+             else:
+                 updated_lines.append(line)
+
+         # Write updated content
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         with open(output_path, 'w', encoding='utf-8') as f:
+             f.writelines(updated_lines)
+
+         return output_path
+
+     @classmethod
+     def _create_timestamp_mapping(
+         cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
+     ) -> Dict[int, float]:
+         """Create mapping from line numbers to new timestamps based on alignment.
+
+         This performs text matching between original segments and aligned supervisions
+         to determine which timestamps should be updated.
+         """
+         mapping = {}
+
+         # Create a simple text-based matching
+         dialogue_segments = [s for s in original_segments if s.segment_type == 'dialogue']
+
+         # Try to match based on text content
+         for aligned_sup in aligned_supervisions:
+             aligned_text = aligned_sup.text.strip()
+
+             # Find best matching original segment
+             best_match = None
+             best_score = 0
+
+             for orig_seg in dialogue_segments:
+                 orig_text = orig_seg.text.strip()
+
+                 # Simple text similarity (could be improved with fuzzy matching)
+                 if aligned_text == orig_text:
+                     best_match = orig_seg
+                     best_score = 1.0
+                     break
+                 elif aligned_text in orig_text or orig_text in aligned_text:
+                     score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
+                     if score > best_score:
+                         best_score = score
+                         best_match = orig_seg
+
+             # If we found a good match, update the mapping
+             if best_match and best_score > 0.8:
+                 mapping[best_match.line_number] = aligned_sup.start
+
+         return mapping
+
+     @classmethod
+     def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
+         """Replace timestamp in a line with new timestamp."""
+         new_ts_str = cls.format_timestamp(new_timestamp)
+
+         # Replace timestamp patterns
+         # Pattern 1: [HH:MM:SS] at the end or in brackets
+         line = re.sub(r'\[\d{2}:\d{2}:\d{2}\]', new_ts_str, line)
+
+         return line
+
+     @classmethod
+     def write_aligned_transcript(
+         cls,
+         aligned_supervisions: List[Supervision],
+         output_path: Pathlike,
+         include_word_timestamps: bool = False,
+     ) -> Pathlike:
+         """Write a new transcript file from aligned supervisions.
+
+         This creates a simplified transcript format with accurate timestamps.
+
+         Args:
+             aligned_supervisions: List of aligned Supervision objects
+             output_path: Path to write the transcript
+             include_word_timestamps: Whether to include word-level timestamps if available
+
+         Returns:
+             Path to the output file
+         """
+         output_path = Path(output_path)
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(output_path, 'w', encoding='utf-8') as f:
+             f.write('# Aligned Transcript\n\n')
+
+             for i, sup in enumerate(aligned_supervisions):
+                 # Write segment with timestamp
+                 start_ts = cls.format_timestamp(sup.start)
+                 f.write(f'{start_ts} {sup.text}\n')
+
+                 # Optionally write word-level timestamps
+                 if include_word_timestamps and hasattr(sup, 'alignment') and sup.alignment:
+                     if 'word' in sup.alignment:
+                         f.write(' Words: ')
+                         word_parts = []
+                         for word_info in sup.alignment['word']:
+                             word_ts = cls.format_timestamp(word_info['start'])
+                             word_parts.append(f'{word_info["symbol"]}{word_ts}')
+                         f.write(' '.join(word_parts))
+                         f.write('\n')
+
+                 f.write('\n')
+
+         return output_path
+
+
+ __all__ = ['GeminiWriter']
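
`GeminiWriter` is the counterpart: `update_timestamps()` rewrites the `[HH:MM:SS]` stamps of matched dialogue lines in an existing transcript, and `write_aligned_transcript()` emits a fresh, simplified transcript. A sketch of the call shapes; the module path `lattifai.io.gemini_writer` and the file names are assumptions, and the supervisions would normally come from the aligner rather than being built by hand:

```python
from lattifai.io.gemini_writer import GeminiWriter  # module path assumed
from lattifai.io.supervision import Supervision

# Hand-built stand-ins for aligner output, purely to show the call shape.
aligned = [
    Supervision(text='Welcome back to the show.', start=12.48, duration=2.1),
    Supervision(text='Thanks for having me.', start=15.02, duration=1.4),
]

# Rewrite only the timestamps of lines whose text matches an aligned segment.
GeminiWriter.update_timestamps(
    original_transcript='talk.Gemini.md',      # hypothetical input path
    aligned_supervisions=aligned,
    output_path='talk.aligned.Gemini.md',      # hypothetical output path
)

# Or emit a new, simplified transcript from the supervisions alone.
GeminiWriter.write_aligned_transcript(aligned, 'aligned.md', include_word_timestamps=False)
```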
lattifai/io/reader.py CHANGED
@@ -30,16 +30,29 @@ class SubtitleReader(ABCMeta):
        elif format:
            format = format.lower()

-         if format == 'txt' or subtitle[-4:].lower() == '.txt':
+         if format == 'gemini' or str(subtitle).endswith('Gemini.md'):
+             from .gemini_reader import GeminiReader
+
+             supervisions = GeminiReader.extract_for_alignment(subtitle)
+         elif format == 'txt' or (format == 'auto' and str(subtitle)[-4:].lower() == '.txt'):
            if not Path(str(subtitle)).exists(): # str
-                 lines = [line.strip() for line in subtitle.split('\n')]
+                 lines = [line.strip() for line in str(subtitle).split('\n')]
            else: # file
-                 lines = [line.strip() for line in open(subtitle).readlines()]
-             examples = [Supervision(text=line) for line in lines if line]
+                 path_str = str(subtitle)
+                 with open(path_str, encoding='utf-8') as f:
+                     lines = [line.strip() for line in f.readlines()]
+             supervisions = [Supervision(text=line) for line in lines if line]
        else:
-             examples = cls._parse_subtitle(subtitle, format=format)
+             try:
+                 supervisions = cls._parse_subtitle(subtitle, format=format)
+             except Exception as e:
+                 del e
+                 print(f"Failed to parse subtitle with format {format}, trying 'gemini' parser.")
+                 from .gemini_reader import GeminiReader

-         return examples
+                 supervisions = GeminiReader.extract_for_alignment(subtitle)
+
+         return supervisions

    @classmethod
    def _parse_subtitle(cls, subtitle: Pathlike, format: Optional[SubtitleFormat]) -> List[Supervision]:
@@ -62,9 +75,8 @@ class SubtitleReader(ABCMeta):
            supervisions.append(
                Supervision(
                    text=event.text,
-                     # "start": event.start / 1000.0 if event.start is not None else None,
-                     # "duration": (event.end - event.start) / 1000.0 if event.end is not None else None,
-                     # }
+                     start=event.start / 1000.0 if event.start is not None else None,
+                     duration=(event.end - event.start) / 1000.0 if event.end is not None else None,
                )
            )
        return supervisions
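
The dispatch above routes on either an explicit `format` or the file name: `'gemini'` (or a name ending in `Gemini.md`) goes to `GeminiReader`, `'txt'` (or `'auto'` with a `.txt` suffix) is split into plain-text supervisions, and anything else is handed to `_parse_subtitle` with a Gemini retry on failure. A standalone restatement of that rule, using an illustrative helper name that is not part of the package API:

```python
def resolve_subtitle_route(subtitle: str, format: str = 'auto') -> str:
    """Illustrative restatement of the routing rule used in SubtitleReader above."""
    if format == 'gemini' or str(subtitle).endswith('Gemini.md'):
        return 'gemini'
    if format == 'txt' or (format == 'auto' and str(subtitle)[-4:].lower() == '.txt'):
        return 'txt'
    return 'parser'  # cls._parse_subtitle, with a fallback to the Gemini reader on failure
```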
lattifai/io/supervision.py CHANGED
@@ -7,6 +7,22 @@ from lhotse.utils import Seconds

@dataclass
class Supervision(SupervisionSegment):
+     """
+     Extended SupervisionSegment with simplified initialization.
+
+     Note: The `alignment` field is inherited from SupervisionSegment:
+         alignment: Optional[Dict[str, List[AlignmentItem]]] = None
+
+     Structure of alignment when return_details=True:
+         {
+             'word': [
+                 AlignmentItem(symbol='hello', start=0.0, duration=0.5, score=0.95),
+                 AlignmentItem(symbol='world', start=0.6, duration=0.4, score=0.92),
+                 ...
+             ]
+         }
+     """
+
    text: Optional[str] = None
    id: str = ''
    recording_id: str = ''
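
The docstring added above documents the inherited `alignment` field. A small construction sketch consistent with it, assuming `AlignmentItem` from `lhotse.supervision` and keyword construction as used in the reader code above:

```python
from lhotse.supervision import AlignmentItem

from lattifai.io.supervision import Supervision

# Attach word-level timing in the inherited `alignment` field.
sup = Supervision(
    text='hello world',
    start=0.0,
    duration=1.0,
    alignment={
        'word': [
            AlignmentItem(symbol='hello', start=0.0, duration=0.5, score=0.95),
            AlignmentItem(symbol='world', start=0.6, duration=0.4, score=0.92),
        ]
    },
)
print(sup.alignment['word'][0].symbol, sup.alignment['word'][0].start)
```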
lattifai/io/utils.py ADDED
@@ -0,0 +1,15 @@
+ """
+ Utility constants and helper functions for subtitle I/O operations
+ """
+
+ # Supported subtitle formats for reading/writing
+ SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'md']
+
+ # Input subtitle formats (includes special formats like 'auto' and 'gemini')
+ INPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'auto', 'gemini']
+
+ # Output subtitle formats (includes special formats like 'TextGrid' and 'json')
+ OUTPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'TextGrid', 'json']
+
+ # All subtitle formats combined (for file detection)
+ ALL_SUBTITLE_FORMATS = list(set(SUBTITLE_FORMATS + ['TextGrid', 'json', 'gemini']))
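
These constants reduce format validation to a membership check. A minimal sketch with a hypothetical helper name:

```python
from lattifai.io.utils import INPUT_SUBTITLE_FORMATS, OUTPUT_SUBTITLE_FORMATS

def check_formats(input_format: str, output_format: str) -> None:
    """Illustrative validation helper (not part of the package API)."""
    if input_format not in INPUT_SUBTITLE_FORMATS:
        raise ValueError(f'Unsupported input format: {input_format!r}')
    if output_format not in OUTPUT_SUBTITLE_FORMATS:
        raise ValueError(f'Unsupported output format: {output_format!r}')

check_formats('gemini', 'TextGrid')  # passes; check_formats('md', 'srt') would raise
```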