lattifai 1.2.1-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registry.
Files changed (59)
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
@@ -1,462 +0,0 @@
- """Reader for YouTube transcript files with speaker labels and timestamps."""
-
- import re
- from dataclasses import dataclass
- from pathlib import Path
- from typing import List, Optional
-
- from lhotse.utils import Pathlike
-
- from .supervision import Supervision
-
-
- @dataclass
- class GeminiSegment:
-     """Represents a segment in the Gemini transcript with metadata."""
-
-     text: str
-     timestamp: Optional[float] = None  # For backward compatibility (start time)
-     end_timestamp: Optional[float] = None  # End time when timestamp is at the end
-     speaker: Optional[str] = None
-     section: Optional[str] = None
-     segment_type: str = "dialogue"  # 'dialogue', 'event', or 'section_header'
-     line_number: int = 0
-
-     @property
-     def start(self) -> float:
-         """Return start time in seconds."""
-         return self.timestamp if self.timestamp is not None else 0.0
-
-     @property
-     def end(self) -> Optional[float]:
-         """Return end time in seconds if available."""
-         return self.end_timestamp
-
-
- class GeminiReader:
-     """Parser for YouTube transcript format with speaker labels and timestamps."""
-
-     # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
-     TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
-     SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
-     SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
-     # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
-     EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
-     # Timestamp at the end indicates end time
-     INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
-     # Timestamp at the beginning indicates start time
-     INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
-
-     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
-     YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
-     YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")
-
-     @classmethod
-     def parse_timestamp(cls, *args) -> float:
-         """Convert timestamp to seconds.
-
-         Supports both HH:MM:SS and MM:SS formats.
-         Args can be (hours, minutes, seconds) or (minutes, seconds).
-         Can also accept a single argument which is seconds.
-         """
-         if len(args) == 3:
-             # HH:MM:SS format
-             hours, minutes, seconds = args
-             return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
-         elif len(args) == 2:
-             # MM:SS format
-             minutes, seconds = args
-             return int(minutes) * 60 + int(seconds)
-         elif len(args) == 1:
-             # Direct seconds (from YouTube &t= parameter)
-             return int(args[0])
-         else:
-             raise ValueError(f"Invalid timestamp args: {args}")
-
-     @classmethod
-     def read(
-         cls,
-         transcript_path: Pathlike,
-         include_events: bool = False,
-         include_sections: bool = False,
-     ) -> List[GeminiSegment]:
-         """Parse YouTube transcript file and return list of transcript segments.
-
-         Args:
-             transcript_path: Path to the transcript file
-             include_events: Whether to include event descriptions like [Applause]
-             include_sections: Whether to include section headers
-
-         Returns:
-             List of GeminiSegment objects with all metadata
-         """
-         transcript_path = Path(transcript_path).expanduser().resolve()
-         if not transcript_path.exists():
-             raise FileNotFoundError(f"Transcript file not found: {transcript_path}")
-
-         segments: List[GeminiSegment] = []
-         current_section = None
-         current_speaker = None
-
-         with open(transcript_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         for line_num, line in enumerate(lines, start=1):
-             line = line.strip()
-             if not line:
-                 continue
-
-             # Skip table of contents
-             if line.startswith("* ["):
-                 continue
-             if line.startswith("## Table of Contents"):
-                 continue
-
-             # Parse section headers
-             section_match = cls.SECTION_HEADER_PATTERN.match(line)
-             if section_match:
-                 hours, minutes, seconds, section_title = section_match.groups()
-                 timestamp = cls.parse_timestamp(hours, minutes, seconds)
-                 current_section = section_title.strip()
-                 if include_sections:
-                     segments.append(
-                         GeminiSegment(
-                             text=section_title.strip(),
-                             timestamp=timestamp,
-                             section=current_section,
-                             segment_type="section_header",
-                             line_number=line_num,
-                         )
-                     )
-                 continue
-
-             # Parse YouTube format section headers: ## [[MM:SS](URL&t=seconds)] Title
-             youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
-             if youtube_section_match:
-                 minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
-                 # Use the URL seconds for more accuracy
-                 timestamp = cls.parse_timestamp(url_seconds)
-                 current_section = section_title.strip()
-                 if include_sections:
-                     segments.append(
-                         GeminiSegment(
-                             text=section_title.strip(),
-                             timestamp=timestamp,
-                             section=current_section,
-                             segment_type="section_header",
-                             line_number=line_num,
-                         )
-                     )
-                 continue
-
-             # Parse event descriptions [event] [HH:MM:SS] or [MM:SS]
-             event_match = cls.EVENT_PATTERN.match(line)
-             if event_match:
-                 groups = event_match.groups()
-                 event_text = groups[0]
-                 # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
-                 hours_or_minutes = groups[1]
-                 minutes_or_seconds = groups[2]
-                 seconds_optional = groups[3]
-
-                 if seconds_optional is not None:
-                     # HH:MM:SS format
-                     timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
-                 else:
-                     # MM:SS format
-                     timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
-
-                 if include_events and timestamp is not None:
-                     segments.append(
-                         GeminiSegment(
-                             text=f"[{event_text.strip()}]",
-                             timestamp=timestamp,
-                             section=current_section,
-                             segment_type="event",
-                             line_number=line_num,
-                         )
-                     )
-                 continue
-
-             # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS] or [MM:SS]
-             speaker_match = cls.SPEAKER_PATTERN.match(line)
-             if speaker_match:
-                 speaker, text_with_timestamp = speaker_match.groups()
-                 current_speaker = speaker.strip()
-
-                 # Check for timestamp at the beginning (start time)
-                 start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
-                 # Check for timestamp at the end (end time)
-                 end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
-                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
-
-                 start_timestamp = None
-                 end_timestamp = None
-                 text = text_with_timestamp.strip()
-
-                 if start_match:
-                     groups = start_match.groups()
-                     # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
-                     if groups[0] is not None:  # HH:MM:SS format
-                         start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
-                     elif groups[3] is not None:  # MM:SS format
-                         start_timestamp = cls.parse_timestamp(groups[3], groups[4])
-                     text = groups[5]  # Text is after timestamp
-                 elif end_match:
-                     groups = end_match.groups()
-                     text = groups[0]  # Text is before timestamp
-                     # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                     if groups[1] is not None:  # HH:MM:SS format
-                         end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                     elif groups[4] is not None:  # MM:SS format
-                         end_timestamp = cls.parse_timestamp(groups[4], groups[5])
-                 elif youtube_match:
-                     groups = youtube_match.groups()
-                     text = groups[0]
-                     # Extract seconds from URL parameter (treat as end time)
-                     url_seconds = groups[3]
-                     end_timestamp = cls.parse_timestamp(url_seconds)
-
-                 segments.append(
-                     GeminiSegment(
-                         text=text.strip(),
-                         timestamp=start_timestamp,
-                         end_timestamp=end_timestamp,
-                         speaker=current_speaker,
-                         section=current_section,
-                         segment_type="dialogue",
-                         line_number=line_num,
-                     )
-                 )
-                 current_speaker = None  # Reset speaker after use
-                 continue
-
-             # Parse plain text with timestamp (check both positions)
-             start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
-             end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
-             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
-
-             start_timestamp = None
-             end_timestamp = None
-             text = None
-
-             if start_match:
-                 groups = start_match.groups()
-                 # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
-                 if groups[0] is not None:  # HH:MM:SS format
-                     start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
-                 elif groups[3] is not None:  # MM:SS format
-                     start_timestamp = cls.parse_timestamp(groups[3], groups[4])
-                 text = groups[5]  # Text is after timestamp
-
-                 segments.append(
-                     GeminiSegment(
-                         text=text.strip(),
-                         timestamp=start_timestamp,
-                         end_timestamp=None,
-                         speaker=current_speaker,
-                         section=current_section,
-                         segment_type="dialogue",
-                         line_number=line_num,
-                     )
-                 )
-                 continue
-             elif end_match:
-                 groups = end_match.groups()
-                 text = groups[0]  # Text is before timestamp
-                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                 if groups[1] is not None:  # HH:MM:SS format
-                     end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                 elif groups[4] is not None:  # MM:SS format
-                     end_timestamp = cls.parse_timestamp(groups[4], groups[5])
-
-                 segments.append(
-                     GeminiSegment(
-                         text=text.strip(),
-                         timestamp=None,
-                         end_timestamp=end_timestamp,
-                         speaker=current_speaker,
-                         section=current_section,
-                         segment_type="dialogue",
-                         line_number=line_num,
-                     )
-                 )
-                 continue
-             elif youtube_inline_match:
-                 groups = youtube_inline_match.groups()
-                 text = groups[0]
-                 # Extract seconds from URL parameter (treat as end time)
-                 url_seconds = groups[3]
-                 end_timestamp = cls.parse_timestamp(url_seconds)
-
-                 segments.append(
-                     GeminiSegment(
-                         text=text.strip(),
-                         timestamp=None,
-                         end_timestamp=end_timestamp,
-                         speaker=current_speaker,
-                         section=current_section,
-                         segment_type="dialogue",
-                         line_number=line_num,
-                     )
-                 )
-                 continue
-
-             # Skip markdown headers and other formatting
-             if line.startswith("#"):
-                 continue
-
-         return segments
-
-     @classmethod
-     def extract_for_alignment(
-         cls,
-         transcript_path: Pathlike,
-         merge_consecutive: bool = False,
-         min_duration: float = 0.1,
-         merge_max_gap: float = 2.0,
-     ) -> List[Supervision]:
-         """Extract text segments for forced alignment.
-
-         This extracts only dialogue segments (not events or section headers)
-         and converts them to Supervision objects suitable for alignment.
-
-         Args:
-             transcript_path: Path to the transcript file
-             merge_consecutive: Whether to merge consecutive segments from same speaker
-             min_duration: Minimum duration for a segment
-             merge_max_gap: Maximum time gap (seconds) to merge consecutive segments
-
-         Returns:
-             List of Supervision objects ready for alignment
-         """
-         segments = cls.read(transcript_path, include_events=True, include_sections=False)
-
-         # Filter to dialogue and event segments with timestamps (either start or end)
-         dialogue_segments = [
-             s
-             for s in segments
-             if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
-         ]
-
-         if not dialogue_segments:
-             raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
-
-         # Sort by timestamp (use start time if available, otherwise end time)
-         dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)
-
-         # Convert to Supervision objects
-         supervisions: List[Supervision] = []
-         prev_end_time = 0.0
-
-         for i, segment in enumerate(dialogue_segments):
-             seg_start = None
-             seg_end = None
-
-             # Determine start and end times based on available timestamps
-             if segment.timestamp is not None:
-                 # Has start time
-                 seg_start = segment.timestamp
-                 if segment.end_timestamp is not None:
-                     # Has both start and end
-                     seg_end = segment.end_timestamp
-                 else:
-                     # Only has start, estimate end
-                     if i < len(dialogue_segments) - 1:
-                         # Use next segment's time
-                         next_seg = dialogue_segments[i + 1]
-                         if next_seg.timestamp is not None:
-                             seg_end = next_seg.timestamp
-                         elif next_seg.end_timestamp is not None:
-                             # Next has only end, estimate its start and use that
-                             words_next = len(next_seg.text.split())
-                             estimated_duration_next = words_next * 0.3
-                             seg_end = next_seg.end_timestamp - estimated_duration_next
-
-                     if seg_end is None:
-                         # Estimate based on text length
-                         words = len(segment.text.split())
-                         seg_end = seg_start + words * 0.3
-
-             elif segment.end_timestamp is not None:
-                 # Only has end time, need to infer start
-                 seg_end = segment.end_timestamp
-                 # Use previous segment's end time as start, or estimate based on text
-                 if prev_end_time > 0:
-                     seg_start = prev_end_time
-                 else:
-                     # Estimate start based on text length
-                     words = len(segment.text.split())
-                     estimated_duration = words * 0.3
-                     seg_start = seg_end - estimated_duration
-
-             if seg_start is not None and seg_end is not None:
-                 duration = max(seg_end - seg_start, min_duration)
-                 if segment.segment_type == "dialogue":
-                     supervisions.append(
-                         Supervision(
-                             text=segment.text,
-                             start=seg_start,
-                             duration=duration,
-                             id=f"segment_{i:05d}",
-                             speaker=segment.speaker,
-                         )
-                     )
-                 prev_end_time = seg_start + duration
-
-         # Optionally merge consecutive segments from same speaker
-         if merge_consecutive:
-             merged = []
-             current_speaker = None
-             current_texts = []
-             current_start = None
-             last_end_time = None
-
-             for i, (segment, sup) in enumerate(zip(dialogue_segments, supervisions)):
-                 # Check if we should merge with previous segment
-                 should_merge = False
-                 if segment.speaker == current_speaker and current_start is not None:
-                     # Same speaker - check time gap
-                     time_gap = sup.start - last_end_time if last_end_time else 0
-                     if time_gap <= merge_max_gap:
-                         should_merge = True
-
-                 if should_merge:
-                     # Same speaker within time threshold, accumulate
-                     current_texts.append(segment.text)
-                     last_end_time = sup.start + sup.duration
-                 else:
-                     # Different speaker or gap too large, save previous segment
-                     if current_texts:
-                         merged_text = " ".join(current_texts)
-                         merged.append(
-                             Supervision(
-                                 text=merged_text,
-                                 start=current_start,
-                                 duration=last_end_time - current_start,
-                                 id=f"merged_{len(merged):05d}",
-                             )
-                         )
-                     current_speaker = segment.speaker
-                     current_texts = [segment.text]
-                     current_start = sup.start
-                     last_end_time = sup.start + sup.duration
-
-             # Add final segment
-             if current_texts:
-                 merged_text = " ".join(current_texts)
-                 merged.append(
-                     Supervision(
-                         text=merged_text,
-                         start=current_start,
-                         duration=last_end_time - current_start,
-                         id=f"merged_{len(merged):05d}",
-                     )
-                 )
-
-             supervisions = merged
-
-         return supervisions
-
-
- __all__ = ["GeminiReader", "GeminiSegment"]
@@ -1,173 +0,0 @@
- """Writer for YouTube transcript files with corrected timestamps from alignment."""
-
- import re
- from pathlib import Path
- from typing import Dict, List, Optional
-
- from lhotse.utils import Pathlike
-
- from .gemini_reader import GeminiReader, GeminiSegment
- from .supervision import Supervision
-
-
- class GeminiWriter:
-     """Writer for updating YouTube transcript timestamps based on alignment results."""
-
-     @staticmethod
-     def format_timestamp(seconds: float) -> str:
-         """Convert seconds to [HH:MM:SS] format."""
-         hours = int(seconds // 3600)
-         minutes = int((seconds % 3600) // 60)
-         secs = int(seconds % 60)
-         return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"
-
-     @classmethod
-     def update_timestamps(
-         cls,
-         original_transcript: Pathlike,
-         aligned_supervisions: List[Supervision],
-         output_path: Pathlike,
-         timestamp_mapping: Optional[Dict[int, float]] = None,
-     ) -> Pathlike:
-         """Update transcript file with corrected timestamps from alignment.
-
-         Args:
-             original_transcript: Path to the original transcript file
-             aligned_supervisions: List of aligned Supervision objects with corrected timestamps
-             output_path: Path to write the updated transcript
-             timestamp_mapping: Optional manual mapping from line_number to new timestamp
-
-         Returns:
-             Path to the output file
-         """
-         original_path = Path(original_transcript)
-         output_path = Path(output_path)
-
-         # Read original file
-         with open(original_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         # Parse original segments to get line numbers
-         original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
-
-         # Create mapping from line number to new timestamp
-         if timestamp_mapping is None:
-             timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)
-
-         # Update timestamps in lines
-         updated_lines = []
-         for line_num, line in enumerate(lines, start=1):
-             if line_num in timestamp_mapping:
-                 new_timestamp = timestamp_mapping[line_num]
-                 updated_line = cls._replace_timestamp(line, new_timestamp)
-                 updated_lines.append(updated_line)
-             else:
-                 updated_lines.append(line)
-
-         # Write updated content
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-         with open(output_path, "w", encoding="utf-8") as f:
-             f.writelines(updated_lines)
-
-         return output_path
-
-     @classmethod
-     def _create_timestamp_mapping(
-         cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
-     ) -> Dict[int, float]:
-         """Create mapping from line numbers to new timestamps based on alignment.
-
-         This performs text matching between original segments and aligned supervisions
-         to determine which timestamps should be updated.
-         """
-         mapping = {}
-
-         # Create a simple text-based matching
-         dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]
-
-         # Try to match based on text content
-         for aligned_sup in aligned_supervisions:
-             aligned_text = aligned_sup.text.strip()
-
-             # Find best matching original segment
-             best_match = None
-             best_score = 0
-
-             for orig_seg in dialogue_segments:
-                 orig_text = orig_seg.text.strip()
-
-                 # Simple text similarity (could be improved with fuzzy matching)
-                 if aligned_text == orig_text:
-                     best_match = orig_seg
-                     best_score = 1.0
-                     break
-                 elif aligned_text in orig_text or orig_text in aligned_text:
-                     score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
-                     if score > best_score:
-                         best_score = score
-                         best_match = orig_seg
-
-             # If we found a good match, update the mapping
-             if best_match and best_score > 0.8:
-                 mapping[best_match.line_number] = aligned_sup.start
-
-         return mapping
-
-     @classmethod
-     def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
-         """Replace timestamp in a line with new timestamp."""
-         new_ts_str = cls.format_timestamp(new_timestamp)
-
-         # Replace timestamp patterns
-         # Pattern 1: [HH:MM:SS] at the end or in brackets
-         line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)
-
-         return line
-
-     @classmethod
-     def write_aligned_transcript(
-         cls,
-         aligned_supervisions: List[Supervision],
-         output_path: Pathlike,
-         include_word_timestamps: bool = False,
-     ) -> Pathlike:
-         """Write a new transcript file from aligned supervisions.
-
-         This creates a simplified transcript format with accurate timestamps.
-
-         Args:
-             aligned_supervisions: List of aligned Supervision objects
-             output_path: Path to write the transcript
-             include_word_timestamps: Whether to include word-level timestamps if available
-
-         Returns:
-             Path to the output file
-         """
-         output_path = Path(output_path)
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-
-         with open(output_path, "w", encoding="utf-8") as f:
-             f.write("# Aligned Transcript\n\n")
-
-             for i, sup in enumerate(aligned_supervisions):
-                 # Write segment with timestamp
-                 start_ts = cls.format_timestamp(sup.start)
-                 f.write(f"{start_ts} {sup.text}\n")
-
-                 # Optionally write word-level timestamps
-                 if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
-                     if "word" in sup.alignment:
-                         f.write(" Words: ")
-                         word_parts = []
-                         for word_info in sup.alignment["word"]:
-                             word_ts = cls.format_timestamp(word_info["start"])
-                             word_parts.append(f'{word_info["symbol"]}{word_ts}')
-                         f.write(" ".join(word_parts))
-                         f.write("\n")
-
-                 f.write("\n")
-
-         return output_path
-
-
- __all__ = ["GeminiWriter"]
@@ -1,34 +0,0 @@
- from dataclasses import dataclass
- from typing import Optional
-
- from lhotse.supervision import SupervisionSegment
- from lhotse.utils import Seconds
-
-
- @dataclass
- class Supervision(SupervisionSegment):
-     """
-     Extended SupervisionSegment with simplified initialization.
-
-     Note: The `alignment` field is inherited from SupervisionSegment:
-         alignment: Optional[Dict[str, List[AlignmentItem]]] = None
-
-     Structure of alignment when return_details=True:
-         {
-             'word': [
-                 AlignmentItem(symbol='hello', start=0.0, duration=0.5, score=0.95),
-                 AlignmentItem(symbol='world', start=0.6, duration=0.4, score=0.92),
-                 ...
-             ]
-         }
-     """
-
-     text: Optional[str] = None
-     speaker: Optional[str] = None
-     id: str = ""
-     recording_id: str = ""
-     start: Seconds = 0.0
-     duration: Seconds = 0.0
-
-
- __all__ = ["Supervision"]