lattifai-1.2.0-py3-none-any.whl → lattifai-1.2.2-py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (64)
  1. lattifai/__init__.py +0 -24
  2. lattifai/alignment/__init__.py +10 -1
  3. lattifai/alignment/lattice1_aligner.py +66 -58
  4. lattifai/alignment/lattice1_worker.py +1 -6
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +1 -1
  7. lattifai/alignment/sentence_splitter.py +350 -0
  8. lattifai/alignment/text_align.py +440 -0
  9. lattifai/alignment/tokenizer.py +91 -220
  10. lattifai/caption/__init__.py +82 -6
  11. lattifai/caption/caption.py +335 -1143
  12. lattifai/caption/formats/__init__.py +199 -0
  13. lattifai/caption/formats/base.py +211 -0
  14. lattifai/caption/formats/gemini.py +722 -0
  15. lattifai/caption/formats/json.py +194 -0
  16. lattifai/caption/formats/lrc.py +309 -0
  17. lattifai/caption/formats/nle/__init__.py +9 -0
  18. lattifai/caption/formats/nle/audition.py +561 -0
  19. lattifai/caption/formats/nle/avid.py +423 -0
  20. lattifai/caption/formats/nle/fcpxml.py +549 -0
  21. lattifai/caption/formats/nle/premiere.py +589 -0
  22. lattifai/caption/formats/pysubs2.py +642 -0
  23. lattifai/caption/formats/sbv.py +147 -0
  24. lattifai/caption/formats/tabular.py +338 -0
  25. lattifai/caption/formats/textgrid.py +193 -0
  26. lattifai/caption/formats/ttml.py +652 -0
  27. lattifai/caption/formats/vtt.py +469 -0
  28. lattifai/caption/parsers/__init__.py +9 -0
  29. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  30. lattifai/caption/standardize.py +636 -0
  31. lattifai/caption/utils.py +474 -0
  32. lattifai/cli/__init__.py +2 -1
  33. lattifai/cli/caption.py +108 -1
  34. lattifai/cli/transcribe.py +4 -9
  35. lattifai/cli/youtube.py +4 -1
  36. lattifai/client.py +48 -84
  37. lattifai/config/__init__.py +11 -1
  38. lattifai/config/alignment.py +9 -2
  39. lattifai/config/caption.py +267 -23
  40. lattifai/config/media.py +20 -0
  41. lattifai/diarization/__init__.py +41 -1
  42. lattifai/mixin.py +36 -18
  43. lattifai/transcription/base.py +6 -1
  44. lattifai/transcription/lattifai.py +19 -54
  45. lattifai/utils.py +81 -13
  46. lattifai/workflow/__init__.py +28 -4
  47. lattifai/workflow/file_manager.py +2 -5
  48. lattifai/youtube/__init__.py +43 -0
  49. lattifai/youtube/client.py +1170 -0
  50. lattifai/youtube/types.py +23 -0
  51. lattifai-1.2.2.dist-info/METADATA +615 -0
  52. lattifai-1.2.2.dist-info/RECORD +76 -0
  53. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  54. lattifai/caption/gemini_reader.py +0 -371
  55. lattifai/caption/gemini_writer.py +0 -173
  56. lattifai/cli/app_installer.py +0 -142
  57. lattifai/cli/server.py +0 -44
  58. lattifai/server/app.py +0 -427
  59. lattifai/workflow/youtube.py +0 -577
  60. lattifai-1.2.0.dist-info/METADATA +0 -1133
  61. lattifai-1.2.0.dist-info/RECORD +0 -57
  62. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  63. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  64. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
lattifai-1.2.2.dist-info/RECORD
@@ -0,0 +1,76 @@
+ lattifai/__init__.py,sha256=RXa1IK8Qt6jsAnLlxecOCZmREqv2naXx6T1Fy0g6pqU,1953
+ lattifai/audio2.py,sha256=P3N8_BwiscbetzDbkbj-n8BcMu2vWD6-MvtQvGwWWf0,17448
+ lattifai/client.py,sha256=pTtpOZRpc3weXkjKZ_-FZLsbbs1CrzVqM4fVqRjiYTc,17179
+ lattifai/errors.py,sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291
+ lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
+ lattifai/mixin.py,sha256=0I-rwvZumaYt8KFTfiVPT2wpXs-JfTEnLOPTdI5r-bM,26115
+ lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659
+ lattifai/utils.py,sha256=5LeunAN0OQ1jWoKMIThpXSEOxFYD2dCRTdsglosodUU,7963
+ lattifai/alignment/__init__.py,sha256=ggOF4MlbnBD7U9yrcyRb1caBR3se_KGA87cfYlyX8RY,450
+ lattifai/alignment/lattice1_aligner.py,sha256=WG3mJM4fGyYkY7FdqhPE10yXwBzhdj2TkS-6LF8F_9k,6463
+ lattifai/alignment/lattice1_worker.py,sha256=hQbZTgncPq3n-b_l-gUPDPfm460EwuZTKveErgWLWNk,10891
+ lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
+ lattifai/alignment/punctuation.py,sha256=qLcvuXhBzoEa6bznWZiAB5TAxR6eLr_ZV-PnnCY90UA,1218
+ lattifai/alignment/segmenter.py,sha256=0s0eABe0rLAo7eNfl0l5e_knxmZba_BjabPdqsRD45E,6284
+ lattifai/alignment/sentence_splitter.py,sha256=2ORvfAgW9yQaqHjts2zlSFjTiNDZF5Fhd5KZX19QWe0,14781
+ lattifai/alignment/text_align.py,sha256=PN7RNL5d6jim96zeCUdfdFEdGw--I8zc0kcgWIFJIXU,14910
+ lattifai/alignment/tokenizer.py,sha256=AQzXbJ_AW8cg4CAd5TVl1Qd3zH56uy9whX9LVFQ4AaA,17835
+ lattifai/caption/__init__.py,sha256=tyIsUvCbImw_qrhp0Nxxrk4vt9szJIPlRcTBviOQkuI,2641
+ lattifai/caption/caption.py,sha256=2PHLRDG0Ks4JMl6jNDeXlrI1kpYinektbZ15GwwTcFI,23479
+ lattifai/caption/standardize.py,sha256=1pAB8BmziTqYkgj7abCXUcNmNwSV1EAR0PrmbpAEipU,21491
+ lattifai/caption/supervision.py,sha256=DRrM8lfKU_x9aVBcLG6xnT0xIJrnc8jzHpzcSwQOg8c,905
+ lattifai/caption/utils.py,sha256=YOdJCXhy-6DdrZUkdrJHuPE9sbEHsE9Z7-Vdo4Z5lLY,14406
+ lattifai/caption/formats/__init__.py,sha256=UGl7Y0ybMf_F4hiNMMwoKOrpWNxs5m2tiD5zkbwjokY,5240
+ lattifai/caption/formats/base.py,sha256=gGeKLKEAB2Hs05R09QMkq5KlXMIQ7bbkUhLct40IcU8,6314
+ lattifai/caption/formats/gemini.py,sha256=zIxK7Vxo2YB1eXFiWnsNrz9WSx69lMN0rL-Sd3r57iI,29389
+ lattifai/caption/formats/json.py,sha256=s3tFWMUzkWx_IL46phPJnFbJW476Yh_GsxcwD7Q_Mfw,6416
+ lattifai/caption/formats/lrc.py,sha256=CWS9wD3Tp6xuvF5gP1nTlVBsBXYnu59_4m4zNRem-c0,11084
+ lattifai/caption/formats/pysubs2.py,sha256=eOTQKRbsFStW9gTHaxuAtD7ha1OnrFdqcNLsjdxpHRY,22591
+ lattifai/caption/formats/sbv.py,sha256=QUgm5lfRSc2IGSX955yQ7rPiSlaYrOHvniUigr2sF7Y,4520
+ lattifai/caption/formats/tabular.py,sha256=HHoiif2yIkMjO9f1bRNAk5Pc0CfkA1mtCFHk5sdLocM,11701
+ lattifai/caption/formats/textgrid.py,sha256=m2jMTwLhQa8gbm0Fs1XyEUdiHJaSfCxB9jrYsdk8j7Q,6659
+ lattifai/caption/formats/ttml.py,sha256=pJ_wd9pX-MwOhDFMeAHnCpbDiLtIhs888rkW26T7w9Y,23236
+ lattifai/caption/formats/vtt.py,sha256=f5OWqsr-2-ddW3CnMtiiqYKQz-hLYRn2B9WM_eT4-AM,17102
+ lattifai/caption/formats/nle/__init__.py,sha256=DPBnWPtxEKCC0J_8DCeTyXULPgkrqFT2jbKvkazAx0s,257
+ lattifai/caption/formats/nle/audition.py,sha256=65ipbUPdwgvNcUA--dQuisWCbmlt6nHPRbSdl4UUF2Q,18076
+ lattifai/caption/formats/nle/avid.py,sha256=UQwFlN4-Myly-kXZxuJTu-7IunEN2_PtAcK9YGQVpMA,14403
+ lattifai/caption/formats/nle/fcpxml.py,sha256=76NL6PeIR3KAG1BZscAZdoFJr5wcNdoS4j3VZsOxFV8,18317
+ lattifai/caption/formats/nle/premiere.py,sha256=Y2nXSWxI0J0YhV3iHJ9jDrFs0S_5sX32_fEi9SJyVt0,21319
+ lattifai/caption/parsers/__init__.py,sha256=z1JMr47FVl7CGbBDg92PKj9RabKktJIUv9iTmmKfEes,227
+ lattifai/caption/parsers/text_parser.py,sha256=rQv-aedTWowBe7crvYEOrHqrgKdpNBPcM8HeU-jElHY,4793
+ lattifai/cli/__init__.py,sha256=PdqoCTqRSFSWrqL3FjBTa5VzJy_e6Rq0OzyT7YkyHpc,541
+ lattifai/cli/alignment.py,sha256=06em-Uaf6NhSz1ce4dwT2r8n56NrtibR7ZsSkmc18Kc,5954
+ lattifai/cli/caption.py,sha256=b2mSVFVgL76b4FB5UoJ7AW5iGzPfKiWiLhbM96z_QoA,10371
+ lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
+ lattifai/cli/transcribe.py,sha256=YhEalG3TQRK7esAN5SOZUQPwIk3TAI9ZknO8cW8C21Q,8038
+ lattifai/cli/youtube.py,sha256=CqAxSC_sErslnrnx2RSwAHc7INKET0wLG9Mf_144O-A,6238
+ lattifai/config/__init__.py,sha256=JOOn2WbvWXBN6a_3fSNt24W7xnJY7wn8RyNLa0XIY3s,724
+ lattifai/config/alignment.py,sha256=ObWf896GGLfP4jsxJaSk6nUyzeF4MvW-ULoPYa8kd9w,4987
+ lattifai/config/caption.py,sha256=D4sKNUestwFessU1nZrUqCTsIzYPgpTg12SZlm0HzbQ,15200
+ lattifai/config/client.py,sha256=46b816MiYja3Uan_3wjnhtqDr0M6T-FqEygJ3e50IZc,1664
+ lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
+ lattifai/config/media.py,sha256=nxvgC7zeLsthCARPPUbnK2eMJY8R1d-1XgiAsy8kroA,15568
+ lattifai/config/transcription.py,sha256=_gPJD6cob_jWNdf841nBHhAqJGCxS6PfSyvx2W_vPcM,3082
+ lattifai/diarization/__init__.py,sha256=-ZZ_a5hIQgnlHIOehCTtmVmWOWC2H6eOhSs4AcVtRtk,1782
+ lattifai/diarization/lattifai.py,sha256=tCnFL6ywITqeKR8YoCsYvyJxNoIwoC6GsnI9zkXNB-Q,3128
+ lattifai/transcription/__init__.py,sha256=vMHciyCEPKhhfM3KjMCeDqnyxU1oghF8g5o5SvpnT_4,2669
+ lattifai/transcription/base.py,sha256=A2qnocdRCCbvy8mKP0f3K3mx3ZvYyxVXir3aJ2iU19s,4592
+ lattifai/transcription/gemini.py,sha256=LJSQt9nGqQdEG6ZFXoHWltumyMEM7-Ezy8ss0iPJb7k,12414
+ lattifai/transcription/lattifai.py,sha256=Sik4IyvzdqIMCvgkaxCzqvo-j7u0MfX045z8AJunjhg,3556
+ lattifai/transcription/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
+ lattifai/transcription/prompts/__init__.py,sha256=G9b42COaCYv3sPPNkHsGDLOMBuVGKt4mXGYal_BYtYQ,1351
+ lattifai/transcription/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
+ lattifai/transcription/prompts/gemini/transcription_gem.txt,sha256=cljzZ--BDgnnKzqVCakr-fTp2Xk38UOsUquvruNX-LU,4600
+ lattifai/workflow/__init__.py,sha256=INpQgc9gZ2Fp-aTHcpR3TEHGtEtPzjOB8T7-jLzVM0E,1547
+ lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
+ lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
+ lattifai/workflow/file_manager.py,sha256=yc29Vb7JNUMJ9rwM_YjkAHfDInl8HMVAl9A7z7XiIOU,32974
+ lattifai/youtube/__init__.py,sha256=_uO3KCx-t6I-JaYFpcYLYpvkbmEOOni3xBqGEbExg68,1587
+ lattifai/youtube/client.py,sha256=aEOnd8jp4w1ZZkTfxppl7yz2TVdxMTkb8lGCqQxLqxE,47128
+ lattifai/youtube/types.py,sha256=80RgBmvM4tRbxqyNv9GU6hr9vPp_yhKrK0RJ_vG2h4E,472
+ lattifai-1.2.2.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
+ lattifai-1.2.2.dist-info/METADATA,sha256=NncEA5sSiDyj2DfZCt251tLSranIOn2Gd4KD2D0Q118,19757
+ lattifai-1.2.2.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+ lattifai-1.2.2.dist-info/entry_points.txt,sha256=MfoqXNjXrhD7VMApHgaHmAECTcGVUMUiR0uqnTg7Ads,502
+ lattifai-1.2.2.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+ lattifai-1.2.2.dist-info/RECORD,,
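
Each RECORD row is `path,hash,size`, where the hash is the urlsafe-base64 SHA-256 digest of the file with the trailing `=` padding stripped; the RECORD file lists itself with empty hash and size fields. A minimal verification sketch, assuming the 1.2.2 wheel has been extracted to a local directory (`check_record_entry` is an illustrative helper, not part of lattifai):

import base64
import hashlib
from pathlib import Path

def check_record_entry(wheel_root: Path, row: str) -> bool:
    """Return True if the file named in a RECORD row matches its recorded hash and size."""
    path, hash_field, size = row.rstrip().rsplit(",", 2)
    if not hash_field:  # the RECORD file lists itself without hash/size
        return True
    data = (wheel_root / path).read_bytes()
    if size and int(size) != len(data):
        return False
    algo, _, expected = hash_field.partition("=")
    digest = hashlib.new(algo, data).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode() == expected

# e.g. check_record_entry(Path("lattifai-1.2.2-extracted"),
#          "lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659")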
{lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt
@@ -1,11 +1,10 @@
  [console_scripts]
  lai-align = lattifai.cli.alignment:main
- lai-app-install = lattifai.cli.app_installer:main
  lai-diarize = lattifai.cli.diarization:main
- lai-server = lattifai.cli.server:main
  lai-transcribe = lattifai.cli.transcribe:main
  lai-youtube = lattifai.cli.youtube:main
  laicap-convert = lattifai.cli.caption:main_convert
+ laicap-diff = lattifai.cli.caption:main_diff
  laicap-normalize = lattifai.cli.caption:main_normalize
  laicap-shift = lattifai.cli.caption:main_shift
 
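
Each console_scripts line maps a command name to a `module:function` target, so this hunk removes the `lai-app-install` and `lai-server` commands and adds `laicap-diff`, wired to `lattifai.cli.caption:main_diff`. A short sketch of how such an entry point resolves at runtime, using only the standard library (Python 3.10+; assumes lattifai 1.2.2 is installed):

from importlib.metadata import entry_points

# Look up the new command among the installed console scripts.
matches = [ep for ep in entry_points(group="console_scripts") if ep.name == "laicap-diff"]
if matches:
    ep = matches[0]
    print(ep.value)        # "lattifai.cli.caption:main_diff"
    main_diff = ep.load()  # the callable the generated `laicap-diff` script invokes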
lattifai/caption/gemini_reader.py
@@ -1,371 +0,0 @@
- """Reader for YouTube transcript files with speaker labels and timestamps."""
-
- import re
- from dataclasses import dataclass
- from pathlib import Path
- from typing import List, Optional
-
- from lhotse.utils import Pathlike
-
- from .supervision import Supervision
-
-
- @dataclass
- class GeminiSegment:
-     """Represents a segment in the Gemini transcript with metadata."""
-
-     text: str
-     timestamp: Optional[float] = None
-     speaker: Optional[str] = None
-     section: Optional[str] = None
-     segment_type: str = "dialogue"  # 'dialogue', 'event', or 'section_header'
-     line_number: int = 0
-
-     @property
-     def start(self) -> float:
-         """Return start time in seconds."""
-         return self.timestamp if self.timestamp is not None else 0.0
-
-
- class GeminiReader:
-     """Parser for YouTube transcript format with speaker labels and timestamps."""
-
-     # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
-     TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
-     SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
-     SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[:：])\*\*\s*(.+)$")
-     EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
-     INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
-
-     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
-     YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
-     YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")
-
-     @classmethod
-     def parse_timestamp(cls, *args) -> float:
-         """Convert timestamp to seconds.
-
-         Supports both HH:MM:SS and MM:SS formats.
-         Args can be (hours, minutes, seconds) or (minutes, seconds).
-         Can also accept a single argument which is seconds.
-         """
-         if len(args) == 3:
-             # HH:MM:SS format
-             hours, minutes, seconds = args
-             return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
-         elif len(args) == 2:
-             # MM:SS format
-             minutes, seconds = args
-             return int(minutes) * 60 + int(seconds)
-         elif len(args) == 1:
-             # Direct seconds (from YouTube &t= parameter)
-             return int(args[0])
-         else:
-             raise ValueError(f"Invalid timestamp args: {args}")
-
-     @classmethod
-     def read(
-         cls,
-         transcript_path: Pathlike,
-         include_events: bool = False,
-         include_sections: bool = False,
-     ) -> List[GeminiSegment]:
-         """Parse YouTube transcript file and return list of transcript segments.
-
-         Args:
-             transcript_path: Path to the transcript file
-             include_events: Whether to include event descriptions like [Applause]
-             include_sections: Whether to include section headers
-
-         Returns:
-             List of GeminiSegment objects with all metadata
-         """
-         transcript_path = Path(transcript_path).expanduser().resolve()
-         if not transcript_path.exists():
-             raise FileNotFoundError(f"Transcript file not found: {transcript_path}")
-
-         segments: List[GeminiSegment] = []
-         current_section = None
-         current_speaker = None
-
-         with open(transcript_path, "r", encoding="utf-8") as f:
-             lines = f.readlines()
-
-         for line_num, line in enumerate(lines, start=1):
-             line = line.strip()
-             if not line:
-                 continue
-
-             # Skip table of contents
-             if line.startswith("* ["):
-                 continue
-             if line.startswith("## Table of Contents"):
-                 continue
-
-             # Parse section headers
-             section_match = cls.SECTION_HEADER_PATTERN.match(line)
-             if section_match:
-                 hours, minutes, seconds, section_title = section_match.groups()
-                 timestamp = cls.parse_timestamp(hours, minutes, seconds)
-                 current_section = section_title.strip()
-                 if include_sections:
-                     segments.append(
-                         GeminiSegment(
-                             text=section_title.strip(),
-                             timestamp=timestamp,
-                             section=current_section,
-                             segment_type="section_header",
-                             line_number=line_num,
-                         )
-                     )
-                 continue
-
-             # Parse YouTube format section headers: ## [[MM:SS](URL&t=seconds)] Title
-             youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
-             if youtube_section_match:
-                 minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
-                 # Use the URL seconds for more accuracy
-                 timestamp = cls.parse_timestamp(url_seconds)
-                 current_section = section_title.strip()
-                 if include_sections:
-                     segments.append(
-                         GeminiSegment(
-                             text=section_title.strip(),
-                             timestamp=timestamp,
-                             section=current_section,
-                             segment_type="section_header",
-                             line_number=line_num,
-                         )
-                     )
-                 continue
-
-             # Parse event descriptions [event] [HH:MM:SS] or [MM:SS]
-             event_match = cls.EVENT_PATTERN.match(line)
-             if event_match:
-                 groups = event_match.groups()
-                 event_text = groups[0]
-                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                 if groups[1] is not None:  # HH:MM:SS format
-                     timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                 elif groups[4] is not None:  # MM:SS format
-                     timestamp = cls.parse_timestamp(groups[4], groups[5])
-                 else:
-                     timestamp = None
-
-                 if include_events and timestamp is not None:
-                     segments.append(
-                         GeminiSegment(
-                             text=event_text.strip(),
-                             timestamp=timestamp,
-                             section=current_section,
-                             segment_type="event",
-                             line_number=line_num,
-                         )
-                     )
-                 continue
-
-             # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS] or [MM:SS]
-             speaker_match = cls.SPEAKER_PATTERN.match(line)
-             if speaker_match:
-                 speaker, text_with_timestamp = speaker_match.groups()
-                 current_speaker = speaker.strip()
-
-                 # Extract timestamp from the end of the text
-                 timestamp_match = cls.INLINE_TIMESTAMP_PATTERN.match(text_with_timestamp.strip())
-                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
-
-                 if timestamp_match:
-                     groups = timestamp_match.groups()
-                     text = groups[0]
-                     # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                     if groups[1] is not None:  # HH:MM:SS format
-                         timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                     elif groups[4] is not None:  # MM:SS format
-                         timestamp = cls.parse_timestamp(groups[4], groups[5])
-                     else:
-                         timestamp = None
-                 elif youtube_match:
-                     groups = youtube_match.groups()
-                     text = groups[0]
-                     # Extract seconds from URL parameter
-                     url_seconds = groups[3]
-                     timestamp = cls.parse_timestamp(url_seconds)
-                 else:
-                     text = text_with_timestamp.strip()
-                     timestamp = None
-
-                 segments.append(
-                     GeminiSegment(
-                         text=text.strip(),
-                         timestamp=timestamp,
-                         speaker=current_speaker,
-                         section=current_section,
-                         segment_type="dialogue",
-                         line_number=line_num,
-                     )
-                 )
-                 current_speaker = None  # Reset speaker after use
-                 continue
-
-             # Parse plain text with timestamp at the end
-             inline_match = cls.INLINE_TIMESTAMP_PATTERN.match(line)
-             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
-
-             if inline_match:
-                 groups = inline_match.groups()
-                 text = groups[0]
-                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                 if groups[1] is not None:  # HH:MM:SS format
-                     timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                 elif groups[4] is not None:  # MM:SS format
-                     timestamp = cls.parse_timestamp(groups[4], groups[5])
-                 else:
-                     timestamp = None
-
-                 segments.append(
-                     GeminiSegment(
-                         text=text.strip(),
-                         timestamp=timestamp,
-                         speaker=current_speaker,
-                         section=current_section,
-                         segment_type="dialogue",
-                         line_number=line_num,
-                     )
-                 )
-                 continue
-             elif youtube_inline_match:
-                 groups = youtube_inline_match.groups()
-                 text = groups[0]
-                 # Extract seconds from URL parameter
-                 url_seconds = groups[3]
-                 timestamp = cls.parse_timestamp(url_seconds)
-
-                 segments.append(
-                     GeminiSegment(
-                         text=text.strip(),
-                         timestamp=timestamp,
-                         speaker=current_speaker,
-                         section=current_section,
-                         segment_type="dialogue",
-                         line_number=line_num,
-                     )
-                 )
-                 continue
-
-             # Skip markdown headers and other formatting
-             if line.startswith("#"):
-                 continue
-
-         return segments
-
-     @classmethod
-     def extract_for_alignment(
-         cls,
-         transcript_path: Pathlike,
-         merge_consecutive: bool = False,
-         min_duration: float = 0.1,
-         merge_max_gap: float = 2.0,
-     ) -> List[Supervision]:
-         """Extract text segments for forced alignment.
-
-         This extracts only dialogue segments (not events or section headers)
-         and converts them to Supervision objects suitable for alignment.
-
-         Args:
-             transcript_path: Path to the transcript file
-             merge_consecutive: Whether to merge consecutive segments from same speaker
-             min_duration: Minimum duration for a segment
-             merge_max_gap: Maximum time gap (seconds) to merge consecutive segments
-
-         Returns:
-             List of Supervision objects ready for alignment
-         """
-         segments = cls.read(transcript_path, include_events=False, include_sections=False)
-
-         # Filter to only dialogue segments with timestamps
-         dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]
-
-         if not dialogue_segments:
-             raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
-
-         # Sort by timestamp
-         dialogue_segments.sort(key=lambda x: x.timestamp)
-
-         # Convert to Supervision objects
-         supervisions: List[Supervision] = []
-
-         for i, segment in enumerate(dialogue_segments):
-             # Estimate duration based on next segment
-             if i < len(dialogue_segments) - 1:
-                 duration = dialogue_segments[i + 1].timestamp - segment.timestamp
-             else:
-                 # Last segment: estimate based on text length (rough heuristic)
-                 words = len(segment.text.split())
-                 duration = words * 0.3  # ~0.3 seconds per word
-
-             supervisions.append(
-                 Supervision(
-                     text=segment.text,
-                     start=segment.timestamp,
-                     duration=max(duration, min_duration),
-                     id=f"segment_{i:05d}",
-                     speaker=segment.speaker,
-                 )
-             )
-
-         # Optionally merge consecutive segments from same speaker
-         if merge_consecutive:
-             merged = []
-             current_speaker = None
-             current_texts = []
-             current_start = None
-             last_end_time = None
-
-             for i, (segment, sup) in enumerate(zip(dialogue_segments, supervisions)):
-                 # Check if we should merge with previous segment
-                 should_merge = False
-                 if segment.speaker == current_speaker and current_start is not None:
-                     # Same speaker - check time gap
-                     time_gap = sup.start - last_end_time if last_end_time else 0
-                     if time_gap <= merge_max_gap:
-                         should_merge = True
-
-                 if should_merge:
-                     # Same speaker within time threshold, accumulate
-                     current_texts.append(segment.text)
-                     last_end_time = sup.start + sup.duration
-                 else:
-                     # Different speaker or gap too large, save previous segment
-                     if current_texts:
-                         merged_text = " ".join(current_texts)
-                         merged.append(
-                             Supervision(
-                                 text=merged_text,
-                                 start=current_start,
-                                 duration=last_end_time - current_start,
-                                 id=f"merged_{len(merged):05d}",
-                             )
-                         )
-                     current_speaker = segment.speaker
-                     current_texts = [segment.text]
-                     current_start = sup.start
-                     last_end_time = sup.start + sup.duration
-
-             # Add final segment
-             if current_texts:
-                 merged_text = " ".join(current_texts)
-                 merged.append(
-                     Supervision(
-                         text=merged_text,
-                         start=current_start,
-                         duration=last_end_time - current_start,
-                         id=f"merged_{len(merged):05d}",
-                     )
-                 )
-
-             supervisions = merged
-
-         return supervisions
-
-
- __all__ = ["GeminiReader", "GeminiSegment"]
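
For reference, a sketch of how the removed 1.2.0 GeminiReader API was typically driven, based only on the class shown above; the transcript path is hypothetical. In 1.2.2, Gemini transcript handling appears to move into the new lattifai/caption/formats/gemini.py module listed in the files-changed table.

from lattifai.caption.gemini_reader import GeminiReader  # present only in 1.2.0

# Parse a Gemini-style YouTube transcript into segments with speakers and timestamps.
segments = GeminiReader.read("talk.gemini.md", include_events=False, include_sections=True)
for seg in segments:
    print(f"{seg.start:>8.1f}s  {seg.speaker or '-':<12} {seg.text}")

# Dialogue-only Supervision objects with estimated durations, ready for forced alignment.
supervisions = GeminiReader.extract_for_alignment("talk.gemini.md", merge_consecutive=True)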
@@ -1,173 +0,0 @@
1
- """Writer for YouTube transcript files with corrected timestamps from alignment."""
2
-
3
- import re
4
- from pathlib import Path
5
- from typing import Dict, List, Optional
6
-
7
- from lhotse.utils import Pathlike
8
-
9
- from .gemini_reader import GeminiReader, GeminiSegment
10
- from .supervision import Supervision
11
-
12
-
13
- class GeminiWriter:
14
- """Writer for updating YouTube transcript timestamps based on alignment results."""
15
-
16
- @staticmethod
17
- def format_timestamp(seconds: float) -> str:
18
- """Convert seconds to [HH:MM:SS] format."""
19
- hours = int(seconds // 3600)
20
- minutes = int((seconds % 3600) // 60)
21
- secs = int(seconds % 60)
22
- return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"
23
-
24
- @classmethod
25
- def update_timestamps(
26
- cls,
27
- original_transcript: Pathlike,
28
- aligned_supervisions: List[Supervision],
29
- output_path: Pathlike,
30
- timestamp_mapping: Optional[Dict[int, float]] = None,
31
- ) -> Pathlike:
32
- """Update transcript file with corrected timestamps from alignment.
33
-
34
- Args:
35
- original_transcript: Path to the original transcript file
36
- aligned_supervisions: List of aligned Supervision objects with corrected timestamps
37
- output_path: Path to write the updated transcript
38
- timestamp_mapping: Optional manual mapping from line_number to new timestamp
39
-
40
- Returns:
41
- Path to the output file
42
- """
43
- original_path = Path(original_transcript)
44
- output_path = Path(output_path)
45
-
46
- # Read original file
47
- with open(original_path, "r", encoding="utf-8") as f:
48
- lines = f.readlines()
49
-
50
- # Parse original segments to get line numbers
51
- original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
52
-
53
- # Create mapping from line number to new timestamp
54
- if timestamp_mapping is None:
55
- timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)
56
-
57
- # Update timestamps in lines
58
- updated_lines = []
59
- for line_num, line in enumerate(lines, start=1):
60
- if line_num in timestamp_mapping:
61
- new_timestamp = timestamp_mapping[line_num]
62
- updated_line = cls._replace_timestamp(line, new_timestamp)
63
- updated_lines.append(updated_line)
64
- else:
65
- updated_lines.append(line)
66
-
67
- # Write updated content
68
- output_path.parent.mkdir(parents=True, exist_ok=True)
69
- with open(output_path, "w", encoding="utf-8") as f:
70
- f.writelines(updated_lines)
71
-
72
- return output_path
73
-
74
- @classmethod
75
- def _create_timestamp_mapping(
76
- cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
77
- ) -> Dict[int, float]:
78
- """Create mapping from line numbers to new timestamps based on alignment.
79
-
80
- This performs text matching between original segments and aligned supervisions
81
- to determine which timestamps should be updated.
82
- """
83
- mapping = {}
84
-
85
- # Create a simple text-based matching
86
- dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]
87
-
88
- # Try to match based on text content
89
- for aligned_sup in aligned_supervisions:
90
- aligned_text = aligned_sup.text.strip()
91
-
92
- # Find best matching original segment
93
- best_match = None
94
- best_score = 0
95
-
96
- for orig_seg in dialogue_segments:
97
- orig_text = orig_seg.text.strip()
98
-
99
- # Simple text similarity (could be improved with fuzzy matching)
100
- if aligned_text == orig_text:
101
- best_match = orig_seg
102
- best_score = 1.0
103
- break
104
- elif aligned_text in orig_text or orig_text in aligned_text:
105
- score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
106
- if score > best_score:
107
- best_score = score
108
- best_match = orig_seg
109
-
110
- # If we found a good match, update the mapping
111
- if best_match and best_score > 0.8:
112
- mapping[best_match.line_number] = aligned_sup.start
113
-
114
- return mapping
115
-
116
- @classmethod
117
- def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
118
- """Replace timestamp in a line with new timestamp."""
119
- new_ts_str = cls.format_timestamp(new_timestamp)
120
-
121
- # Replace timestamp patterns
122
- # Pattern 1: [HH:MM:SS] at the end or in brackets
123
- line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)
124
-
125
- return line
126
-
127
- @classmethod
128
- def write_aligned_transcript(
129
- cls,
130
- aligned_supervisions: List[Supervision],
131
- output_path: Pathlike,
132
- include_word_timestamps: bool = False,
133
- ) -> Pathlike:
134
- """Write a new transcript file from aligned supervisions.
135
-
136
- This creates a simplified transcript format with accurate timestamps.
137
-
138
- Args:
139
- aligned_supervisions: List of aligned Supervision objects
140
- output_path: Path to write the transcript
141
- include_word_timestamps: Whether to include word-level timestamps if available
142
-
143
- Returns:
144
- Path to the output file
145
- """
146
- output_path = Path(output_path)
147
- output_path.parent.mkdir(parents=True, exist_ok=True)
148
-
149
- with open(output_path, "w", encoding="utf-8") as f:
150
- f.write("# Aligned Transcript\n\n")
151
-
152
- for i, sup in enumerate(aligned_supervisions):
153
- # Write segment with timestamp
154
- start_ts = cls.format_timestamp(sup.start)
155
- f.write(f"{start_ts} {sup.text}\n")
156
-
157
- # Optionally write word-level timestamps
158
- if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
159
- if "word" in sup.alignment:
160
- f.write(" Words: ")
161
- word_parts = []
162
- for word_info in sup.alignment["word"]:
163
- word_ts = cls.format_timestamp(word_info["start"])
164
- word_parts.append(f'{word_info["symbol"]}{word_ts}')
165
- f.write(" ".join(word_parts))
166
- f.write("\n")
167
-
168
- f.write("\n")
169
-
170
- return output_path
171
-
172
-
173
- __all__ = ["GeminiWriter"]
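
And the counterpart for the removed writer, again only as an illustration of the 1.2.0 API shown above; the paths and the example Supervision are hypothetical.

from lattifai.caption.gemini_writer import GeminiWriter  # present only in 1.2.0
from lattifai.caption.supervision import Supervision

print(GeminiWriter.format_timestamp(3725.0))  # -> "[01:02:05]"

# Pretend alignment output for a single segment.
aligned = [Supervision(text="Welcome back to the show.", start=12.0, duration=2.5, id="segment_00000")]

# Rewrite [HH:MM:SS] stamps in the original transcript using the aligned start times.
GeminiWriter.update_timestamps(
    original_transcript="talk.gemini.md",
    aligned_supervisions=aligned,
    output_path="talk.aligned.gemini.md",
)

# Or emit a fresh "[HH:MM:SS] text" transcript directly from the supervisions.
GeminiWriter.write_aligned_transcript(aligned, "talk.aligned.txt")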