lattifai-1.2.0-py3-none-any.whl → lattifai-1.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
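The only renamed module in this release is the caption text parser, which moves into the new parsers/ subpackage. A quick, hypothetical check of the new dotted path (module names come from the file list above; assumes lattifai 1.2.2 is installed):

import importlib.util

# text_parser.py moved from lattifai/caption/ into lattifai/caption/parsers/,
# so the old dotted path no longer resolves in 1.2.2.
old_spec = importlib.util.find_spec("lattifai.caption.text_parser")           # expected: None
new_spec = importlib.util.find_spec("lattifai.caption.parsers.text_parser")   # expected: a ModuleSpec
print(old_spec, new_spec)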
lattifai-1.2.2.dist-info/RECORD
@@ -0,0 +1,76 @@
+lattifai/__init__.py,sha256=RXa1IK8Qt6jsAnLlxecOCZmREqv2naXx6T1Fy0g6pqU,1953
+lattifai/audio2.py,sha256=P3N8_BwiscbetzDbkbj-n8BcMu2vWD6-MvtQvGwWWf0,17448
+lattifai/client.py,sha256=pTtpOZRpc3weXkjKZ_-FZLsbbs1CrzVqM4fVqRjiYTc,17179
+lattifai/errors.py,sha256=LyWRGVhQ6Ak2CYn9FBYAPRgQ7_VHpxzNsXI31HXD--s,11291
+lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
+lattifai/mixin.py,sha256=0I-rwvZumaYt8KFTfiVPT2wpXs-JfTEnLOPTdI5r-bM,26115
+lattifai/types.py,sha256=SjYBfwrCBOXlICvH04niFQJ7OzTx7oTaa_npfRkB67U,659
+lattifai/utils.py,sha256=5LeunAN0OQ1jWoKMIThpXSEOxFYD2dCRTdsglosodUU,7963
+lattifai/alignment/__init__.py,sha256=ggOF4MlbnBD7U9yrcyRb1caBR3se_KGA87cfYlyX8RY,450
+lattifai/alignment/lattice1_aligner.py,sha256=WG3mJM4fGyYkY7FdqhPE10yXwBzhdj2TkS-6LF8F_9k,6463
+lattifai/alignment/lattice1_worker.py,sha256=hQbZTgncPq3n-b_l-gUPDPfm460EwuZTKveErgWLWNk,10891
+lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
+lattifai/alignment/punctuation.py,sha256=qLcvuXhBzoEa6bznWZiAB5TAxR6eLr_ZV-PnnCY90UA,1218
+lattifai/alignment/segmenter.py,sha256=0s0eABe0rLAo7eNfl0l5e_knxmZba_BjabPdqsRD45E,6284
+lattifai/alignment/sentence_splitter.py,sha256=2ORvfAgW9yQaqHjts2zlSFjTiNDZF5Fhd5KZX19QWe0,14781
+lattifai/alignment/text_align.py,sha256=PN7RNL5d6jim96zeCUdfdFEdGw--I8zc0kcgWIFJIXU,14910
+lattifai/alignment/tokenizer.py,sha256=AQzXbJ_AW8cg4CAd5TVl1Qd3zH56uy9whX9LVFQ4AaA,17835
+lattifai/caption/__init__.py,sha256=tyIsUvCbImw_qrhp0Nxxrk4vt9szJIPlRcTBviOQkuI,2641
+lattifai/caption/caption.py,sha256=2PHLRDG0Ks4JMl6jNDeXlrI1kpYinektbZ15GwwTcFI,23479
+lattifai/caption/standardize.py,sha256=1pAB8BmziTqYkgj7abCXUcNmNwSV1EAR0PrmbpAEipU,21491
+lattifai/caption/supervision.py,sha256=DRrM8lfKU_x9aVBcLG6xnT0xIJrnc8jzHpzcSwQOg8c,905
+lattifai/caption/utils.py,sha256=YOdJCXhy-6DdrZUkdrJHuPE9sbEHsE9Z7-Vdo4Z5lLY,14406
+lattifai/caption/formats/__init__.py,sha256=UGl7Y0ybMf_F4hiNMMwoKOrpWNxs5m2tiD5zkbwjokY,5240
+lattifai/caption/formats/base.py,sha256=gGeKLKEAB2Hs05R09QMkq5KlXMIQ7bbkUhLct40IcU8,6314
+lattifai/caption/formats/gemini.py,sha256=zIxK7Vxo2YB1eXFiWnsNrz9WSx69lMN0rL-Sd3r57iI,29389
+lattifai/caption/formats/json.py,sha256=s3tFWMUzkWx_IL46phPJnFbJW476Yh_GsxcwD7Q_Mfw,6416
+lattifai/caption/formats/lrc.py,sha256=CWS9wD3Tp6xuvF5gP1nTlVBsBXYnu59_4m4zNRem-c0,11084
+lattifai/caption/formats/pysubs2.py,sha256=eOTQKRbsFStW9gTHaxuAtD7ha1OnrFdqcNLsjdxpHRY,22591
+lattifai/caption/formats/sbv.py,sha256=QUgm5lfRSc2IGSX955yQ7rPiSlaYrOHvniUigr2sF7Y,4520
+lattifai/caption/formats/tabular.py,sha256=HHoiif2yIkMjO9f1bRNAk5Pc0CfkA1mtCFHk5sdLocM,11701
+lattifai/caption/formats/textgrid.py,sha256=m2jMTwLhQa8gbm0Fs1XyEUdiHJaSfCxB9jrYsdk8j7Q,6659
+lattifai/caption/formats/ttml.py,sha256=pJ_wd9pX-MwOhDFMeAHnCpbDiLtIhs888rkW26T7w9Y,23236
+lattifai/caption/formats/vtt.py,sha256=f5OWqsr-2-ddW3CnMtiiqYKQz-hLYRn2B9WM_eT4-AM,17102
+lattifai/caption/formats/nle/__init__.py,sha256=DPBnWPtxEKCC0J_8DCeTyXULPgkrqFT2jbKvkazAx0s,257
+lattifai/caption/formats/nle/audition.py,sha256=65ipbUPdwgvNcUA--dQuisWCbmlt6nHPRbSdl4UUF2Q,18076
+lattifai/caption/formats/nle/avid.py,sha256=UQwFlN4-Myly-kXZxuJTu-7IunEN2_PtAcK9YGQVpMA,14403
+lattifai/caption/formats/nle/fcpxml.py,sha256=76NL6PeIR3KAG1BZscAZdoFJr5wcNdoS4j3VZsOxFV8,18317
+lattifai/caption/formats/nle/premiere.py,sha256=Y2nXSWxI0J0YhV3iHJ9jDrFs0S_5sX32_fEi9SJyVt0,21319
+lattifai/caption/parsers/__init__.py,sha256=z1JMr47FVl7CGbBDg92PKj9RabKktJIUv9iTmmKfEes,227
+lattifai/caption/parsers/text_parser.py,sha256=rQv-aedTWowBe7crvYEOrHqrgKdpNBPcM8HeU-jElHY,4793
+lattifai/cli/__init__.py,sha256=PdqoCTqRSFSWrqL3FjBTa5VzJy_e6Rq0OzyT7YkyHpc,541
+lattifai/cli/alignment.py,sha256=06em-Uaf6NhSz1ce4dwT2r8n56NrtibR7ZsSkmc18Kc,5954
+lattifai/cli/caption.py,sha256=b2mSVFVgL76b4FB5UoJ7AW5iGzPfKiWiLhbM96z_QoA,10371
+lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
+lattifai/cli/transcribe.py,sha256=YhEalG3TQRK7esAN5SOZUQPwIk3TAI9ZknO8cW8C21Q,8038
+lattifai/cli/youtube.py,sha256=CqAxSC_sErslnrnx2RSwAHc7INKET0wLG9Mf_144O-A,6238
+lattifai/config/__init__.py,sha256=JOOn2WbvWXBN6a_3fSNt24W7xnJY7wn8RyNLa0XIY3s,724
+lattifai/config/alignment.py,sha256=ObWf896GGLfP4jsxJaSk6nUyzeF4MvW-ULoPYa8kd9w,4987
+lattifai/config/caption.py,sha256=D4sKNUestwFessU1nZrUqCTsIzYPgpTg12SZlm0HzbQ,15200
+lattifai/config/client.py,sha256=46b816MiYja3Uan_3wjnhtqDr0M6T-FqEygJ3e50IZc,1664
+lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
+lattifai/config/media.py,sha256=nxvgC7zeLsthCARPPUbnK2eMJY8R1d-1XgiAsy8kroA,15568
+lattifai/config/transcription.py,sha256=_gPJD6cob_jWNdf841nBHhAqJGCxS6PfSyvx2W_vPcM,3082
+lattifai/diarization/__init__.py,sha256=-ZZ_a5hIQgnlHIOehCTtmVmWOWC2H6eOhSs4AcVtRtk,1782
+lattifai/diarization/lattifai.py,sha256=tCnFL6ywITqeKR8YoCsYvyJxNoIwoC6GsnI9zkXNB-Q,3128
+lattifai/transcription/__init__.py,sha256=vMHciyCEPKhhfM3KjMCeDqnyxU1oghF8g5o5SvpnT_4,2669
+lattifai/transcription/base.py,sha256=A2qnocdRCCbvy8mKP0f3K3mx3ZvYyxVXir3aJ2iU19s,4592
+lattifai/transcription/gemini.py,sha256=LJSQt9nGqQdEG6ZFXoHWltumyMEM7-Ezy8ss0iPJb7k,12414
+lattifai/transcription/lattifai.py,sha256=Sik4IyvzdqIMCvgkaxCzqvo-j7u0MfX045z8AJunjhg,3556
+lattifai/transcription/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
+lattifai/transcription/prompts/__init__.py,sha256=G9b42COaCYv3sPPNkHsGDLOMBuVGKt4mXGYal_BYtYQ,1351
+lattifai/transcription/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
+lattifai/transcription/prompts/gemini/transcription_gem.txt,sha256=cljzZ--BDgnnKzqVCakr-fTp2Xk38UOsUquvruNX-LU,4600
+lattifai/workflow/__init__.py,sha256=INpQgc9gZ2Fp-aTHcpR3TEHGtEtPzjOB8T7-jLzVM0E,1547
+lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
+lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
+lattifai/workflow/file_manager.py,sha256=yc29Vb7JNUMJ9rwM_YjkAHfDInl8HMVAl9A7z7XiIOU,32974
+lattifai/youtube/__init__.py,sha256=_uO3KCx-t6I-JaYFpcYLYpvkbmEOOni3xBqGEbExg68,1587
+lattifai/youtube/client.py,sha256=aEOnd8jp4w1ZZkTfxppl7yz2TVdxMTkb8lGCqQxLqxE,47128
+lattifai/youtube/types.py,sha256=80RgBmvM4tRbxqyNv9GU6hr9vPp_yhKrK0RJ_vG2h4E,472
+lattifai-1.2.2.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
+lattifai-1.2.2.dist-info/METADATA,sha256=NncEA5sSiDyj2DfZCt251tLSranIOn2Gd4KD2D0Q118,19757
+lattifai-1.2.2.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+lattifai-1.2.2.dist-info/entry_points.txt,sha256=MfoqXNjXrhD7VMApHgaHmAECTcGVUMUiR0uqnTg7Ads,502
+lattifai-1.2.2.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+lattifai-1.2.2.dist-info/RECORD,,
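Each RECORD entry follows the standard wheel convention: path, urlsafe-base64 SHA-256 digest with padding stripped, and size in bytes. A minimal sketch of how one of the entries above could be re-derived from an installed file (the helper name is illustrative, not part of the package):

import base64
import hashlib
from pathlib import Path

def record_entry(path: Path) -> str:
    # Rebuild a "path,sha256=<digest>,<size>" line as it appears in RECORD.
    data = path.read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path.as_posix()},sha256={digest},{len(data)}"

# For the first entry above, this should reproduce:
# lattifai/__init__.py,sha256=RXa1IK8Qt6jsAnLlxecOCZmREqv2naXx6T1Fy0g6pqU,1953
print(record_entry(Path("lattifai/__init__.py")))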
{lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt
@@ -1,11 +1,10 @@
 [console_scripts]
 lai-align = lattifai.cli.alignment:main
-lai-app-install = lattifai.cli.app_installer:main
 lai-diarize = lattifai.cli.diarization:main
-lai-server = lattifai.cli.server:main
 lai-transcribe = lattifai.cli.transcribe:main
 lai-youtube = lattifai.cli.youtube:main
 laicap-convert = lattifai.cli.caption:main_convert
+laicap-diff = lattifai.cli.caption:main_diff
 laicap-normalize = lattifai.cli.caption:main_normalize
 laicap-shift = lattifai.cli.caption:main_shift
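The 1.2.2 wheel drops the lai-app-install and lai-server launchers and adds laicap-diff. A small sketch of how any of these console scripts resolves at runtime, using only importlib.metadata (Python 3.10+) and the names from the hunk above:

from importlib.metadata import entry_points

# Look up the new caption-diff launcher the same way a generated console
# script would: group "console_scripts", name "laicap-diff".
(ep,) = [e for e in entry_points(group="console_scripts") if e.name == "laicap-diff"]
main_diff = ep.load()   # imports lattifai.cli.caption and returns its main_diff callable
main_diff()             # roughly what running `laicap-diff` does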
lattifai/caption/gemini_reader.py
@@ -1,371 +0,0 @@
-"""Reader for YouTube transcript files with speaker labels and timestamps."""
-
-import re
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Optional
-
-from lhotse.utils import Pathlike
-
-from .supervision import Supervision
-
-
-@dataclass
-class GeminiSegment:
-    """Represents a segment in the Gemini transcript with metadata."""
-
-    text: str
-    timestamp: Optional[float] = None
-    speaker: Optional[str] = None
-    section: Optional[str] = None
-    segment_type: str = "dialogue"  # 'dialogue', 'event', or 'section_header'
-    line_number: int = 0
-
-    @property
-    def start(self) -> float:
-        """Return start time in seconds."""
-        return self.timestamp if self.timestamp is not None else 0.0
-
-
-class GeminiReader:
-    """Parser for YouTube transcript format with speaker labels and timestamps."""
-
-    # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
-    TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
-    SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
-    SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
-    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
-    INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
-
-    # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
-    YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
-    YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")
-
-    @classmethod
-    def parse_timestamp(cls, *args) -> float:
-        """Convert timestamp to seconds.
-
-        Supports both HH:MM:SS and MM:SS formats.
-        Args can be (hours, minutes, seconds) or (minutes, seconds).
-        Can also accept a single argument which is seconds.
-        """
-        if len(args) == 3:
-            # HH:MM:SS format
-            hours, minutes, seconds = args
-            return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
-        elif len(args) == 2:
-            # MM:SS format
-            minutes, seconds = args
-            return int(minutes) * 60 + int(seconds)
-        elif len(args) == 1:
-            # Direct seconds (from YouTube &t= parameter)
-            return int(args[0])
-        else:
-            raise ValueError(f"Invalid timestamp args: {args}")
-
-    @classmethod
-    def read(
-        cls,
-        transcript_path: Pathlike,
-        include_events: bool = False,
-        include_sections: bool = False,
-    ) -> List[GeminiSegment]:
-        """Parse YouTube transcript file and return list of transcript segments.
-
-        Args:
-            transcript_path: Path to the transcript file
-            include_events: Whether to include event descriptions like [Applause]
-            include_sections: Whether to include section headers
-
-        Returns:
-            List of GeminiSegment objects with all metadata
-        """
-        transcript_path = Path(transcript_path).expanduser().resolve()
-        if not transcript_path.exists():
-            raise FileNotFoundError(f"Transcript file not found: {transcript_path}")
-
-        segments: List[GeminiSegment] = []
-        current_section = None
-        current_speaker = None
-
-        with open(transcript_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-
-        for line_num, line in enumerate(lines, start=1):
-            line = line.strip()
-            if not line:
-                continue
-
-            # Skip table of contents
-            if line.startswith("* ["):
-                continue
-            if line.startswith("## Table of Contents"):
-                continue
-
-            # Parse section headers
-            section_match = cls.SECTION_HEADER_PATTERN.match(line)
-            if section_match:
-                hours, minutes, seconds, section_title = section_match.groups()
-                timestamp = cls.parse_timestamp(hours, minutes, seconds)
-                current_section = section_title.strip()
-                if include_sections:
-                    segments.append(
-                        GeminiSegment(
-                            text=section_title.strip(),
-                            timestamp=timestamp,
-                            section=current_section,
-                            segment_type="section_header",
-                            line_number=line_num,
-                        )
-                    )
-                continue
-
-            # Parse YouTube format section headers: ## [[MM:SS](URL&t=seconds)] Title
-            youtube_section_match = cls.YOUTUBE_SECTION_PATTERN.match(line)
-            if youtube_section_match:
-                minutes, seconds, url_seconds, section_title = youtube_section_match.groups()
-                # Use the URL seconds for more accuracy
-                timestamp = cls.parse_timestamp(url_seconds)
-                current_section = section_title.strip()
-                if include_sections:
-                    segments.append(
-                        GeminiSegment(
-                            text=section_title.strip(),
-                            timestamp=timestamp,
-                            section=current_section,
-                            segment_type="section_header",
-                            line_number=line_num,
-                        )
-                    )
-                continue
-
-            # Parse event descriptions [event] [HH:MM:SS] or [MM:SS]
-            event_match = cls.EVENT_PATTERN.match(line)
-            if event_match:
-                groups = event_match.groups()
-                event_text = groups[0]
-                # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                if groups[1] is not None:  # HH:MM:SS format
-                    timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                elif groups[4] is not None:  # MM:SS format
-                    timestamp = cls.parse_timestamp(groups[4], groups[5])
-                else:
-                    timestamp = None
-
-                if include_events and timestamp is not None:
-                    segments.append(
-                        GeminiSegment(
-                            text=event_text.strip(),
-                            timestamp=timestamp,
-                            section=current_section,
-                            segment_type="event",
-                            line_number=line_num,
-                        )
-                    )
-                continue
-
-            # Parse speaker dialogue: **Speaker:** Text [HH:MM:SS] or [MM:SS]
-            speaker_match = cls.SPEAKER_PATTERN.match(line)
-            if speaker_match:
-                speaker, text_with_timestamp = speaker_match.groups()
-                current_speaker = speaker.strip()
-
-                # Extract timestamp from the end of the text
-                timestamp_match = cls.INLINE_TIMESTAMP_PATTERN.match(text_with_timestamp.strip())
-                youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
-
-                if timestamp_match:
-                    groups = timestamp_match.groups()
-                    text = groups[0]
-                    # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                    if groups[1] is not None:  # HH:MM:SS format
-                        timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                    elif groups[4] is not None:  # MM:SS format
-                        timestamp = cls.parse_timestamp(groups[4], groups[5])
-                    else:
-                        timestamp = None
-                elif youtube_match:
-                    groups = youtube_match.groups()
-                    text = groups[0]
-                    # Extract seconds from URL parameter
-                    url_seconds = groups[3]
-                    timestamp = cls.parse_timestamp(url_seconds)
-                else:
-                    text = text_with_timestamp.strip()
-                    timestamp = None
-
-                segments.append(
-                    GeminiSegment(
-                        text=text.strip(),
-                        timestamp=timestamp,
-                        speaker=current_speaker,
-                        section=current_section,
-                        segment_type="dialogue",
-                        line_number=line_num,
-                    )
-                )
-                current_speaker = None  # Reset speaker after use
-                continue
-
-            # Parse plain text with timestamp at the end
-            inline_match = cls.INLINE_TIMESTAMP_PATTERN.match(line)
-            youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
-
-            if inline_match:
-                groups = inline_match.groups()
-                text = groups[0]
-                # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
-                if groups[1] is not None:  # HH:MM:SS format
-                    timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
-                elif groups[4] is not None:  # MM:SS format
-                    timestamp = cls.parse_timestamp(groups[4], groups[5])
-                else:
-                    timestamp = None
-
-                segments.append(
-                    GeminiSegment(
-                        text=text.strip(),
-                        timestamp=timestamp,
-                        speaker=current_speaker,
-                        section=current_section,
-                        segment_type="dialogue",
-                        line_number=line_num,
-                    )
-                )
-                continue
-            elif youtube_inline_match:
-                groups = youtube_inline_match.groups()
-                text = groups[0]
-                # Extract seconds from URL parameter
-                url_seconds = groups[3]
-                timestamp = cls.parse_timestamp(url_seconds)
-
-                segments.append(
-                    GeminiSegment(
-                        text=text.strip(),
-                        timestamp=timestamp,
-                        speaker=current_speaker,
-                        section=current_section,
-                        segment_type="dialogue",
-                        line_number=line_num,
-                    )
-                )
-                continue
-
-            # Skip markdown headers and other formatting
-            if line.startswith("#"):
-                continue
-
-        return segments
-
-    @classmethod
-    def extract_for_alignment(
-        cls,
-        transcript_path: Pathlike,
-        merge_consecutive: bool = False,
-        min_duration: float = 0.1,
-        merge_max_gap: float = 2.0,
-    ) -> List[Supervision]:
-        """Extract text segments for forced alignment.
-
-        This extracts only dialogue segments (not events or section headers)
-        and converts them to Supervision objects suitable for alignment.
-
-        Args:
-            transcript_path: Path to the transcript file
-            merge_consecutive: Whether to merge consecutive segments from same speaker
-            min_duration: Minimum duration for a segment
-            merge_max_gap: Maximum time gap (seconds) to merge consecutive segments
-
-        Returns:
-            List of Supervision objects ready for alignment
-        """
-        segments = cls.read(transcript_path, include_events=False, include_sections=False)
-
-        # Filter to only dialogue segments with timestamps
-        dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]
-
-        if not dialogue_segments:
-            raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
-
-        # Sort by timestamp
-        dialogue_segments.sort(key=lambda x: x.timestamp)
-
-        # Convert to Supervision objects
-        supervisions: List[Supervision] = []
-
-        for i, segment in enumerate(dialogue_segments):
-            # Estimate duration based on next segment
-            if i < len(dialogue_segments) - 1:
-                duration = dialogue_segments[i + 1].timestamp - segment.timestamp
-            else:
-                # Last segment: estimate based on text length (rough heuristic)
-                words = len(segment.text.split())
-                duration = words * 0.3  # ~0.3 seconds per word
-
-            supervisions.append(
-                Supervision(
-                    text=segment.text,
-                    start=segment.timestamp,
-                    duration=max(duration, min_duration),
-                    id=f"segment_{i:05d}",
-                    speaker=segment.speaker,
-                )
-            )
-
-        # Optionally merge consecutive segments from same speaker
-        if merge_consecutive:
-            merged = []
-            current_speaker = None
-            current_texts = []
-            current_start = None
-            last_end_time = None
-
-            for i, (segment, sup) in enumerate(zip(dialogue_segments, supervisions)):
-                # Check if we should merge with previous segment
-                should_merge = False
-                if segment.speaker == current_speaker and current_start is not None:
-                    # Same speaker - check time gap
-                    time_gap = sup.start - last_end_time if last_end_time else 0
-                    if time_gap <= merge_max_gap:
-                        should_merge = True
-
-                if should_merge:
-                    # Same speaker within time threshold, accumulate
-                    current_texts.append(segment.text)
-                    last_end_time = sup.start + sup.duration
-                else:
-                    # Different speaker or gap too large, save previous segment
-                    if current_texts:
-                        merged_text = " ".join(current_texts)
-                        merged.append(
-                            Supervision(
-                                text=merged_text,
-                                start=current_start,
-                                duration=last_end_time - current_start,
-                                id=f"merged_{len(merged):05d}",
-                            )
-                        )
-                    current_speaker = segment.speaker
-                    current_texts = [segment.text]
-                    current_start = sup.start
-                    last_end_time = sup.start + sup.duration
-
-            # Add final segment
-            if current_texts:
-                merged_text = " ".join(current_texts)
-                merged.append(
-                    Supervision(
-                        text=merged_text,
-                        start=current_start,
-                        duration=last_end_time - current_start,
-                        id=f"merged_{len(merged):05d}",
-                    )
-                )
-
-            supervisions = merged
-
-        return supervisions
-
-
-__all__ = ["GeminiReader", "GeminiSegment"]
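The removed GeminiReader bundled the transcript regexes and the timestamp arithmetic in one class; in 1.2.2 that responsibility moves to lattifai/caption/formats/gemini.py. For reference, a self-contained sketch of just the timestamp logic from the deleted module (regex and conversion copied from the hunk above; the driver at the bottom is illustrative only):

import re

# Matches [HH:MM:SS] or [MM:SS], exactly as in the deleted GeminiReader.
TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")

def parse_timestamp(*args) -> float:
    if len(args) == 3:            # HH:MM:SS
        hours, minutes, seconds = args
        return int(hours) * 3600 + int(minutes) * 60 + int(seconds)
    if len(args) == 2:            # MM:SS
        minutes, seconds = args
        return int(minutes) * 60 + int(seconds)
    if len(args) == 1:            # raw seconds from a YouTube &t= parameter
        return int(args[0])
    raise ValueError(f"Invalid timestamp args: {args}")

match = TIMESTAMP_PATTERN.search("**Host:** Welcome back [01:02:03]")
groups = [g for g in match.groups() if g is not None]
assert parse_timestamp(*groups) == 3723   # 1 h + 2 min + 3 s
assert parse_timestamp("2", "05") == 125  # MM:SS form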
lattifai/caption/gemini_writer.py
@@ -1,173 +0,0 @@
-"""Writer for YouTube transcript files with corrected timestamps from alignment."""
-
-import re
-from pathlib import Path
-from typing import Dict, List, Optional
-
-from lhotse.utils import Pathlike
-
-from .gemini_reader import GeminiReader, GeminiSegment
-from .supervision import Supervision
-
-
-class GeminiWriter:
-    """Writer for updating YouTube transcript timestamps based on alignment results."""
-
-    @staticmethod
-    def format_timestamp(seconds: float) -> str:
-        """Convert seconds to [HH:MM:SS] format."""
-        hours = int(seconds // 3600)
-        minutes = int((seconds % 3600) // 60)
-        secs = int(seconds % 60)
-        return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"
-
-    @classmethod
-    def update_timestamps(
-        cls,
-        original_transcript: Pathlike,
-        aligned_supervisions: List[Supervision],
-        output_path: Pathlike,
-        timestamp_mapping: Optional[Dict[int, float]] = None,
-    ) -> Pathlike:
-        """Update transcript file with corrected timestamps from alignment.
-
-        Args:
-            original_transcript: Path to the original transcript file
-            aligned_supervisions: List of aligned Supervision objects with corrected timestamps
-            output_path: Path to write the updated transcript
-            timestamp_mapping: Optional manual mapping from line_number to new timestamp
-
-        Returns:
-            Path to the output file
-        """
-        original_path = Path(original_transcript)
-        output_path = Path(output_path)
-
-        # Read original file
-        with open(original_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-
-        # Parse original segments to get line numbers
-        original_segments = GeminiReader.read(original_transcript, include_events=True, include_sections=True)
-
-        # Create mapping from line number to new timestamp
-        if timestamp_mapping is None:
-            timestamp_mapping = cls._create_timestamp_mapping(original_segments, aligned_supervisions)
-
-        # Update timestamps in lines
-        updated_lines = []
-        for line_num, line in enumerate(lines, start=1):
-            if line_num in timestamp_mapping:
-                new_timestamp = timestamp_mapping[line_num]
-                updated_line = cls._replace_timestamp(line, new_timestamp)
-                updated_lines.append(updated_line)
-            else:
-                updated_lines.append(line)
-
-        # Write updated content
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(output_path, "w", encoding="utf-8") as f:
-            f.writelines(updated_lines)
-
-        return output_path
-
-    @classmethod
-    def _create_timestamp_mapping(
-        cls, original_segments: List[GeminiSegment], aligned_supervisions: List[Supervision]
-    ) -> Dict[int, float]:
-        """Create mapping from line numbers to new timestamps based on alignment.
-
-        This performs text matching between original segments and aligned supervisions
-        to determine which timestamps should be updated.
-        """
-        mapping = {}
-
-        # Create a simple text-based matching
-        dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]
-
-        # Try to match based on text content
-        for aligned_sup in aligned_supervisions:
-            aligned_text = aligned_sup.text.strip()
-
-            # Find best matching original segment
-            best_match = None
-            best_score = 0
-
-            for orig_seg in dialogue_segments:
-                orig_text = orig_seg.text.strip()
-
-                # Simple text similarity (could be improved with fuzzy matching)
-                if aligned_text == orig_text:
-                    best_match = orig_seg
-                    best_score = 1.0
-                    break
-                elif aligned_text in orig_text or orig_text in aligned_text:
-                    score = min(len(aligned_text), len(orig_text)) / max(len(aligned_text), len(orig_text))
-                    if score > best_score:
-                        best_score = score
-                        best_match = orig_seg
-
-            # If we found a good match, update the mapping
-            if best_match and best_score > 0.8:
-                mapping[best_match.line_number] = aligned_sup.start
-
-        return mapping
-
-    @classmethod
-    def _replace_timestamp(cls, line: str, new_timestamp: float) -> str:
-        """Replace timestamp in a line with new timestamp."""
-        new_ts_str = cls.format_timestamp(new_timestamp)
-
-        # Replace timestamp patterns
-        # Pattern 1: [HH:MM:SS] at the end or in brackets
-        line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)
-
-        return line
-
-    @classmethod
-    def write_aligned_transcript(
-        cls,
-        aligned_supervisions: List[Supervision],
-        output_path: Pathlike,
-        include_word_timestamps: bool = False,
-    ) -> Pathlike:
-        """Write a new transcript file from aligned supervisions.
-
-        This creates a simplified transcript format with accurate timestamps.
-
-        Args:
-            aligned_supervisions: List of aligned Supervision objects
-            output_path: Path to write the transcript
-            include_word_timestamps: Whether to include word-level timestamps if available
-
-        Returns:
-            Path to the output file
-        """
-        output_path = Path(output_path)
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(output_path, "w", encoding="utf-8") as f:
-            f.write("# Aligned Transcript\n\n")
-
-            for i, sup in enumerate(aligned_supervisions):
-                # Write segment with timestamp
-                start_ts = cls.format_timestamp(sup.start)
-                f.write(f"{start_ts} {sup.text}\n")
-
-                # Optionally write word-level timestamps
-                if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
-                    if "word" in sup.alignment:
-                        f.write(" Words: ")
-                        word_parts = []
-                        for word_info in sup.alignment["word"]:
-                            word_ts = cls.format_timestamp(word_info["start"])
-                            word_parts.append(f'{word_info["symbol"]}{word_ts}')
-                        f.write(" ".join(word_parts))
-                        f.write("\n")
-
-                f.write("\n")
-
-        return output_path
-
-
-__all__ = ["GeminiWriter"]
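The removed GeminiWriter rewrote [HH:MM:SS] markers in place after alignment; its formatting helper is easy to check in isolation. A minimal sketch reproducing format_timestamp and the regex substitution from the hunk above (the sample transcript line is invented for illustration):

import re

def format_timestamp(seconds: float) -> str:
    # Same arithmetic as the deleted GeminiWriter.format_timestamp.
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

line = "**Host:** Welcome back [00:01:40]\n"
corrected = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", format_timestamp(101.7), line)
assert corrected == "**Host:** Welcome back [00:01:41]\n"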