lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +61 -47
- lattifai/alignment/__init__.py +6 -0
- lattifai/alignment/lattice1_aligner.py +119 -0
- lattifai/alignment/lattice1_worker.py +185 -0
- lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
- lattifai/alignment/segmenter.py +166 -0
- lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
- lattifai/audio2.py +211 -0
- lattifai/caption/__init__.py +20 -0
- lattifai/caption/caption.py +1275 -0
- lattifai/{io → caption}/gemini_reader.py +30 -30
- lattifai/{io → caption}/gemini_writer.py +17 -17
- lattifai/{io → caption}/supervision.py +4 -3
- lattifai/caption/text_parser.py +145 -0
- lattifai/cli/__init__.py +17 -0
- lattifai/cli/alignment.py +153 -0
- lattifai/cli/caption.py +204 -0
- lattifai/cli/server.py +19 -0
- lattifai/cli/transcribe.py +197 -0
- lattifai/cli/youtube.py +128 -0
- lattifai/client.py +460 -251
- lattifai/config/__init__.py +20 -0
- lattifai/config/alignment.py +73 -0
- lattifai/config/caption.py +178 -0
- lattifai/config/client.py +46 -0
- lattifai/config/diarization.py +67 -0
- lattifai/config/media.py +335 -0
- lattifai/config/transcription.py +84 -0
- lattifai/diarization/__init__.py +5 -0
- lattifai/diarization/lattifai.py +89 -0
- lattifai/errors.py +98 -91
- lattifai/logging.py +116 -0
- lattifai/mixin.py +552 -0
- lattifai/server/app.py +420 -0
- lattifai/transcription/__init__.py +76 -0
- lattifai/transcription/base.py +108 -0
- lattifai/transcription/gemini.py +219 -0
- lattifai/transcription/lattifai.py +103 -0
- lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
- lattifai/types.py +30 -0
- lattifai/utils.py +16 -44
- lattifai/workflow/__init__.py +22 -0
- lattifai/workflow/agents.py +6 -0
- lattifai/{workflows → workflow}/base.py +22 -22
- lattifai/{workflows → workflow}/file_manager.py +239 -215
- lattifai/workflow/youtube.py +564 -0
- lattifai-1.0.0.dist-info/METADATA +736 -0
- lattifai-1.0.0.dist-info/RECORD +52 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
- lattifai-1.0.0.dist-info/entry_points.txt +13 -0
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
- lattifai/base_client.py +0 -126
- lattifai/bin/__init__.py +0 -3
- lattifai/bin/agent.py +0 -325
- lattifai/bin/align.py +0 -296
- lattifai/bin/cli_base.py +0 -25
- lattifai/bin/subtitle.py +0 -210
- lattifai/io/__init__.py +0 -42
- lattifai/io/reader.py +0 -85
- lattifai/io/text_parser.py +0 -75
- lattifai/io/utils.py +0 -15
- lattifai/io/writer.py +0 -90
- lattifai/tokenizer/__init__.py +0 -3
- lattifai/workers/__init__.py +0 -3
- lattifai/workers/lattice1_alpha.py +0 -284
- lattifai/workflows/__init__.py +0 -34
- lattifai/workflows/agents.py +0 -10
- lattifai/workflows/gemini.py +0 -167
- lattifai/workflows/prompts/README.md +0 -22
- lattifai/workflows/prompts/gemini/README.md +0 -24
- lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
- lattifai/workflows/youtube.py +0 -931
- lattifai-0.4.5.dist-info/METADATA +0 -808
- lattifai-0.4.5.dist-info/RECORD +0 -39
- lattifai-0.4.5.dist-info/entry_points.txt +0 -3
- {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""Reader for YouTube transcript files with speaker labels and timestamps."""
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
-
from dataclasses import dataclass
|
|
4
|
+
from dataclasses import dataclass
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import List, Optional
|
|
6
|
+
from typing import List, Optional
|
|
7
7
|
|
|
8
8
|
from lhotse.utils import Pathlike
|
|
9
9
|
|
|
@@ -18,7 +18,7 @@ class GeminiSegment:
|
|
|
18
18
|
timestamp: Optional[float] = None
|
|
19
19
|
speaker: Optional[str] = None
|
|
20
20
|
section: Optional[str] = None
|
|
21
|
-
segment_type: str =
|
|
21
|
+
segment_type: str = "dialogue" # 'dialogue', 'event', or 'section_header'
|
|
22
22
|
line_number: int = 0
|
|
23
23
|
|
|
24
24
|
@property
|
|
@@ -31,15 +31,15 @@ class GeminiReader:
|
|
|
31
31
|
"""Parser for YouTube transcript format with speaker labels and timestamps."""
|
|
32
32
|
|
|
33
33
|
# Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
|
|
34
|
-
TIMESTAMP_PATTERN = re.compile(r
|
|
35
|
-
SECTION_HEADER_PATTERN = re.compile(r
|
|
36
|
-
SPEAKER_PATTERN = re.compile(r
|
|
37
|
-
EVENT_PATTERN = re.compile(r
|
|
38
|
-
INLINE_TIMESTAMP_PATTERN = re.compile(r
|
|
34
|
+
TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
|
|
35
|
+
SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
|
|
36
|
+
SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
|
|
37
|
+
EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
|
|
38
|
+
INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
|
|
39
39
|
|
|
40
40
|
# New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
|
|
41
|
-
YOUTUBE_SECTION_PATTERN = re.compile(r
|
|
42
|
-
YOUTUBE_INLINE_PATTERN = re.compile(r
|
|
41
|
+
YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
|
|
42
|
+
YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")
|
|
43
43
|
|
|
44
44
|
@classmethod
|
|
45
45
|
def parse_timestamp(cls, *args) -> float:
|
|
@@ -61,7 +61,7 @@ class GeminiReader:
|
|
|
61
61
|
# Direct seconds (from YouTube &t= parameter)
|
|
62
62
|
return int(args[0])
|
|
63
63
|
else:
|
|
64
|
-
raise ValueError(f
|
|
64
|
+
raise ValueError(f"Invalid timestamp args: {args}")
|
|
65
65
|
|
|
66
66
|
@classmethod
|
|
67
67
|
def read(
|
|
@@ -82,13 +82,13 @@ class GeminiReader:
|
|
|
82
82
|
"""
|
|
83
83
|
transcript_path = Path(transcript_path).expanduser().resolve()
|
|
84
84
|
if not transcript_path.exists():
|
|
85
|
-
raise FileNotFoundError(f
|
|
85
|
+
raise FileNotFoundError(f"Transcript file not found: {transcript_path}")
|
|
86
86
|
|
|
87
87
|
segments: List[GeminiSegment] = []
|
|
88
88
|
current_section = None
|
|
89
89
|
current_speaker = None
|
|
90
90
|
|
|
91
|
-
with open(transcript_path,
|
|
91
|
+
with open(transcript_path, "r", encoding="utf-8") as f:
|
|
92
92
|
lines = f.readlines()
|
|
93
93
|
|
|
94
94
|
for line_num, line in enumerate(lines, start=1):
|
|
@@ -97,9 +97,9 @@ class GeminiReader:
|
|
|
97
97
|
continue
|
|
98
98
|
|
|
99
99
|
# Skip table of contents
|
|
100
|
-
if line.startswith(
|
|
100
|
+
if line.startswith("* ["):
|
|
101
101
|
continue
|
|
102
|
-
if line.startswith(
|
|
102
|
+
if line.startswith("## Table of Contents"):
|
|
103
103
|
continue
|
|
104
104
|
|
|
105
105
|
# Parse section headers
|
|
@@ -114,7 +114,7 @@ class GeminiReader:
|
|
|
114
114
|
text=section_title.strip(),
|
|
115
115
|
timestamp=timestamp,
|
|
116
116
|
section=current_section,
|
|
117
|
-
segment_type=
|
|
117
|
+
segment_type="section_header",
|
|
118
118
|
line_number=line_num,
|
|
119
119
|
)
|
|
120
120
|
)
|
|
@@ -133,7 +133,7 @@ class GeminiReader:
|
|
|
133
133
|
text=section_title.strip(),
|
|
134
134
|
timestamp=timestamp,
|
|
135
135
|
section=current_section,
|
|
136
|
-
segment_type=
|
|
136
|
+
segment_type="section_header",
|
|
137
137
|
line_number=line_num,
|
|
138
138
|
)
|
|
139
139
|
)
|
|
@@ -158,7 +158,7 @@ class GeminiReader:
|
|
|
158
158
|
text=event_text.strip(),
|
|
159
159
|
timestamp=timestamp,
|
|
160
160
|
section=current_section,
|
|
161
|
-
segment_type=
|
|
161
|
+
segment_type="event",
|
|
162
162
|
line_number=line_num,
|
|
163
163
|
)
|
|
164
164
|
)
|
|
@@ -200,7 +200,7 @@ class GeminiReader:
|
|
|
200
200
|
timestamp=timestamp,
|
|
201
201
|
speaker=current_speaker,
|
|
202
202
|
section=current_section,
|
|
203
|
-
segment_type=
|
|
203
|
+
segment_type="dialogue",
|
|
204
204
|
line_number=line_num,
|
|
205
205
|
)
|
|
206
206
|
)
|
|
@@ -228,7 +228,7 @@ class GeminiReader:
|
|
|
228
228
|
timestamp=timestamp,
|
|
229
229
|
speaker=current_speaker,
|
|
230
230
|
section=current_section,
|
|
231
|
-
segment_type=
|
|
231
|
+
segment_type="dialogue",
|
|
232
232
|
line_number=line_num,
|
|
233
233
|
)
|
|
234
234
|
)
|
|
@@ -246,14 +246,14 @@ class GeminiReader:
|
|
|
246
246
|
timestamp=timestamp,
|
|
247
247
|
speaker=current_speaker,
|
|
248
248
|
section=current_section,
|
|
249
|
-
segment_type=
|
|
249
|
+
segment_type="dialogue",
|
|
250
250
|
line_number=line_num,
|
|
251
251
|
)
|
|
252
252
|
)
|
|
253
253
|
continue
|
|
254
254
|
|
|
255
255
|
# Skip markdown headers and other formatting
|
|
256
|
-
if line.startswith(
|
|
256
|
+
if line.startswith("#"):
|
|
257
257
|
continue
|
|
258
258
|
|
|
259
259
|
return segments
|
|
@@ -283,10 +283,10 @@ class GeminiReader:
|
|
|
283
283
|
segments = cls.read(transcript_path, include_events=False, include_sections=False)
|
|
284
284
|
|
|
285
285
|
# Filter to only dialogue segments with timestamps
|
|
286
|
-
dialogue_segments = [s for s in segments if s.segment_type ==
|
|
286
|
+
dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]
|
|
287
287
|
|
|
288
288
|
if not dialogue_segments:
|
|
289
|
-
raise ValueError(f
|
|
289
|
+
raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
|
|
290
290
|
|
|
291
291
|
# Sort by timestamp
|
|
292
292
|
dialogue_segments.sort(key=lambda x: x.timestamp)
|
|
@@ -308,7 +308,7 @@ class GeminiReader:
|
|
|
308
308
|
text=segment.text,
|
|
309
309
|
start=segment.timestamp,
|
|
310
310
|
duration=max(duration, min_duration),
|
|
311
|
-
id=f
|
|
311
|
+
id=f"segment_{i:05d}",
|
|
312
312
|
speaker=segment.speaker,
|
|
313
313
|
)
|
|
314
314
|
)
|
|
@@ -337,13 +337,13 @@ class GeminiReader:
|
|
|
337
337
|
else:
|
|
338
338
|
# Different speaker or gap too large, save previous segment
|
|
339
339
|
if current_texts:
|
|
340
|
-
merged_text =
|
|
340
|
+
merged_text = " ".join(current_texts)
|
|
341
341
|
merged.append(
|
|
342
342
|
Supervision(
|
|
343
343
|
text=merged_text,
|
|
344
344
|
start=current_start,
|
|
345
345
|
duration=last_end_time - current_start,
|
|
346
|
-
id=f
|
|
346
|
+
id=f"merged_{len(merged):05d}",
|
|
347
347
|
)
|
|
348
348
|
)
|
|
349
349
|
current_speaker = segment.speaker
|
|
@@ -353,13 +353,13 @@ class GeminiReader:
|
|
|
353
353
|
|
|
354
354
|
# Add final segment
|
|
355
355
|
if current_texts:
|
|
356
|
-
merged_text =
|
|
356
|
+
merged_text = " ".join(current_texts)
|
|
357
357
|
merged.append(
|
|
358
358
|
Supervision(
|
|
359
359
|
text=merged_text,
|
|
360
360
|
start=current_start,
|
|
361
361
|
duration=last_end_time - current_start,
|
|
362
|
-
id=f
|
|
362
|
+
id=f"merged_{len(merged):05d}",
|
|
363
363
|
)
|
|
364
364
|
)
|
|
365
365
|
|
|
@@ -368,4 +368,4 @@ class GeminiReader:
|
|
|
368
368
|
return supervisions
|
|
369
369
|
|
|
370
370
|
|
|
371
|
-
__all__ = [
|
|
371
|
+
__all__ = ["GeminiReader", "GeminiSegment"]
|
|
@@ -19,7 +19,7 @@ class GeminiWriter:
|
|
|
19
19
|
hours = int(seconds // 3600)
|
|
20
20
|
minutes = int((seconds % 3600) // 60)
|
|
21
21
|
secs = int(seconds % 60)
|
|
22
|
-
return f
|
|
22
|
+
return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"
|
|
23
23
|
|
|
24
24
|
@classmethod
|
|
25
25
|
def update_timestamps(
|
|
@@ -44,7 +44,7 @@ class GeminiWriter:
|
|
|
44
44
|
output_path = Path(output_path)
|
|
45
45
|
|
|
46
46
|
# Read original file
|
|
47
|
-
with open(original_path,
|
|
47
|
+
with open(original_path, "r", encoding="utf-8") as f:
|
|
48
48
|
lines = f.readlines()
|
|
49
49
|
|
|
50
50
|
# Parse original segments to get line numbers
|
|
@@ -66,7 +66,7 @@ class GeminiWriter:
|
|
|
66
66
|
|
|
67
67
|
# Write updated content
|
|
68
68
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
69
|
-
with open(output_path,
|
|
69
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
70
70
|
f.writelines(updated_lines)
|
|
71
71
|
|
|
72
72
|
return output_path
|
|
@@ -83,7 +83,7 @@ class GeminiWriter:
|
|
|
83
83
|
mapping = {}
|
|
84
84
|
|
|
85
85
|
# Create a simple text-based matching
|
|
86
|
-
dialogue_segments = [s for s in original_segments if s.segment_type ==
|
|
86
|
+
dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]
|
|
87
87
|
|
|
88
88
|
# Try to match based on text content
|
|
89
89
|
for aligned_sup in aligned_supervisions:
|
|
@@ -120,7 +120,7 @@ class GeminiWriter:
|
|
|
120
120
|
|
|
121
121
|
# Replace timestamp patterns
|
|
122
122
|
# Pattern 1: [HH:MM:SS] at the end or in brackets
|
|
123
|
-
line = re.sub(r
|
|
123
|
+
line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)
|
|
124
124
|
|
|
125
125
|
return line
|
|
126
126
|
|
|
@@ -146,28 +146,28 @@ class GeminiWriter:
|
|
|
146
146
|
output_path = Path(output_path)
|
|
147
147
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
148
148
|
|
|
149
|
-
with open(output_path,
|
|
150
|
-
f.write(
|
|
149
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
150
|
+
f.write("# Aligned Transcript\n\n")
|
|
151
151
|
|
|
152
152
|
for i, sup in enumerate(aligned_supervisions):
|
|
153
153
|
# Write segment with timestamp
|
|
154
154
|
start_ts = cls.format_timestamp(sup.start)
|
|
155
|
-
f.write(f
|
|
155
|
+
f.write(f"{start_ts} {sup.text}\n")
|
|
156
156
|
|
|
157
157
|
# Optionally write word-level timestamps
|
|
158
|
-
if include_word_timestamps and hasattr(sup,
|
|
159
|
-
if
|
|
160
|
-
f.write(
|
|
158
|
+
if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
|
|
159
|
+
if "word" in sup.alignment:
|
|
160
|
+
f.write(" Words: ")
|
|
161
161
|
word_parts = []
|
|
162
|
-
for word_info in sup.alignment[
|
|
163
|
-
word_ts = cls.format_timestamp(word_info[
|
|
162
|
+
for word_info in sup.alignment["word"]:
|
|
163
|
+
word_ts = cls.format_timestamp(word_info["start"])
|
|
164
164
|
word_parts.append(f'{word_info["symbol"]}{word_ts}')
|
|
165
|
-
f.write(
|
|
166
|
-
f.write(
|
|
165
|
+
f.write(" ".join(word_parts))
|
|
166
|
+
f.write("\n")
|
|
167
167
|
|
|
168
|
-
f.write(
|
|
168
|
+
f.write("\n")
|
|
169
169
|
|
|
170
170
|
return output_path
|
|
171
171
|
|
|
172
172
|
|
|
173
|
-
__all__ = [
|
|
173
|
+
__all__ = ["GeminiWriter"]
|
|
@@ -24,10 +24,11 @@ class Supervision(SupervisionSegment):
|
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
26
|
text: Optional[str] = None
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
speaker: Optional[str] = None
|
|
28
|
+
id: str = ""
|
|
29
|
+
recording_id: str = ""
|
|
29
30
|
start: Seconds = 0.0
|
|
30
31
|
duration: Seconds = 0.0
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
__all__ = [
|
|
34
|
+
__all__ = ["Supervision"]
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from typing import Optional, Tuple
|
|
4
|
+
|
|
5
|
+
# Timestamp pattern: [start-end] text
# Example: [1.23-4.56] Hello world
TIMESTAMP_PATTERN = re.compile(r"^\[([\d.]+)-([\d.]+)\]\s*(.*)$")

# Speaker marker formats commonly seen in broadcast captions.
# NOTE(review): the published rendering collapsed full-width characters into
# their half-width twins (">>|>>|>|>" and "[::]"); restored here as
# half-width/full-width pairs — confirm against upstream caption sources.
SPEAKER_PATTERN = re.compile(r"((?:>>|＞＞|>|＞).*?[:：])\s*(.*)")

# Transcriber Output Example:
# 26:19.919 --> 26:34.921
# [SPEAKER_01]: 越来越多的科技巨头入...
SPEAKER_LATTIFAI = re.compile(r"(^\[SPEAKER_.*?\][:：])\s*(.*)")

# All-caps speaker labels, e.g.:
# NISHTHA BHATIA: Hey, everyone.
# DIETER: Oh, hey, Nishtha.
# GEMINI: That might
SPEAKER_PATTERN2 = re.compile(r"^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[:：])\s*(.*)$")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def normalize_text(text: str) -> str:
    """Normalize caption text.

    Steps:
    - Decode common HTML entities (``&amp;``, ``&lt;``, ...)
    - Replace ASS/SSA hard line breaks (``\\N``) and ellipses with spaces
    - Convert curly apostrophes to straight ones in common contractions
    - Collapse runs of whitespace into a single space and strip the ends

    Args:
        text: Raw caption text; may be empty or ``None``-ish.

    Returns:
        The normalized text, or ``""`` for falsy input.
    """
    if not text:
        return ""

    # # Remove HTML tags first (replace with space to avoid concatenation)
    # text = re.sub(r"<[^>]+>", " ", text)

    # NOTE(review): the published rendering showed these keys already decoded
    # (e.g. "&": "&"), which would make every replacement a no-op and
    # contradict the docstring; entity-form keys restored — confirm against
    # the original sources.
    html_entities = {
        "&amp;": "&",
        "&lt;": "<",
        "&gt;": ">",
        "&quot;": '"',
        "&#39;": "'",
        "&nbsp;": " ",
        "\u00a0": " ",  # literal non-breaking space
        "\\N": " ",  # ASS/SSA hard line break
        "…": " ",  # replace ellipsis with space to avoid merging words
    }
    for entity, char in html_entities.items():
        text = text.replace(entity, char)

    # Convert curly apostrophes to straight apostrophes for common English contractions
    text = re.sub(r"([a-zA-Z])’([tsdm]|ll|re|ve)\b", r"\1'\2", text, flags=re.IGNORECASE)
    text = re.sub(r"([0-9])’([s])\b", r"\1'\2", text, flags=re.IGNORECASE)

    # Collapse whitespace (after replacements)
    text = re.sub(r"\s+", " ", text)

    return text.strip()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def parse_speaker_text(line) -> Tuple[Optional[str], str]:
    """Parse a line of text to extract speaker and content.

    Args:
        line: A single caption line.

    Returns:
        Tuple of ``(speaker, text)``. ``speaker`` is ``None`` and the line is
        returned unchanged when no speaker marker is recognized.
    """
    # Fast path: no half-width or full-width colon means no speaker marker.
    # NOTE(review): the published rendering showed this condition duplicated
    # (":" twice); the full-width colon "：" is restored here — confirm.
    if ":" not in line and "：" not in line:
        return None, line

    # Lines starting with ">>": strip the leading name and colon.
    match = SPEAKER_PATTERN.match(line)
    if match:
        return match.group(1).strip(), match.group(2).strip()

    # Transcriber-style "[SPEAKER_xx]:" labels.
    match = SPEAKER_LATTIFAI.match(line)
    if match:
        assert len(match.groups()) == 2, match.groups()
        if not match.group(1):
            logging.error(f"ParseSub LINE [{line}]")
        else:
            return match.group(1).strip(), match.group(2).strip()

    # All-caps "NAME:" labels.
    match = SPEAKER_PATTERN2.match(line)
    if match:
        assert len(match.groups()) == 2, match.groups()
        return match.group(1).strip(), match.group(2).strip()

    return None, line
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def parse_timestamp_text(line: str) -> Tuple[Optional[float], Optional[float], str]:
    """Extract an optional ``[start-end]`` timestamp prefix from a line.

    Format: ``[start-end] text``, e.g. ``[1.23-4.56] Hello world``.

    Args:
        line: Input line to parse.

    Returns:
        ``(start_time, end_time, text)`` where the times are seconds, or
        ``(None, None, line)`` when no parsable timestamp prefix is found.
    """
    match = TIMESTAMP_PATTERN.match(line)
    if match is None:
        return None, None, line
    try:
        return float(match.group(1)), float(match.group(2)), match.group(3).strip()
    except ValueError:
        # Malformed numbers (e.g. "1.2.3"): treat the whole line as plain text.
        return None, None, line
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
    # Manual demo for the speaker-marker patterns.
    # NOTE(review): the first assignment below was dead in the published
    # version (immediately overwritten); kept as a comment for reference:
    #   pattern = re.compile(r">>\s*(.*?)\s*[:：]\s*(.*)")
    pattern = re.compile(r"(>>.*?[:：])\s*(.*)")

    test_strings = [
        ">>Key: Value",
        ">> Key with space : Value with space ",
        ">> 全角键 ： 全角值",  # full-width key/colon/value — NOTE(review): colon restored to full-width
        ">>Key:Value xxx. >>Key:Value",
    ]

    for text in test_strings:
        match = pattern.match(text)
        if match:
            print(f"Input: '{text}'")
            print(f"Speaker: '{match.group(1)}'")
            print(f"Content: '{match.group(2)}'")
            print("-------------")

    # SPEAKER_PATTERN2: all-caps "NAME:" speaker labels.
    test_strings2 = ["NISHTHA BHATIA: Hey, everyone.", "DIETER: Oh, hey, Nishtha.", "GEMINI: That might"]
    for text in test_strings2:
        match = SPEAKER_PATTERN2.match(text)
        if match:
            print(f" Input: '{text}'")
            print(f"Speaker: '{match.group(1)}'")
            print(f"Content: '{match.group(2)}'")
            print("-------------")
        else:
            raise ValueError(f"No match for: '{text}'")
|
lattifai/cli/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""CLI module for LattifAI with nemo_run entry points."""
|
|
2
|
+
|
|
3
|
+
import nemo_run as run # noqa: F401
|
|
4
|
+
|
|
5
|
+
# Import and re-export entrypoints at package level so NeMo Run can find them
|
|
6
|
+
from lattifai.cli.alignment import align
|
|
7
|
+
from lattifai.cli.caption import convert
|
|
8
|
+
from lattifai.cli.transcribe import transcribe, transcribe_align
|
|
9
|
+
from lattifai.cli.youtube import youtube
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"align",
|
|
13
|
+
"convert",
|
|
14
|
+
"transcribe",
|
|
15
|
+
"transcribe_align",
|
|
16
|
+
"youtube",
|
|
17
|
+
]
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Alignment CLI entry point with nemo_run."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import nemo_run as run
|
|
6
|
+
from lhotse.utils import Pathlike
|
|
7
|
+
from typing_extensions import Annotated
|
|
8
|
+
|
|
9
|
+
from lattifai.client import LattifAI
|
|
10
|
+
from lattifai.config import (
|
|
11
|
+
AlignmentConfig,
|
|
12
|
+
CaptionConfig,
|
|
13
|
+
ClientConfig,
|
|
14
|
+
DiarizationConfig,
|
|
15
|
+
MediaConfig,
|
|
16
|
+
TranscriptionConfig,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = ["align"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@run.cli.entrypoint(name="align", namespace="alignment")
def align(
    input_media: Optional[str] = None,
    input_caption: Optional[str] = None,
    output_caption: Optional[str] = None,
    media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
    caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
    client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
    transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
    diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
):
    """
    Align audio/video with a caption file.

    Performs forced alignment between a media file and caption text, producing
    accurate per-segment (and optionally word-level) timestamps. When the input
    media is an HTTP(S) URL, the request is routed to the YouTube
    download-and-align workflow instead.

    Shortcut: invoking ``lai-align`` is equivalent to running ``lai alignment align``.

    Args:
        input_media: Positional media path or URL (exclusive with media.input_path).
        input_caption: Positional caption path (sets caption.input_path).
        output_caption: Positional output path (exclusive with caption.output_path).
        media: Media configuration (input_path, media_format, sample_rate,
            channels, output_dir, output_path, output_format, prefer_audio,
            default_audio_format, default_video_format, force_overwrite).
        caption: Caption I/O configuration (input_format, input_path,
            output_format, output_path, normalize_text, split_sentence,
            word_level, include_speaker_in_text, encoding).
        client: API client configuration (api_key, timeout, max_retries,
            default_headers).
        alignment: Alignment configuration (model_name, device, batch_size).
        transcription: Transcription configuration (optional).
        diarization: Diarization configuration (optional).

    Raises:
        ValueError: If a positional path and its config-field equivalent are
            both given, or if no input media path is provided at all.

    Examples:
        # Basic usage with positional arguments
        lai alignment align audio.wav caption.srt output.srt

        # Mixing positional and keyword arguments
        lai alignment align audio.mp4 caption.srt output.json \\
            alignment.device=cuda caption.word_level=true

        # Keyword-only (traditional) syntax
        lai alignment align input_media=audio.wav input_caption=caption.srt \\
            output_caption=output.srt

        # Full configuration with nested config objects
        lai alignment align audio.wav caption.srt aligned.json \\
            media.output_dir=/tmp/output caption.split_sentence=true \\
            caption.word_level=true caption.normalize_text=true \\
            alignment.device=mps alignment.model_name=Lattifai/Lattice-1-Alpha
    """
    media_cfg = media or MediaConfig()

    # Positional input_media and media.input_path are mutually exclusive.
    if input_media and media_cfg.input_path:
        raise ValueError(
            "Cannot specify both positional input_media and media.input_path. "
            "Use either positional argument or config, not both."
        )
    if input_media:
        media_cfg.set_input_path(input_media)
    if not media_cfg.input_path:
        raise ValueError("Input media path must be specified via positional argument input_media= or media.input_path=")

    caption_cfg = caption or CaptionConfig()

    # Positional output_caption and caption.output_path are mutually exclusive.
    if output_caption and caption_cfg.output_path:
        raise ValueError(
            "Cannot specify both positional output_caption and caption.output_path. "
            "Use either positional argument or config, not both."
        )
    if input_caption:
        caption_cfg.set_input_path(input_caption)
    if output_caption:
        caption_cfg.set_output_path(output_caption)

    # Renamed from "client" to avoid shadowing the ClientConfig parameter.
    api = LattifAI(
        client_config=client,
        alignment_config=alignment,
        caption_config=caption_cfg,
        transcription_config=transcription,
        diarization_config=diarization,
    )

    if media_cfg.input_path.startswith(("http://", "https://")):
        # Remote URL: delegate to the YouTube download-and-align workflow.
        return api.youtube(
            url=media_cfg.input_path,
            output_dir=media_cfg.output_dir,
            output_caption_path=caption_cfg.output_path,
            media_format=media_cfg.normalize_format() if media_cfg.output_format else None,
            force_overwrite=media_cfg.force_overwrite,
            split_sentence=caption_cfg.split_sentence,
            channel_selector=media_cfg.channel_selector,
        )

    return api.alignment(
        input_media=media_cfg.input_path,
        input_caption=caption_cfg.input_path,
        output_caption_path=caption_cfg.output_path,
        split_sentence=caption_cfg.split_sentence,
        channel_selector=media_cfg.channel_selector,
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def main():
    """Standalone console-script entry point for the ``align`` command."""
    run.cli.main(align)


if __name__ == "__main__":
    main()
|