lattifai 0.4.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. lattifai/__init__.py +61 -47
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/alignment/lattice1_worker.py +185 -0
  5. lattifai/{tokenizer → alignment}/phonemizer.py +4 -4
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +244 -169
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/gemini_reader.py +30 -30
  12. lattifai/{io → caption}/gemini_writer.py +17 -17
  13. lattifai/{io → caption}/supervision.py +4 -3
  14. lattifai/caption/text_parser.py +145 -0
  15. lattifai/cli/__init__.py +17 -0
  16. lattifai/cli/alignment.py +153 -0
  17. lattifai/cli/caption.py +204 -0
  18. lattifai/cli/server.py +19 -0
  19. lattifai/cli/transcribe.py +197 -0
  20. lattifai/cli/youtube.py +128 -0
  21. lattifai/client.py +460 -251
  22. lattifai/config/__init__.py +20 -0
  23. lattifai/config/alignment.py +73 -0
  24. lattifai/config/caption.py +178 -0
  25. lattifai/config/client.py +46 -0
  26. lattifai/config/diarization.py +67 -0
  27. lattifai/config/media.py +335 -0
  28. lattifai/config/transcription.py +84 -0
  29. lattifai/diarization/__init__.py +5 -0
  30. lattifai/diarization/lattifai.py +89 -0
  31. lattifai/errors.py +98 -91
  32. lattifai/logging.py +116 -0
  33. lattifai/mixin.py +552 -0
  34. lattifai/server/app.py +420 -0
  35. lattifai/transcription/__init__.py +76 -0
  36. lattifai/transcription/base.py +108 -0
  37. lattifai/transcription/gemini.py +219 -0
  38. lattifai/transcription/lattifai.py +103 -0
  39. lattifai/{workflows → transcription}/prompts/__init__.py +4 -4
  40. lattifai/types.py +30 -0
  41. lattifai/utils.py +16 -44
  42. lattifai/workflow/__init__.py +22 -0
  43. lattifai/workflow/agents.py +6 -0
  44. lattifai/{workflows → workflow}/base.py +22 -22
  45. lattifai/{workflows → workflow}/file_manager.py +239 -215
  46. lattifai/workflow/youtube.py +564 -0
  47. lattifai-1.0.0.dist-info/METADATA +736 -0
  48. lattifai-1.0.0.dist-info/RECORD +52 -0
  49. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  50. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  51. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +1 -1
  52. lattifai/base_client.py +0 -126
  53. lattifai/bin/__init__.py +0 -3
  54. lattifai/bin/agent.py +0 -325
  55. lattifai/bin/align.py +0 -296
  56. lattifai/bin/cli_base.py +0 -25
  57. lattifai/bin/subtitle.py +0 -210
  58. lattifai/io/__init__.py +0 -42
  59. lattifai/io/reader.py +0 -85
  60. lattifai/io/text_parser.py +0 -75
  61. lattifai/io/utils.py +0 -15
  62. lattifai/io/writer.py +0 -90
  63. lattifai/tokenizer/__init__.py +0 -3
  64. lattifai/workers/__init__.py +0 -3
  65. lattifai/workers/lattice1_alpha.py +0 -284
  66. lattifai/workflows/__init__.py +0 -34
  67. lattifai/workflows/agents.py +0 -10
  68. lattifai/workflows/gemini.py +0 -167
  69. lattifai/workflows/prompts/README.md +0 -22
  70. lattifai/workflows/prompts/gemini/README.md +0 -24
  71. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  72. lattifai/workflows/youtube.py +0 -931
  73. lattifai-0.4.5.dist-info/METADATA +0 -808
  74. lattifai-0.4.5.dist-info/RECORD +0 -39
  75. lattifai-0.4.5.dist-info/entry_points.txt +0 -3
  76. {lattifai-0.4.5.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/{io → caption}/gemini_reader.py
@@ -1,9 +1,9 @@
  """Reader for YouTube transcript files with speaker labels and timestamps."""

  import re
- from dataclasses import dataclass, field
+ from dataclasses import dataclass
  from pathlib import Path
- from typing import List, Optional, Tuple
+ from typing import List, Optional

  from lhotse.utils import Pathlike

@@ -18,7 +18,7 @@ class GeminiSegment:
  timestamp: Optional[float] = None
  speaker: Optional[str] = None
  section: Optional[str] = None
- segment_type: str = 'dialogue' # 'dialogue', 'event', or 'section_header'
+ segment_type: str = "dialogue" # 'dialogue', 'event', or 'section_header'
  line_number: int = 0

  @property
@@ -31,15 +31,15 @@ class GeminiReader:
  """Parser for YouTube transcript format with speaker labels and timestamps."""

  # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
- TIMESTAMP_PATTERN = re.compile(r'\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]')
- SECTION_HEADER_PATTERN = re.compile(r'^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$')
- SPEAKER_PATTERN = re.compile(r'^\*\*(.+?[::])\*\*\s*(.+)$')
- EVENT_PATTERN = re.compile(r'^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$')
- INLINE_TIMESTAMP_PATTERN = re.compile(r'^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$')
+ TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
+ SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
+ SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
+ EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+ INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")

  # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
- YOUTUBE_SECTION_PATTERN = re.compile(r'^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$')
- YOUTUBE_INLINE_PATTERN = re.compile(r'^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$')
+ YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
+ YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")

  @classmethod
  def parse_timestamp(cls, *args) -> float:
@@ -61,7 +61,7 @@ class GeminiReader:
  # Direct seconds (from YouTube &t= parameter)
  return int(args[0])
  else:
- raise ValueError(f'Invalid timestamp args: {args}')
+ raise ValueError(f"Invalid timestamp args: {args}")

  @classmethod
  def read(
@@ -82,13 +82,13 @@ class GeminiReader:
  """
  transcript_path = Path(transcript_path).expanduser().resolve()
  if not transcript_path.exists():
- raise FileNotFoundError(f'Transcript file not found: {transcript_path}')
+ raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

  segments: List[GeminiSegment] = []
  current_section = None
  current_speaker = None

- with open(transcript_path, 'r', encoding='utf-8') as f:
+ with open(transcript_path, "r", encoding="utf-8") as f:
  lines = f.readlines()

  for line_num, line in enumerate(lines, start=1):
@@ -97,9 +97,9 @@
  continue

  # Skip table of contents
- if line.startswith('* ['):
+ if line.startswith("* ["):
  continue
- if line.startswith('## Table of Contents'):
+ if line.startswith("## Table of Contents"):
  continue

  # Parse section headers
@@ -114,7 +114,7 @@
  text=section_title.strip(),
  timestamp=timestamp,
  section=current_section,
- segment_type='section_header',
+ segment_type="section_header",
  line_number=line_num,
  )
  )
@@ -133,7 +133,7 @@
  text=section_title.strip(),
  timestamp=timestamp,
  section=current_section,
- segment_type='section_header',
+ segment_type="section_header",
  line_number=line_num,
  )
  )
@@ -158,7 +158,7 @@
  text=event_text.strip(),
  timestamp=timestamp,
  section=current_section,
- segment_type='event',
+ segment_type="event",
  line_number=line_num,
  )
  )
@@ -200,7 +200,7 @@
  timestamp=timestamp,
  speaker=current_speaker,
  section=current_section,
- segment_type='dialogue',
+ segment_type="dialogue",
  line_number=line_num,
  )
  )
@@ -228,7 +228,7 @@
  timestamp=timestamp,
  speaker=current_speaker,
  section=current_section,
- segment_type='dialogue',
+ segment_type="dialogue",
  line_number=line_num,
  )
  )
@@ -246,14 +246,14 @@
  timestamp=timestamp,
  speaker=current_speaker,
  section=current_section,
- segment_type='dialogue',
+ segment_type="dialogue",
  line_number=line_num,
  )
  )
  continue

  # Skip markdown headers and other formatting
- if line.startswith('#'):
+ if line.startswith("#"):
  continue

  return segments
@@ -283,10 +283,10 @@
  segments = cls.read(transcript_path, include_events=False, include_sections=False)

  # Filter to only dialogue segments with timestamps
- dialogue_segments = [s for s in segments if s.segment_type == 'dialogue' and s.timestamp is not None]
+ dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]

  if not dialogue_segments:
- raise ValueError(f'No dialogue segments with timestamps found in {transcript_path}')
+ raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")

  # Sort by timestamp
  dialogue_segments.sort(key=lambda x: x.timestamp)
@@ -308,7 +308,7 @@
  text=segment.text,
  start=segment.timestamp,
  duration=max(duration, min_duration),
- id=f'segment_{i:05d}',
+ id=f"segment_{i:05d}",
  speaker=segment.speaker,
  )
  )
@@ -337,13 +337,13 @@
  else:
  # Different speaker or gap too large, save previous segment
  if current_texts:
- merged_text = ' '.join(current_texts)
+ merged_text = " ".join(current_texts)
  merged.append(
  Supervision(
  text=merged_text,
  start=current_start,
  duration=last_end_time - current_start,
- id=f'merged_{len(merged):05d}',
+ id=f"merged_{len(merged):05d}",
  )
  )
  current_speaker = segment.speaker
@@ -353,13 +353,13 @@

  # Add final segment
  if current_texts:
- merged_text = ' '.join(current_texts)
+ merged_text = " ".join(current_texts)
  merged.append(
  Supervision(
  text=merged_text,
  start=current_start,
  duration=last_end_time - current_start,
- id=f'merged_{len(merged):05d}',
+ id=f"merged_{len(merged):05d}",
  )
  )

@@ -368,4 +368,4 @@
  return supervisions


- __all__ = ['GeminiReader', 'GeminiSegment']
+ __all__ = ["GeminiReader", "GeminiSegment"]
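
The reader's TIMESTAMP_PATTERN accepts both [HH:MM:SS] and [MM:SS] stamps. As a standalone illustration (the regex is copied from the hunk above; to_seconds is a hypothetical helper, not the package's parse_timestamp), the match groups convert to seconds like this:

import re

TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")

def to_seconds(match: re.Match) -> int:
    """Convert a matched [HH:MM:SS] or [MM:SS] stamp into seconds."""
    h, m, s, m2, s2 = match.groups()
    if h is not None:  # [HH:MM:SS] branch
        return int(h) * 3600 + int(m) * 60 + int(s)
    return int(m2) * 60 + int(s2)  # [MM:SS] branch

for line in ["Intro [00:01:05]", "Recap [02:30]"]:
    m = TIMESTAMP_PATTERN.search(line)
    if m:
        print(line, "->", to_seconds(m), "seconds")
# Expected output: 65 seconds and 150 seconds respectively
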
lattifai/{io → caption}/gemini_writer.py
@@ -19,7 +19,7 @@ class GeminiWriter:
  hours = int(seconds // 3600)
  minutes = int((seconds % 3600) // 60)
  secs = int(seconds % 60)
- return f'[{hours:02d}:{minutes:02d}:{secs:02d}]'
+ return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

  @classmethod
  def update_timestamps(
@@ -44,7 +44,7 @@
  output_path = Path(output_path)

  # Read original file
- with open(original_path, 'r', encoding='utf-8') as f:
+ with open(original_path, "r", encoding="utf-8") as f:
  lines = f.readlines()

  # Parse original segments to get line numbers
@@ -66,7 +66,7 @@

  # Write updated content
  output_path.parent.mkdir(parents=True, exist_ok=True)
- with open(output_path, 'w', encoding='utf-8') as f:
+ with open(output_path, "w", encoding="utf-8") as f:
  f.writelines(updated_lines)

  return output_path
@@ -83,7 +83,7 @@
  mapping = {}

  # Create a simple text-based matching
- dialogue_segments = [s for s in original_segments if s.segment_type == 'dialogue']
+ dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]

  # Try to match based on text content
  for aligned_sup in aligned_supervisions:
@@ -120,7 +120,7 @@

  # Replace timestamp patterns
  # Pattern 1: [HH:MM:SS] at the end or in brackets
- line = re.sub(r'\[\d{2}:\d{2}:\d{2}\]', new_ts_str, line)
+ line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)

  return line

@@ -146,28 +146,28 @@
  output_path = Path(output_path)
  output_path.parent.mkdir(parents=True, exist_ok=True)

- with open(output_path, 'w', encoding='utf-8') as f:
- f.write('# Aligned Transcript\n\n')
+ with open(output_path, "w", encoding="utf-8") as f:
+ f.write("# Aligned Transcript\n\n")

  for i, sup in enumerate(aligned_supervisions):
  # Write segment with timestamp
  start_ts = cls.format_timestamp(sup.start)
- f.write(f'{start_ts} {sup.text}\n')
+ f.write(f"{start_ts} {sup.text}\n")

  # Optionally write word-level timestamps
- if include_word_timestamps and hasattr(sup, 'alignment') and sup.alignment:
- if 'word' in sup.alignment:
- f.write(' Words: ')
+ if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
+ if "word" in sup.alignment:
+ f.write(" Words: ")
  word_parts = []
- for word_info in sup.alignment['word']:
- word_ts = cls.format_timestamp(word_info['start'])
+ for word_info in sup.alignment["word"]:
+ word_ts = cls.format_timestamp(word_info["start"])
  word_parts.append(f'{word_info["symbol"]}{word_ts}')
- f.write(' '.join(word_parts))
- f.write('\n')
+ f.write(" ".join(word_parts))
+ f.write("\n")

- f.write('\n')
+ f.write("\n")

  return output_path


- __all__ = ['GeminiWriter']
+ __all__ = ["GeminiWriter"]
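
For reference, the seconds-to-[HH:MM:SS] conversion in format_timestamp can be reproduced standalone; this is a sketch mirroring the arithmetic visible in the first hunk above, not an import from the package:

def format_timestamp(seconds: float) -> str:
    # Mirrors GeminiWriter.format_timestamp as shown in the diff above
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

print(format_timestamp(3725.4))  # -> [01:02:05]
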
lattifai/{io → caption}/supervision.py
@@ -24,10 +24,11 @@ class Supervision(SupervisionSegment):
  """

  text: Optional[str] = None
- id: str = ''
- recording_id: str = ''
+ speaker: Optional[str] = None
+ id: str = ""
+ recording_id: str = ""
  start: Seconds = 0.0
  duration: Seconds = 0.0


- __all__ = ['Supervision']
+ __all__ = ["Supervision"]
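
The new speaker field lets callers carry a speaker label alongside text and timing, which is how the reader above builds its segments. A hedged construction sketch (field names come from this diff; the import path follows the file list, and any extra requirements from lhotse's SupervisionSegment are not shown here):

from lattifai.caption.supervision import Supervision  # path per the file list above

# Placeholder values; mirrors how gemini_reader.py constructs segments.
sup = Supervision(
    id="segment_00001",
    text="Hello world",
    start=12.5,      # seconds
    duration=3.2,    # seconds
    speaker="SPEAKER_01",
)
print(sup.speaker, sup.start, sup.duration)
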
lattifai/caption/text_parser.py (new file)
@@ -0,0 +1,145 @@
+ import logging
+ import re
+ from typing import Optional, Tuple
+
+ # Timestamp pattern: [start-end] text
+ # Example: [1.23-4.56] Hello world
+ TIMESTAMP_PATTERN = re.compile(r"^\[([\d.]+)-([\d.]+)\]\s*(.*)$")
+
+ # Speaker label formats commonly seen in captions
+ SPEAKER_PATTERN = re.compile(r"((?:>>|>>|>|>).*?[::])\s*(.*)")
+
+ # Transcriber Output Example:
+ # 26:19.919 --> 26:34.921
+ # [SPEAKER_01]: 越来越多的科技巨头入...
+ SPEAKER_LATTIFAI = re.compile(r"(^\[SPEAKER_.*?\][::])\s*(.*)")
+
+ # NISHTHA BHATIA: Hey, everyone.
+ # DIETER: Oh, hey, Nishtha.
+ # GEMINI: That might
+ SPEAKER_PATTERN2 = re.compile(r"^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[::])\s*(.*)$")
+
+
+ def normalize_text(text: str) -> str:
+ """Normalize caption text by:
+ - Decoding common HTML entities
+ - Removing HTML tags (e.g., <i>, <font>, <b>, <br>)
+ - Collapsing multiple whitespace into a single space
+ - Converting curly apostrophes to straight ones in common contractions
+ """
+ if not text:
+ return ""
+
+ # # Remove HTML tags first (replace with space to avoid concatenation)
+ # text = re.sub(r"<[^>]+>", " ", text)
+
+ html_entities = {
+ "&amp;": "&",
+ "&lt;": "<",
+ "&gt;": ">",
+ "&quot;": '"',
+ "&#39;": "'",
+ "&nbsp;": " ",
+ "\\N": " ",
+ "…": " ", # replace ellipsis with space to avoid merging words
+ }
+ for entity, char in html_entities.items():
+ text = text.replace(entity, char)
+
+ # Convert curly apostrophes to straight apostrophes for common English contractions
+ text = re.sub(r"([a-zA-Z])’([tsdm]|ll|re|ve)\b", r"\1'\2", text, flags=re.IGNORECASE)
+ text = re.sub(r"([0-9])’([s])\b", r"\1'\2", text, flags=re.IGNORECASE)
+
+ # Collapse whitespace (after replacements)
+ text = re.sub(r"\s+", " ", text)
+
+ return text.strip()
+
+
+ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
+ """Parse a line of text to extract speaker and content."""
+
+ if ":" not in line and ":" not in line:
+ return None, line
+
+ # Match lines starting with ">>" and strip the leading name and colon
+ match = SPEAKER_PATTERN.match(line)
+ if match:
+ return match.group(1).strip(), match.group(2).strip()
+
+ match = SPEAKER_LATTIFAI.match(line)
+ if match:
+ assert len(match.groups()) == 2, match.groups()
+ if not match.group(1):
+ logging.error(f"ParseSub LINE [{line}]")
+ else:
+ return match.group(1).strip(), match.group(2).strip()
+
+ match = SPEAKER_PATTERN2.match(line)
+ if match:
+ assert len(match.groups()) == 2, match.groups()
+ return match.group(1).strip(), match.group(2).strip()
+
+ return None, line
+
+
+ def parse_timestamp_text(line: str) -> Tuple[Optional[float], Optional[float], str]:
+ """
+ Parse a line of text to extract timestamp and content.
+
+ Format: [start-end] text
+ Example: [1.23-4.56] Hello world
+
+ Args:
+ line: Input line to parse
+
+ Returns:
+ Tuple of (start_time, end_time, text)
+ - start_time: Start timestamp in seconds, or None if not found
+ - end_time: End timestamp in seconds, or None if not found
+ - text: The text content after the timestamp
+ """
+ match = TIMESTAMP_PATTERN.match(line)
+ if match:
+ try:
+ start = float(match.group(1))
+ end = float(match.group(2))
+ text = match.group(3).strip()
+ return start, end, text
+ except ValueError:
+ # If conversion fails, treat as plain text
+ return None, None, line
+
+ return None, None, line
+
+
+ if __name__ == "__main__":
+ pattern = re.compile(r">>\s*(.*?)\s*[::]\s*(.*)")
+ pattern = re.compile(r"(>>.*?[::])\s*(.*)")
+
+ test_strings = [
+ ">>Key: Value",
+ ">> Key with space : Value with space ",
+ ">> 全角键 : 全角值",
+ ">>Key:Value xxx. >>Key:Value",
+ ]
+
+ for text in test_strings:
+ match = pattern.match(text)
+ if match:
+ print(f"Input: '{text}'")
+ print(f"Speaker: '{match.group(1)}'")
+ print(f"Content: '{match.group(2)}'")
+ print("-------------")
+
+ # pattern2
+ test_strings2 = ["NISHTHA BHATIA: Hey, everyone.", "DIETER: Oh, hey, Nishtha.", "GEMINI: That might"]
+ for text in test_strings2:
+ match = SPEAKER_PATTERN2.match(text)
+ if match:
+ print(f" Input: '{text}'")
+ print(f"Speaker: '{match.group(1)}'")
+ print(f"Content: '{match.group(2)}'")
+ print("-------------")
+ else:
+ raise ValueError(f"No match for: '{text}'")
1
+ """CLI module for LattifAI with nemo_run entry points."""
2
+
3
+ import nemo_run as run # noqa: F401
4
+
5
+ # Import and re-export entrypoints at package level so NeMo Run can find them
6
+ from lattifai.cli.alignment import align
7
+ from lattifai.cli.caption import convert
8
+ from lattifai.cli.transcribe import transcribe, transcribe_align
9
+ from lattifai.cli.youtube import youtube
10
+
11
+ __all__ = [
12
+ "align",
13
+ "convert",
14
+ "transcribe",
15
+ "transcribe_align",
16
+ "youtube",
17
+ ]
lattifai/cli/alignment.py (new file)
@@ -0,0 +1,153 @@
+ """Alignment CLI entry point with nemo_run."""
+
+ from typing import Optional
+
+ import nemo_run as run
+ from lhotse.utils import Pathlike
+ from typing_extensions import Annotated
+
+ from lattifai.client import LattifAI
+ from lattifai.config import (
+ AlignmentConfig,
+ CaptionConfig,
+ ClientConfig,
+ DiarizationConfig,
+ MediaConfig,
+ TranscriptionConfig,
+ )
+
+ __all__ = ["align"]
+
+
+ @run.cli.entrypoint(name="align", namespace="alignment")
+ def align(
+ input_media: Optional[str] = None,
+ input_caption: Optional[str] = None,
+ output_caption: Optional[str] = None,
+ media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
+ caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
+ client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
+ alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
+ transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
+ diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+ ):
+ """
+ Align audio/video with caption file.
+
+ This command performs forced alignment between audio/video media and caption text,
+ generating accurate timestamps for each caption segment and optionally word-level
+ timestamps. The alignment engine uses advanced speech recognition models to ensure
+ precise synchronization between audio and text.
+
+ Shortcut: invoking ``lai-align`` is equivalent to running ``lai alignment align``.
+
+ Args:
+ media: Media configuration for audio/video input and output handling.
+ Fields: input_path, media_format, sample_rate, channels, output_dir,
+ output_path, output_format, prefer_audio, default_audio_format,
+ default_video_format, force_overwrite
+ client: API client configuration.
+ Fields: api_key, timeout, max_retries, default_headers
+ alignment: Alignment configuration (model selection and inference settings).
+ Fields: model_name, device, batch_size
+ caption: Caption I/O configuration (file reading/writing and formatting).
+ Fields: input_format, input_path, output_format, output_path,
+ normalize_text, split_sentence, word_level,
+ include_speaker_in_text, encoding
+
+ Examples:
+ # Basic usage with positional arguments
+ lai alignment align audio.wav caption.srt output.srt
+
+ # Mixing positional and keyword arguments
+ lai alignment align audio.mp4 caption.srt output.json \\
+ alignment.device=cuda \\
+ caption.word_level=true
+
+ # Smart sentence splitting with custom output format
+ lai alignment align audio.wav caption.srt output.vtt \\
+ caption.split_sentence=true
+
+ # Using keyword arguments (traditional syntax)
+ lai alignment align \\
+ input_media=audio.wav \\
+ input_caption=caption.srt \\
+ output_caption=output.srt
+
+ # Full configuration with nested config objects
+ lai alignment align audio.wav caption.srt aligned.json \\
+ media.output_dir=/tmp/output \\
+ caption.split_sentence=true \\
+ caption.word_level=true \\
+ caption.normalize_text=true \\
+ alignment.device=mps \\
+ alignment.model_name=Lattifai/Lattice-1-Alpha
+ """
+ media_config = media or MediaConfig()
+
+ # Validate that input_media and media_config.input_path are not both provided
+ if input_media and media_config.input_path:
+ raise ValueError(
+ "Cannot specify both positional input_media and media.input_path. "
+ "Use either positional argument or config, not both."
+ )
+
+ # Assign input_media to media_config.input_path if provided
+ if input_media:
+ media_config.set_input_path(input_media)
+
+ if not media_config.input_path:
+ raise ValueError("Input media path must be specified via positional argument input_media= or media.input_path=")
+
+ caption_config = caption or CaptionConfig()
+
+ # Validate that output_caption_path and caption_config.output_path are not both provided
+ if output_caption and caption_config.output_path:
+ raise ValueError(
+ "Cannot specify both positional output_caption and caption.output_path. "
+ "Use either positional argument or config, not both."
+ )
+
+ # Assign paths to caption_config if provided
+ if input_caption:
+ caption_config.set_input_path(input_caption)
+
+ if output_caption:
+ caption_config.set_output_path(output_caption)
+
+ client = LattifAI(
+ client_config=client,
+ alignment_config=alignment,
+ caption_config=caption_config,
+ transcription_config=transcription,
+ diarization_config=diarization,
+ )
+
+ is_url = media_config.input_path.startswith(("http://", "https://"))
+ if is_url:
+ # Call the client's youtube method
+ return client.youtube(
+ url=media_config.input_path,
+ output_dir=media_config.output_dir,
+ output_caption_path=caption_config.output_path,
+ media_format=media_config.normalize_format() if media_config.output_format else None,
+ force_overwrite=media_config.force_overwrite,
+ split_sentence=caption_config.split_sentence,
+ channel_selector=media_config.channel_selector,
+ )
+
+ return client.alignment(
+ input_media=media_config.input_path,
+ input_caption=caption_config.input_path,
+ output_caption_path=caption_config.output_path,
+ split_sentence=caption_config.split_sentence,
+ channel_selector=media_config.channel_selector,
+ )
+
+
+ def main():
+ run.cli.main(align)
+
+
+ if __name__ == "__main__":
+ main()
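
The entrypoint above is a thin wrapper over the LattifAI client: it resolves the config objects and then calls client.alignment() (or client.youtube() for URL inputs). A rough sketch of the equivalent direct call for a local file, mirroring the non-URL branch above; the file names are placeholders, and the no-argument config constructors are assumed to work as they do inside align():

from lattifai.client import LattifAI
from lattifai.config import CaptionConfig, MediaConfig

# Placeholder paths; mirrors the non-URL branch of align() shown above.
media_config = MediaConfig()
media_config.set_input_path("audio.wav")

caption_config = CaptionConfig()
caption_config.set_input_path("caption.srt")
caption_config.set_output_path("aligned.srt")

client = LattifAI(
    client_config=None,
    alignment_config=None,
    caption_config=caption_config,
    transcription_config=None,
    diarization_config=None,
)

result = client.alignment(
    input_media=media_config.input_path,
    input_caption=caption_config.input_path,
    output_caption_path=caption_config.output_path,
    split_sentence=caption_config.split_sentence,
    channel_selector=media_config.channel_selector,
)
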