lattifai-1.2.1-py3-none-any.whl → lattifai-1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
lattifai/config/event.py ADDED
@@ -0,0 +1,102 @@
+"""Audio Event Detection configuration for LattifAI."""
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional
+
+from ..utils import _select_device
+
+if TYPE_CHECKING:
+    from ..client import SyncAPIClient
+
+
+@dataclass
+class EventConfig:
+    """
+    Audio Event Detection configuration.
+
+    Settings for detecting audio events (Speech, Music, Male, Female, ...) in audio files using the AED model.
+
+    Event Matching:
+        When event_matching is enabled, the AED system will:
+        1. Parse [Event] markers from input captions (e.g., [Music], [Applause])
+        2. Match caption events to AED labels using semantic matching
+        3. Force detection of matched labels even if not in top_k
+        4. Update caption timestamps based on AED detection results
+
+    Event matching logic is implemented in lattifai_core.event.EventMatcher.
+    """
+
+    enabled: bool = False
+    """Enable audio event detection."""
+
+    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
+    """Computation device for Event Detection models."""
+
+    vad_chunk_size: float = 30.0
+    """VAD chunk size in seconds for speech segmentation."""
+
+    vad_max_gap: float = 2.0
+    """Maximum gap in seconds between VAD segments to merge."""
+
+    fast_mode: bool = True
+    """Enable fast mode (only detect top_k classes, skip others)."""
+
+    model_path: str = ""
+    """Path to pretrained model. If empty, uses the default bundled model."""
+
+    event_matching: bool = True
+    """Whether to update events in the alignment."""
+
+    extra_events: List[str] = field(default_factory=list)
+    """Additional event types to always detect, even if not in top_k.
+    Example: ["Applause", "Laughter", "Music"]
+    """
+
+    event_aliases: Dict[str, List[str]] = field(default_factory=dict)
+    """Custom aliases mapping [Event] markers to AED labels.
+
+    Core AED labels (14 types):
+        [Applause], [Baby cry], [Battle cry], [Bellow], [Children shouting],
+        [Laughter], [Music], [Shout], [Singing], [Sound effect],
+        [Speech], [Whoop], [Yell]
+
+    Custom aliases extend the built-ins (they do not replace them):
+        {"[Audience reaction]": ["[Applause]", "[Cheering]"]}
+    """
+
+    time_tolerance: float = 20.0
+    """Max time (seconds) non-Speech events can extend beyond supervision boundaries."""
+
+    update_timestamps: bool = True
+    """Whether to update caption event timestamps based on AED detections."""
+
+    duplicate_strategy: Literal["keep_all", "merge_first", "split"] = "merge_first"
+    """Strategy for handling multiple [Event] markers mapped to the same AED interval.
+    - keep_all: Update all events to the same time range (may cause overlapping)
+    - merge_first: Keep only the first event per interval, skip duplicates
+    - split: Split the interval at speech boundaries (not yet implemented)
+    """
+
+    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
+    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""
+
+    def __post_init__(self):
+        """Validate and auto-populate configuration after initialization."""
+        # Validate device
+        if self.device not in ("cpu", "cuda", "mps", "auto") and not self.device.startswith("cuda:"):
+            raise ValueError(f"device must be one of ('cpu', 'cuda', 'mps', 'auto'), got '{self.device}'")
+
+        if self.device == "auto":
+            self.device = _select_device(self.device)
+
+        # Validate vad_chunk_size
+        if self.vad_chunk_size < 0:
+            raise ValueError("vad_chunk_size must be non-negative")
+
+        # Validate vad_max_gap
+        if self.vad_max_gap < 0:
+            raise ValueError("vad_max_gap must be non-negative")
+
+        # Validate time_tolerance
+        if self.time_tolerance < 0:
+            raise ValueError("time_tolerance must be non-negative")
lattifai/config/media.py CHANGED
@@ -91,6 +91,26 @@ class MediaConfig:
     force_overwrite: bool = False
     """Overwrite existing output files without prompting."""
 
+    audio_track_id: Optional[str] = "original"
+    """Audio track ID for multi-language YouTube videos.
+    - "original": Select the original audio track (default)
+    - Language code (e.g., "en", "ja", "fr"): Select by language
+    - Format ID (e.g., "251-drc", "140-0"): Select specific format
+    - None: No filtering, use yt-dlp default selection
+    """
+
+    quality: str = "best"
+    """Media quality for YouTube downloads.
+    For audio:
+    - "best": Highest bitrate (default)
+    - "medium": ~128 kbps
+    - "low": ~50 kbps
+    - Numeric string (e.g., "128"): Target bitrate in kbps
+    For video:
+    - "best": Highest resolution (default)
+    - "1080", "720", "480", "360": Target resolution
+    """
+
     def __post_init__(self) -> None:
         """Validate configuration and normalize paths/formats."""
         self._setup_output_directory()
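A short usage sketch for the two new MediaConfig fields. It assumes the MediaConfig fields not shown in this hunk all have defaults.

# Sketch only: pick a non-default audio track and bitrate for a YouTube download.
from lattifai.config.media import MediaConfig

media_config = MediaConfig(
    audio_track_id="ja",   # select the Japanese track instead of "original"
    quality="128",         # numeric string = target bitrate in kbps
)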
lattifai/config/transcription.py CHANGED
@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Literal, Optional
 from ..utils import _select_device
 
 if TYPE_CHECKING:
-    from ..base_client import SyncAPIClient
+    from ..client import SyncAPIClient
 
 SUPPORTED_TRANSCRIPTION_MODELS = Literal[
     "gemini-2.5-pro",
@@ -48,6 +48,30 @@ class TranscriptionConfig:
     language: Optional[str] = None
     """Target language code for transcription (e.g., 'en', 'zh', 'ja')."""
 
+    prompt: Optional[str] = None
+    """Custom prompt text or path to a prompt file for transcription.
+    If the value is an existing file path, the file contents will be used.
+    Otherwise, the value is used directly as the prompt text."""
+
+    description: Optional[str] = None
+    """Media description from platforms like YouTube, Xiaoyuzhou (小宇宙), etc.
+    Used to provide context for transcription."""
+
+    thinking: bool = True
+    """Enable Gemini's thinking mode (Gemini models only). Set to False to disable thinking."""
+
+    include_thoughts: bool = False
+    """Include Gemini's thinking process in the output (Gemini models only). Requires thinking=True."""
+
+    temperature: Optional[float] = None
+    """Sampling temperature for generation. Higher values increase randomness."""
+
+    top_k: Optional[float] = None
+    """Top-k sampling parameter. Limits token selection to the top k candidates."""
+
+    top_p: Optional[float] = None
+    """Nucleus sampling parameter. Limits token selection by cumulative probability."""
+
     lattice_model_path: Optional[str] = None
     """Path to local LattifAI model. Will be auto-set in LattifAI client."""
 
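The added fields are mostly Gemini generation knobs. A hedged sketch of how they might be combined, assuming the TranscriptionConfig fields not shown in these hunks keep their defaults:

# Sketch only: Gemini-oriented transcription settings added in 1.3.0.
from lattifai.config.transcription import TranscriptionConfig

config = TranscriptionConfig(
    language="en",
    prompt="prompts/podcast.txt",    # if this path exists, the file contents become the prompt
    description="Episode 42: interview about forced alignment",  # platform description as context
    thinking=True,                   # Gemini thinking mode
    include_thoughts=False,          # only meaningful when thinking=True
    temperature=0.2,
)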
lattifai/data/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""Data types for LattifAI.
+
+Provides the extended Caption class with transcription/alignment/diarization support.
+"""
+
+from .caption import Caption
+
+__all__ = ["Caption"]
lattifai/data/caption.py ADDED
@@ -0,0 +1,228 @@
+"""Extended Caption class with transcription, alignment, and diarization support."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar
+
+from lattifai.caption import Caption as BaseCaption
+from lattifai.caption import Pathlike, Supervision
+
+if TYPE_CHECKING:
+    from lattifai_core.event import LEDOutput
+
+DiarizationOutput = TypeVar("DiarizationOutput")
+
+
+@dataclass
+class Caption(BaseCaption):
+    """
+    Extended Caption with transcription, alignment, and diarization support.
+
+    Inherits from BaseCaption and adds fields for:
+    - alignments: Post-alignment results
+    - transcription: ASR results
+    - event: LattifAI Event Detection results (LEDOutput)
+    - diarization: Speaker diarization results
+
+
+    These fields are used in the LattifAI pipeline for:
+    - Forced alignment results
+    - Storing intermediate transcription results
+    - LattifAI Event Detection (music, applause, speech, etc.)
+    - Speaker identification and separation
+
+    """
+
+    # Alignment results
+    alignments: List[Supervision] = field(default_factory=list)
+
+    # Transcription results
+    transcription: List[Supervision] = field(default_factory=list)
+
+    # LattifAI Event Detection results
+    event: Optional["LEDOutput"] = None
+
+    # Speaker Diarization results
+    diarization: Optional[DiarizationOutput] = None
+
+    def __len__(self) -> int:
+        """Return the number of supervision segments."""
+        return len(self.supervisions or self.transcription)
+
+    def __repr__(self) -> str:
+        """String representation of Caption."""
+        lang = f"lang={self.language}" if self.language else "lang=unknown"
+        kind_str = f"kind={self.kind}" if self.kind else ""
+        parts = [f"Caption({len(self.supervisions or self.transcription)} segments", lang]
+        if kind_str:
+            parts.append(kind_str)
+        if self.duration:
+            parts.append(f"duration={self.duration:.2f}s")
+        return ", ".join(parts) + ")"
+
+    def with_margins(
+        self,
+        start_margin: float = 0.08,
+        end_margin: float = 0.20,
+        min_gap: float = 0.08,
+        collision_mode: str = "trim",
+    ) -> "Caption":
+        """
+        Create a new Caption with segment boundaries adjusted based on word-level alignment.
+
+        Uses supervision.alignment['word'] to recalculate segment start/end times
+        with the specified margins applied around the actual speech boundaries.
+
+        Prefers alignments > supervisions > transcription as source.
+
+        Args:
+            start_margin: Seconds to extend before the first word (default: 0.08)
+            end_margin: Seconds to extend after the last word (default: 0.20)
+            min_gap: Minimum gap between segments for collision handling (default: 0.08)
+            collision_mode: How to handle segment overlap - 'trim' or 'gap' (default: 'trim')
+
+        Returns:
+            New Caption instance with adjusted timestamps
+
+        Note:
+            Segments without alignment data will keep their original timestamps.
+        """
+        from lattifai.caption.standardize import apply_margins_to_captions
+
+        # Determine which supervisions to use (priority: alignments > supervisions > transcription)
+        if self.alignments:
+            source_sups = self.alignments
+        elif self.supervisions:
+            source_sups = self.supervisions
+        else:
+            source_sups = self.transcription
+
+        adjusted_sups = apply_margins_to_captions(
+            source_sups,
+            start_margin=start_margin,
+            end_margin=end_margin,
+            min_gap=min_gap,
+            collision_mode=collision_mode,
+        )
+
+        return Caption(
+            supervisions=adjusted_sups,
+            transcription=self.transcription,
+            event=self.event,
+            diarization=self.diarization,
+            alignments=[],  # Clear alignments since we've applied them
+            language=self.language,
+            kind=self.kind,
+            source_format=self.source_format,
+            source_path=self.source_path,
+            metadata=self.metadata.copy() if self.metadata else {},
+        )
+
+    def write(
+        self,
+        path=None,
+        output_format: Optional[str] = None,
+        include_speaker_in_text: bool = True,
+        word_level: bool = False,
+        karaoke_config=None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Write caption to file or return as bytes.
+
+        Prefers alignments > supervisions > transcription as source.
+
+        Args:
+            path: Path to output caption file, BytesIO object, or None to return bytes
+            output_format: Output format (e.g., 'srt', 'vtt', 'ass')
+            include_speaker_in_text: Whether to include speaker labels in text
+            word_level: Use word-level output format if supported
+            karaoke_config: Karaoke configuration
+            metadata: Optional metadata dict to pass to writer
+
+        Returns:
+            Path to the written file if path is a file path, or bytes if path is BytesIO/None
+        """
+        # Temporarily swap supervisions to use the priority order
+        original_supervisions = self.supervisions
+
+        if self.alignments:
+            self.supervisions = self.alignments
+        elif not self.supervisions and self.transcription:
+            self.supervisions = self.transcription
+
+        try:
+            result = super().write(
+                path=path,
+                output_format=output_format,
+                include_speaker_in_text=include_speaker_in_text,
+                word_level=word_level,
+                karaoke_config=karaoke_config,
+                metadata=metadata,
+            )
+        finally:
+            # Restore original supervisions
+            self.supervisions = original_supervisions
+
+        return result
+
+    @classmethod
+    def from_transcription_results(
+        cls,
+        transcription: List[Supervision],
+        event: Optional["LEDOutput"] = None,
+        diarization: Optional[DiarizationOutput] = None,
+        language: Optional[str] = None,
+        source_path: Optional[Pathlike] = None,
+        metadata: Optional[Dict[str, str]] = None,
+    ) -> "Caption":
+        """
+        Create Caption from transcription results including audio events and diarization.
+
+        Args:
+            transcription: List of transcription supervision segments
+            event: Optional LEDOutput with event detection results
+            diarization: Optional DiarizationOutput with speaker diarization results
+            language: Language code
+            source_path: Source file path
+            metadata: Additional metadata
+
+        Returns:
+            New Caption instance with transcription data
+        """
+        return cls(
+            transcription=transcription,
+            event=event,
+            diarization=diarization,
+            language=language,
+            kind="transcription",
+            source_format="asr",
+            source_path=source_path,
+            metadata=metadata or {},
+        )
+
+    def read_diarization(
+        self,
+        path: Pathlike,
+    ) -> "DiarizationOutput":
+        """
+        Read speaker diarization TextGrid from file.
+        """
+        from lattifai_core.diarization import DiarizationOutput
+
+        self.diarization = DiarizationOutput.read(path)
+        return self.diarization
+
+    def write_diarization(
+        self,
+        path: Pathlike,
+    ) -> Pathlike:
+        """
+        Write speaker diarization TextGrid to file.
+        """
+        if not self.diarization:
+            raise ValueError("No speaker diarization data to write.")
+
+        self.diarization.write(path)
+        return path
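The methods above all resolve their source segments in the same priority order (alignments, then supervisions, then transcription). A sketch of that flow; asr_segments and word_aligned_segments are hypothetical Supervision lists standing in for real pipeline output:

# Sketch only: write a caption before and after alignment results are attached.
from lattifai.data import Caption

caption = Caption.from_transcription_results(
    transcription=asr_segments,        # hypothetical List[Supervision] from ASR
    language="en",
    source_path="episode42.wav",
)
caption.write("episode42.raw.srt")     # no alignments/supervisions yet -> falls back to transcription

caption.alignments = word_aligned_segments   # hypothetical word-level alignment results
tight = caption.with_margins(start_margin=0.08, end_margin=0.20)
tight.write("episode42.srt")           # writes the margin-adjusted supervisions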
lattifai/diarization/__init__.py CHANGED
@@ -1,4 +1,44 @@
-"""Speaker diarization module for LattifAI."""
+"""Speaker diarization module for LattifAI.
+
+This module provides multi-speaker identification and labeling capabilities
+using pyannote.audio-based diarization models. It can identify who spoke
+when in an audio file and optionally match detected speakers with existing
+speaker labels from input captions.
+
+Key Components:
+    LattifAIDiarizer: Main diarization class that wraps pyannote.audio
+    pipelines for speaker segmentation and clustering.
+
+Features:
+    - Automatic speaker detection with configurable min/max speaker counts
+    - Speaker label preservation from input captions (e.g., "Alice:", ">> Bob:")
+    - Integration with alignment results to assign speakers to words/segments
+    - Support for pre-computed diarization results (avoid reprocessing)
+
+Configuration:
+    Use DiarizationConfig to control:
+    - enabled: Whether to run diarization
+    - min_speakers/max_speakers: Constrain speaker count detection
+    - device: GPU/CPU device selection
+    - debug: Enable verbose output
+
+Example:
+    >>> from lattifai import LattifAI
+    >>> from lattifai.config import DiarizationConfig
+    >>> client = LattifAI(diarization_config=DiarizationConfig(enabled=True))
+    >>> caption = client.alignment(audio="speech.wav", input_caption="transcript.srt")
+    >>> for seg in caption.supervisions:
+    ...     print(f"{seg.speaker}: {seg.text}")
+
+Performance Notes:
+    - Diarization adds ~10-30% processing time to alignment
+    - GPU acceleration recommended for longer audio files
+    - Results are cached when output_path is provided
+
+See Also:
+    - lattifai.config.DiarizationConfig: Configuration options
+    - lattifai.client.LattifAI.speaker_diarization: Direct diarization method
+"""
 
 from .lattifai import LattifAIDiarizer
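Extending the docstring's example with speaker-count constraints; the min_speakers/max_speakers field names follow the Configuration list in the docstring above, and any other DiarizationConfig fields are assumed to keep their defaults.

# Sketch only: constrain the detected speaker count during alignment.
from lattifai import LattifAI
from lattifai.config import DiarizationConfig

client = LattifAI(
    diarization_config=DiarizationConfig(enabled=True, min_speakers=2, max_speakers=4),
)
caption = client.alignment(audio="panel.wav", input_caption="panel.srt")
for seg in caption.supervisions:
    print(f"{seg.speaker}: {seg.text}")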