lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. lattifai/__init__.py +42 -27
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
  5. lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/supervision.py +1 -0
  12. lattifai/{io → caption}/text_parser.py +53 -10
  13. lattifai/cli/__init__.py +17 -0
  14. lattifai/cli/alignment.py +153 -0
  15. lattifai/cli/caption.py +204 -0
  16. lattifai/cli/server.py +19 -0
  17. lattifai/cli/transcribe.py +197 -0
  18. lattifai/cli/youtube.py +128 -0
  19. lattifai/client.py +455 -246
  20. lattifai/config/__init__.py +20 -0
  21. lattifai/config/alignment.py +73 -0
  22. lattifai/config/caption.py +178 -0
  23. lattifai/config/client.py +46 -0
  24. lattifai/config/diarization.py +67 -0
  25. lattifai/config/media.py +335 -0
  26. lattifai/config/transcription.py +84 -0
  27. lattifai/diarization/__init__.py +5 -0
  28. lattifai/diarization/lattifai.py +89 -0
  29. lattifai/errors.py +41 -34
  30. lattifai/logging.py +116 -0
  31. lattifai/mixin.py +552 -0
  32. lattifai/server/app.py +420 -0
  33. lattifai/transcription/__init__.py +76 -0
  34. lattifai/transcription/base.py +108 -0
  35. lattifai/transcription/gemini.py +219 -0
  36. lattifai/transcription/lattifai.py +103 -0
  37. lattifai/types.py +30 -0
  38. lattifai/utils.py +3 -31
  39. lattifai/workflow/__init__.py +22 -0
  40. lattifai/workflow/agents.py +6 -0
  41. lattifai/{workflows → workflow}/file_manager.py +81 -57
  42. lattifai/workflow/youtube.py +564 -0
  43. lattifai-1.0.0.dist-info/METADATA +736 -0
  44. lattifai-1.0.0.dist-info/RECORD +52 -0
  45. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  46. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  47. lattifai/base_client.py +0 -126
  48. lattifai/bin/__init__.py +0 -3
  49. lattifai/bin/agent.py +0 -324
  50. lattifai/bin/align.py +0 -295
  51. lattifai/bin/cli_base.py +0 -25
  52. lattifai/bin/subtitle.py +0 -210
  53. lattifai/io/__init__.py +0 -43
  54. lattifai/io/reader.py +0 -86
  55. lattifai/io/utils.py +0 -15
  56. lattifai/io/writer.py +0 -102
  57. lattifai/tokenizer/__init__.py +0 -3
  58. lattifai/workers/__init__.py +0 -3
  59. lattifai/workflows/__init__.py +0 -34
  60. lattifai/workflows/agents.py +0 -12
  61. lattifai/workflows/gemini.py +0 -167
  62. lattifai/workflows/prompts/README.md +0 -22
  63. lattifai/workflows/prompts/gemini/README.md +0 -24
  64. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  65. lattifai/workflows/youtube.py +0 -931
  66. lattifai-0.4.6.dist-info/METADATA +0 -806
  67. lattifai-0.4.6.dist-info/RECORD +0 -39
  68. lattifai-0.4.6.dist-info/entry_points.txt +0 -3
  69. /lattifai/{io → caption}/gemini_reader.py +0 -0
  70. /lattifai/{io → caption}/gemini_writer.py +0 -0
  71. /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
  72. /lattifai/{workflows → workflow}/base.py +0 -0
  73. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
  74. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,20 @@
1
"""Configuration system for LattifAI using nemo_run."""

from .alignment import AlignmentConfig
from .caption import CaptionConfig
from .client import ClientConfig
from .diarization import DiarizationConfig
from .media import AUDIO_FORMATS, MEDIA_FORMATS, VIDEO_FORMATS, MediaConfig
from .transcription import TranscriptionConfig

# Public API of the config package: one config class per concern, plus the
# media-format constants re-exported from .media.
__all__ = [
    "ClientConfig",
    "AlignmentConfig",
    "CaptionConfig",
    "TranscriptionConfig",
    "DiarizationConfig",
    "MediaConfig",
    "AUDIO_FORMATS",
    "VIDEO_FORMATS",
    "MEDIA_FORMATS",
]
@@ -0,0 +1,73 @@
1
+ """Alignment configuration for LattifAI."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Dict, Literal, Optional
5
+
6
+ from ..utils import _select_device
7
+
8
+ if TYPE_CHECKING:
9
+ from ..base_client import SyncAPIClient
10
+
11
+
12
@dataclass
class AlignmentConfig:
    """
    Core alignment configuration.

    Defines model selection, device placement, batching, and the long-audio
    segmentation settings used for forced alignment.
    """

    # Alignment configuration
    model_name: str = "Lattifai/Lattice-1"
    """Model identifier or path to local model directory (e.g., 'Lattifai/Lattice-1')."""

    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
    """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon,
    or 'auto' to let ``_select_device`` pick one. An explicit index such as 'cuda:0'
    is also accepted at runtime.
    """

    batch_size: int = 1
    """Batch size for inference (number of samples processed simultaneously)."""

    # Segmented Alignment for Long Audio
    trust_caption_timestamps: bool = False
    """When True, use original caption timestamps as strong reference constraints during alignment.
    The alignment process will still adjust timestamps but stay close to the input timing.
    Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
    while preserving the approximate timing from the original captions.
    When False (default), performs unconstrained forced alignment based purely on media-caption matching.
    """

    strategy: Literal["caption", "transcription", "entire"] = "entire"
    """Alignment strategy for long audio alignment:
    - 'entire': Process entire audio as single alignment (default, suitable for <30 min)
    - 'caption': Split based on existing caption boundaries and gaps (segment_max_gap);
      works with `alignment.trust_caption_timestamps=true`
    - 'transcription': Align media with transcription first, then segment based on transcription

    Use segmentation for long audio (>30 min) to reduce memory usage and improve performance.
    """

    segment_duration: float = 300.0
    """Target duration (in seconds) for each alignment segment when using 'caption' strategy.
    Default: 300.0 (5 minutes). Typical range: 30-600 seconds (30s-10min).
    Shorter segments = lower memory, longer segments = better context for alignment.
    """

    segment_max_gap: float = 4.0
    """Maximum gap (in seconds) between captions to consider them part of the same segment.
    Used by the 'caption' strategy. Gaps larger than this trigger segment splitting.
    Default: 4.0 seconds. Useful for detecting scene changes or natural breaks in content.
    """

    # NOTE(review): the TYPE_CHECKING import takes SyncAPIClient from ..base_client,
    # which this release's changed-file list shows as removed -- confirm the path.
    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

    def __post_init__(self):
        """Validate fields and resolve ``device='auto'`` to a concrete device.

        Raises:
            ValueError: If ``batch_size`` is below 1, or ``device`` is not a string
                naming 'cpu', 'cuda', 'mps', 'auto' or 'cuda:<index>'.
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        named_devices = ("cpu", "cuda", "mps", "auto")
        # Check the type first so a non-string device (e.g. None) is reported as a
        # ValueError rather than failing on .startswith with AttributeError.
        device_ok = isinstance(self.device, str) and (
            self.device in named_devices or self.device.startswith("cuda:")
        )
        if not device_ok:
            raise ValueError(f"device must be one of {named_devices} or 'cuda:<index>', got {self.device!r}")

        if self.device == "auto":
            self.device = _select_device(self.device)
@@ -0,0 +1,178 @@
1
+ """Caption I/O configuration for LattifAI."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Literal, Optional
6
+
7
+ from lhotse.utils import Pathlike
8
+
9
# Supported caption formats for reading/writing
CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "md", "ttml", "sami", "smi"]

# Input caption formats (includes special formats like 'auto' and 'gemini')
INPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "auto", "gemini"]

# Output caption formats (includes special formats like 'TextGrid' and 'json')
OUTPUT_CAPTION_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"]

# All caption formats combined (for file detection).
# dict.fromkeys de-duplicates while keeping insertion order, so the list is
# identical on every run (list(set(...)) order varies with string hash seeding).
ALL_CAPTION_FORMATS = list(dict.fromkeys(CAPTION_FORMATS + ["TextGrid", "json", "gemini"]))

# Type aliases for better type hints
InputCaptionFormat = Literal["auto", "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "gemini"]
OutputCaptionFormat = Literal[
    "srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "ttml", "sami", "smi", "TextGrid", "json"
]


@dataclass
class CaptionConfig:
    """
    Caption I/O configuration.

    Controls caption file reading, writing, and formatting options.
    """

    input_format: InputCaptionFormat = "auto"
    """Input caption format: any value in INPUT_CAPTION_FORMATS (e.g. 'auto', 'srt', 'vtt', 'ass', 'txt', 'gemini')."""

    input_path: Optional[str] = None
    """Path to input caption file."""

    output_format: OutputCaptionFormat = "srt"
    """Output caption format: any value in OUTPUT_CAPTION_FORMATS (e.g. 'srt', 'vtt', 'ass', 'txt', 'TextGrid', 'json')."""

    output_path: Optional[str] = None
    """Path to output caption file."""

    include_speaker_in_text: bool = True
    """Preserve speaker labels in caption text content."""

    normalize_text: bool = False
    """Clean HTML entities and normalize whitespace in caption text."""

    split_sentence: bool = False
    """Re-segment captions intelligently based on punctuation and semantics."""

    word_level: bool = False
    """Include word-level timestamps in alignment results (useful for karaoke, dubbing)."""

    encoding: str = "utf-8"
    """Character encoding for reading/writing caption files (default: utf-8)."""

    source_lang: Optional[str] = None
    """Source language code for the caption content (e.g., 'en', 'zh', 'de')."""

    def __post_init__(self):
        """Normalize the configured paths, then validate the format fields."""
        self._normalize_paths()
        self._validate_formats()

    def need_alignment(self, trust_timestamps: bool) -> bool:
        """Determine if alignment is needed based on configuration.

        This is a regular method, not a property: it takes an argument, and a
        property getter can only ever be invoked with ``self``.

        Args:
            trust_timestamps: Whether the input caption timestamps are trusted.

        Returns:
            False when timestamps are trusted and sentences are not re-split;
            True otherwise.
        """
        if not trust_timestamps or self.split_sentence:
            return True

        # NOTE(review): every path below returns False, so word_level only decides
        # whether the normalization warning is printed -- confirm that is intended.
        if self.word_level and self.normalize_text:
            print(
                "⚠️ Warning: Text normalization with 'trust_caption_timestamps=True' and 'split_sentence=False' "
                "does not trigger alignment.\n"
                "💡 Recommended command:\n"
                "   lai caption normalize input.srt normalized.srt\n"
            )
        return False

    def _normalize_paths(self) -> None:
        """Expand ``~`` in input/output paths and create the output's parent directory."""
        # The input path is only expanded, not required to exist yet
        # (it might be set later after downloading captions).
        if self.input_path is not None:
            self.input_path = str(Path(self.input_path).expanduser())

        if self.output_path is not None:
            self.output_path = str(Path(self.output_path).expanduser())
            Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)

    def _validate_formats(self) -> None:
        """Validate input and output format fields.

        Raises:
            ValueError: If either format is not in its list of accepted values.
        """
        if self.input_format not in INPUT_CAPTION_FORMATS:
            raise ValueError(f"input_format must be one of {INPUT_CAPTION_FORMATS}, got '{self.input_format}'")

        if self.output_format not in OUTPUT_CAPTION_FORMATS:
            raise ValueError(f"output_format must be one of {OUTPUT_CAPTION_FORMATS}, got '{self.output_format}'")

    def set_input_path(self, path: Pathlike) -> Path:
        """
        Set input caption path and validate it.

        Args:
            path: Path to input caption file (str or Path)

        Returns:
            Resolved path as Path object

        Raises:
            FileNotFoundError: If the file does not exist
            ValueError: If the path is not a file
        """
        resolved = Path(path).expanduser().resolve()
        if not resolved.exists():
            raise FileNotFoundError(f"Input caption file does not exist: '{resolved}'")
        if not resolved.is_file():
            raise ValueError(f"Input caption path is not a file: '{resolved}'")
        self.input_path = str(resolved)
        self.check_input_sanity()
        return resolved

    def set_output_path(self, path: Pathlike) -> Path:
        """
        Set output caption path and create parent directories if needed.

        Args:
            path: Path to output caption file (str or Path)

        Returns:
            Resolved path as Path object
        """
        resolved = Path(path).expanduser().resolve()
        resolved.parent.mkdir(parents=True, exist_ok=True)
        self.output_path = str(resolved)
        return resolved

    def check_input_sanity(self) -> None:
        """
        Validate that input_path is properly configured and accessible.

        Raises:
            ValueError: If input_path is not set or is invalid
            FileNotFoundError: If input_path does not exist
        """
        if not self.input_path:
            raise ValueError("input_path is required but not set in CaptionConfig")

        input_file = Path(self.input_path).expanduser()
        if not input_file.exists():
            raise FileNotFoundError(
                f"Input caption file does not exist: '{input_file}'. " "Please check the path and try again."
            )
        if not input_file.is_file():
            raise ValueError(
                f"Input caption path is not a file: '{input_file}'. " "Expected a valid caption file path."
            )

    def check_sanity(self) -> bool:
        """Perform sanity checks on the configuration.

        Returns:
            True when the input caption path is set and points to an existing file.

        Raises:
            AssertionError: If the input caption path is missing or not a file.
        """
        # Raised explicitly (same exception type as the former assert statement)
        # so the check is not stripped when Python runs with -O.
        if not self.is_input_path_existed():
            raise AssertionError("Input caption path must be provided and exist.")
        return True

    def is_input_path_existed(self) -> bool:
        """Check if input caption path is provided and exists.

        Side effect: rewrites ``input_path`` with ``~`` expanded.
        """
        if self.input_path is None:
            return False

        input_file = Path(self.input_path).expanduser()
        self.input_path = str(input_file)
        return input_file.exists() and input_file.is_file()
@@ -0,0 +1,46 @@
1
+ """LattifAI Client configuration."""
2
+
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from typing import Dict, Optional
6
+
7
+
8
@dataclass
class ClientConfig:
    """
    LattifAI API client configuration.

    Holds the API key, request timeout, retry count, and optional static headers.
    """

    # API configuration
    api_key: Optional[str] = None
    """LattifAI API key. If None, reads from LATTIFAI_API_KEY environment variable."""

    timeout: float = 120.0
    """Request timeout in seconds."""

    max_retries: int = 2
    """Maximum number of retry attempts for failed requests."""

    default_headers: Optional[Dict[str, str]] = None
    """Optional static headers to include in all requests."""

    def __post_init__(self):
        """Load ``.env``, fill in a missing API key from the environment, and validate.

        Raises:
            ValueError: If ``timeout`` is not positive or ``max_retries`` is negative.
        """
        # Load environment variables from .env file
        from dotenv import find_dotenv, load_dotenv

        # Search for a .env file starting from the current working directory.
        load_dotenv(find_dotenv(usecwd=True))

        # Auto-load API key from environment if not provided. Plain assignment is
        # sufficient because this dataclass is not frozen.
        if self.api_key is None:
            self.api_key = os.environ.get("LATTIFAI_API_KEY")

        # Validate API parameters
        if self.timeout <= 0:
            raise ValueError("timeout must be greater than 0")
        if self.max_retries < 0:
            raise ValueError("max_retries must be non-negative")
@@ -0,0 +1,67 @@
1
+ """Speaker diarization configuration for LattifAI."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Literal, Optional
5
+
6
+ from ..utils import _select_device
7
+
8
+ if TYPE_CHECKING:
9
+ from ..base_client import SyncAPIClient
10
+
11
+
12
@dataclass
class DiarizationConfig:
    """
    Speaker diarization configuration.

    Settings for speaker diarization operations.
    """

    enabled: bool = False
    """Enable speaker diarization."""

    device: Literal["cpu", "cuda", "mps", "auto"] = "auto"
    """Computation device for diarization models. 'auto' is resolved via ``_select_device``;
    an explicit index such as 'cuda:0' is also accepted at runtime.
    """

    num_speakers: Optional[int] = None
    """Number of speakers, when known. If not set, diarization will attempt to infer the number of speakers."""

    min_speakers: Optional[int] = None
    """Minimum number of speakers. Has no effect when `num_speakers` is provided."""

    max_speakers: Optional[int] = None
    """Maximum number of speakers. Has no effect when `num_speakers` is provided."""

    model_name: str = "pyannote/speaker-diarization-community-1"
    """Model name for speaker diarization."""

    verbose: bool = False
    """Enable verbose logging for diarization operations."""

    debug: bool = False
    """Enable debug mode for diarization operations."""

    # NOTE(review): the TYPE_CHECKING import takes SyncAPIClient from ..base_client,
    # which this release's changed-file list shows as removed -- confirm the path.
    client_wrapper: Optional["SyncAPIClient"] = field(default=None, repr=False)
    """Reference to the SyncAPIClient instance. Auto-set during client initialization."""

    def __post_init__(self):
        """Validate fields and resolve ``device='auto'`` to a concrete device.

        Raises:
            ValueError: If ``device`` is not a string naming 'cpu', 'cuda', 'mps',
                'auto' or 'cuda:<index>', if any speaker count is below 1, or if
                ``min_speakers`` exceeds ``max_speakers``.
        """
        named_devices = ("cpu", "cuda", "mps", "auto")
        # Check the type first so a non-string device (e.g. None) is reported as a
        # ValueError rather than failing on .startswith with AttributeError.
        device_ok = isinstance(self.device, str) and (
            self.device in named_devices or self.device.startswith("cuda:")
        )
        if not device_ok:
            raise ValueError(f"device must be one of {named_devices} or 'cuda:<index>', got {self.device!r}")

        if self.device == "auto":
            self.device = _select_device(self.device)

        # Validate speaker counts
        if self.num_speakers is not None and self.num_speakers < 1:
            raise ValueError("num_speakers must be at least 1")

        if self.min_speakers is not None and self.min_speakers < 1:
            raise ValueError("min_speakers must be at least 1")

        if self.max_speakers is not None and self.max_speakers < 1:
            raise ValueError("max_speakers must be at least 1")

        if self.min_speakers is not None and self.max_speakers is not None and self.min_speakers > self.max_speakers:
            raise ValueError("min_speakers cannot be greater than max_speakers")