lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. lattifai/__init__.py +42 -27
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
  5. lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/supervision.py +1 -0
  12. lattifai/{io → caption}/text_parser.py +53 -10
  13. lattifai/cli/__init__.py +17 -0
  14. lattifai/cli/alignment.py +153 -0
  15. lattifai/cli/caption.py +204 -0
  16. lattifai/cli/server.py +19 -0
  17. lattifai/cli/transcribe.py +197 -0
  18. lattifai/cli/youtube.py +128 -0
  19. lattifai/client.py +455 -246
  20. lattifai/config/__init__.py +20 -0
  21. lattifai/config/alignment.py +73 -0
  22. lattifai/config/caption.py +178 -0
  23. lattifai/config/client.py +46 -0
  24. lattifai/config/diarization.py +67 -0
  25. lattifai/config/media.py +335 -0
  26. lattifai/config/transcription.py +84 -0
  27. lattifai/diarization/__init__.py +5 -0
  28. lattifai/diarization/lattifai.py +89 -0
  29. lattifai/errors.py +41 -34
  30. lattifai/logging.py +116 -0
  31. lattifai/mixin.py +552 -0
  32. lattifai/server/app.py +420 -0
  33. lattifai/transcription/__init__.py +76 -0
  34. lattifai/transcription/base.py +108 -0
  35. lattifai/transcription/gemini.py +219 -0
  36. lattifai/transcription/lattifai.py +103 -0
  37. lattifai/types.py +30 -0
  38. lattifai/utils.py +3 -31
  39. lattifai/workflow/__init__.py +22 -0
  40. lattifai/workflow/agents.py +6 -0
  41. lattifai/{workflows → workflow}/file_manager.py +81 -57
  42. lattifai/workflow/youtube.py +564 -0
  43. lattifai-1.0.0.dist-info/METADATA +736 -0
  44. lattifai-1.0.0.dist-info/RECORD +52 -0
  45. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  46. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  47. lattifai/base_client.py +0 -126
  48. lattifai/bin/__init__.py +0 -3
  49. lattifai/bin/agent.py +0 -324
  50. lattifai/bin/align.py +0 -295
  51. lattifai/bin/cli_base.py +0 -25
  52. lattifai/bin/subtitle.py +0 -210
  53. lattifai/io/__init__.py +0 -43
  54. lattifai/io/reader.py +0 -86
  55. lattifai/io/utils.py +0 -15
  56. lattifai/io/writer.py +0 -102
  57. lattifai/tokenizer/__init__.py +0 -3
  58. lattifai/workers/__init__.py +0 -3
  59. lattifai/workflows/__init__.py +0 -34
  60. lattifai/workflows/agents.py +0 -12
  61. lattifai/workflows/gemini.py +0 -167
  62. lattifai/workflows/prompts/README.md +0 -22
  63. lattifai/workflows/prompts/gemini/README.md +0 -24
  64. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  65. lattifai/workflows/youtube.py +0 -931
  66. lattifai-0.4.6.dist-info/METADATA +0 -806
  67. lattifai-0.4.6.dist-info/RECORD +0 -39
  68. lattifai-0.4.6.dist-info/entry_points.txt +0 -3
  69. /lattifai/{io → caption}/gemini_reader.py +0 -0
  70. /lattifai/{io → caption}/gemini_writer.py +0 -0
  71. /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
  72. /lattifai/{workflows → workflow}/base.py +0 -0
  73. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
  74. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/transcription/gemini.py ADDED
@@ -0,0 +1,219 @@
+ """Gemini 2.5 Pro transcription module with config-driven architecture."""
+
+ import asyncio
+ from pathlib import Path
+ from typing import Optional, Union
+
+ from google import genai
+ from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
+
+ from lattifai.audio2 import AudioData
+ from lattifai.config import TranscriptionConfig
+ from lattifai.transcription.base import BaseTranscriber
+ from lattifai.transcription.prompts import get_prompt_loader
+
+
+ class GeminiTranscriber(BaseTranscriber):
+     """
+     Gemini 2.5/3 Pro audio transcription with config-driven architecture.
+
+     Uses TranscriptionConfig for all behavioral settings.
+     """
+
+     # Transcriber metadata
+     file_suffix = ".md"
+
+     # The specific Gem URL
+     GEM_URL = "https://gemini.google.com/gem/1870ly7xvW2hU_umtv-LedGsjywT0sQiN"
+
+     def __init__(
+         self,
+         transcription_config: Optional[TranscriptionConfig] = None,
+     ):
+         """
+         Initialize Gemini transcriber.
+
+         Args:
+             transcription_config: Transcription configuration. If None, uses default.
+         """
+         super().__init__(config=transcription_config)
+
+         self._client: Optional[genai.Client] = None
+         self._generation_config: Optional[GenerateContentConfig] = None
+         self._system_prompt: Optional[str] = None
+
+         # Warn if API key not available
+         if not self.config.gemini_api_key:
+             self.logger.warning(
+                 "⚠️ Gemini API key not provided. API key will be required when calling transcription methods."
+             )
+
+     @property
+     def name(self) -> str:
+         """Human-readable name of the transcriber."""
+         return f"{self.config.model_name}"
+
+     async def transcribe_url(self, url: str, language: Optional[str] = None) -> str:
+         """
+         Transcribe audio from URL using Gemini 2.5 Pro.
+
+         Args:
+             url: URL to transcribe (e.g., YouTube)
+             language: Optional language code for transcription (overrides config)
+
+         Returns:
+             Transcribed text
+
+         Raises:
+             ValueError: If API key not provided
+             RuntimeError: If transcription fails
+         """
+         if self.config.verbose:
+             self.logger.info(f"🎤 Starting Gemini transcription for: {url}")
+
+         try:
+             contents = Part.from_uri(file_uri=url, mime_type="video/*")
+             return await self._run_generation(contents, source=url)
+
+         except ImportError:
+             raise RuntimeError("Google GenAI SDK not installed. Please install with: pip install google-genai")
+         except Exception as e:
+             self.logger.error(f"Gemini transcription failed: {str(e)}")
+             raise RuntimeError(f"Gemini transcription failed: {str(e)}")
+
+     async def transcribe_file(self, media_file: Union[str, Path, AudioData], language: Optional[str] = None) -> str:
+         """
+         Transcribe audio/video from local file using Gemini 2.5 Pro.
+
+         Args:
+             media_file: Path to local audio/video file
+             language: Optional language code for transcription (overrides config)
+
+         Returns:
+             Transcribed text
+
+         Raises:
+             ValueError: If API key not provided
+             RuntimeError: If transcription fails
+         """
+         media_file = str(media_file)
+
+         if self.config.verbose:
+             self.logger.info(f"🎤 Starting Gemini transcription for file: {media_file}")
+
+         try:
+             client = self._get_client()
+
+             # Upload audio file
+             if self.config.verbose:
+                 self.logger.info("📤 Uploading audio file to Gemini...")
+             media_file = client.files.upload(path=media_file)
+
+             contents = Part.from_uri(file_uri=media_file.uri, mime_type=media_file.mime_type)
+             return await self._run_generation(contents, source=media_file, client=client)
+
+         except ImportError:
+             raise RuntimeError("Google GenAI SDK not installed. Please install with: pip install google-genai")
+         except Exception as e:
+             self.logger.error(f"Gemini transcription failed: {str(e)}")
+             raise RuntimeError(f"Gemini transcription failed: {str(e)}")
+
+     def _get_transcription_prompt(self) -> str:
+         """Get (and cache) transcription system prompt from prompts module."""
+         if self._system_prompt is not None:
+             return self._system_prompt
+
+         # Load prompt from prompts/gemini/transcription_gem.txt
+         prompt_loader = get_prompt_loader()
+         base_prompt = prompt_loader.get_gemini_transcription_prompt()
+
+         # Add language-specific instruction if configured
+         if self.config.language:
+             base_prompt += f"\n\n* Use {self.config.language} language for transcription."
+
+         self._system_prompt = base_prompt
+         return self._system_prompt
+
+     def get_gem_info(self) -> dict:
+         """Get information about the Gem being used."""
+         return {
+             "gem_name": "Media Transcription Gem",
+             "gem_url": self.GEM_URL,
+             "model": self.config.model_name,
+             "description": "Specialized Gem for media content transcription",
+         }
+
+     def _build_result(self, transcript: str, output_file: Path) -> dict:
+         """Augment the base result with Gemini-specific metadata."""
+         base_result = super()._build_result(transcript, output_file)
+         base_result.update({"model": self.config.model_name, "language": self.config.language})
+         return base_result
+
+     def _get_client(self) -> genai.Client:
+         """Lazily create the Gemini client when first needed."""
+         if not self.config.gemini_api_key:
+             raise ValueError("Gemini API key is required for transcription")
+
+         if self._client is None:
+             self._client = genai.Client(api_key=self.config.gemini_api_key)
+         return self._client
+
+     def _get_generation_config(self) -> GenerateContentConfig:
+         """Lazily build the generation config since it rarely changes."""
+         if self._generation_config is None:
+             self._generation_config = GenerateContentConfig(
+                 system_instruction=self._get_transcription_prompt(),
+                 response_modalities=["TEXT"],
+                 thinking_config=ThinkingConfig(
+                     include_thoughts=False,
+                     thinking_budget=-1,
+                     # thinking_level="high", # "low", "medium"
+                 ),
+             )
+         return self._generation_config
+
+     async def _run_generation(
+         self,
+         contents: Part,
+         *,
+         source: str,
+         client: Optional[genai.Client] = None,
+     ) -> str:
+         """
+         Shared helper for sending generation requests and handling the response.
+         """
+         client = client or self._get_client()
+         config = self._get_generation_config()
+
+         if self.config.verbose:
+             self.logger.info(f"🔄 Sending transcription request to {self.config.model_name} ({source})...")
+
+         response = await asyncio.get_event_loop().run_in_executor(
+             None,
+             lambda: client.models.generate_content(
+                 model=self.config.model_name,
+                 contents=contents,
+                 config=config,
+             ),
+         )
+
+         if not response.text:
+             raise RuntimeError("Empty response from Gemini API")
+
+         transcript = response.text.strip()
+
+         if self.config.verbose:
+             self.logger.info(f"✅ Transcription completed ({source}): {len(transcript)} characters")
+
+         return transcript
+
+     def write(
+         self, transcript: str, output_file: Path, encoding: str = "utf-8", cache_audio_events: bool = True
+     ) -> Path:
+         """
+         Persist transcript text to disk and return the file path.
+         """
+         if isinstance(output_file, str):
+             output_file = Path(output_file)
+         output_file.write_text(transcript, encoding=encoding)
+         return output_file
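For orientation, here is a minimal usage sketch of the new `GeminiTranscriber` API, based only on the signatures shown above. The `TranscriptionConfig` keyword arguments are assumptions inferred from the attributes this diff reads (`gemini_api_key`, `model_name`, `language`, `verbose`), not confirmed constructor parameters, and the file names are placeholders.

```python
# Hypothetical usage sketch of GeminiTranscriber, based on the diff above.
import asyncio
from pathlib import Path

from lattifai.config import TranscriptionConfig
from lattifai.transcription.gemini import GeminiTranscriber


async def main() -> None:
    # Assumption: TranscriptionConfig accepts these fields as keyword arguments.
    config = TranscriptionConfig(gemini_api_key="YOUR_API_KEY")
    transcriber = GeminiTranscriber(transcription_config=config)

    # Local files go through client.files.upload(); a URL would use transcribe_url().
    transcript = await transcriber.transcribe_file("interview.mp3")
    transcriber.write(transcript, Path("interview.md"))  # file_suffix is ".md"


if __name__ == "__main__":
    asyncio.run(main())
```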
lattifai/transcription/lattifai.py ADDED
@@ -0,0 +1,103 @@
+ """Transcription module with config-driven architecture."""
+
+ from pathlib import Path
+ from typing import Optional, Union
+
+ from lattifai.audio2 import AudioData
+ from lattifai.caption import Caption
+ from lattifai.config import TranscriptionConfig
+ from lattifai.transcription.base import BaseTranscriber
+ from lattifai.transcription.prompts import get_prompt_loader  # noqa: F401
+
+
+ class LattifAITranscriber(BaseTranscriber):
+     """
+     LattifAI local transcription with config-driven architecture.
+
+     Uses TranscriptionConfig for all behavioral settings.
+     Note: This transcriber only supports local file transcription, not URLs.
+     """
+
+     # Transcriber metadata
+     file_suffix = ".ass"
+     supports_url = False
+
+     def __init__(
+         self,
+         transcription_config: TranscriptionConfig,
+     ):
+         """
+         Initialize Gemini transcriber.
+
+         Args:
+             transcription_config: Transcription configuration. If None, uses default.
+         """
+         super().__init__(
+             config=transcription_config,
+         )
+
+         self._system_prompt: Optional[str] = None
+         self._transcriber = None
+
+     @property
+     def name(self) -> str:
+         return f"{self.config.model_name}"
+
+     async def transcribe_url(self, url: str, language: Optional[str] = None) -> str:
+         """
+         URL transcription not supported for LattifAI local models.
+
+         This method exists to satisfy the BaseTranscriber interface but
+         will never be called because supports_url = False and the base
+         class checks this flag before calling this method.
+
+         Args:
+             url: URL to transcribe (not supported)
+             language: Optional language code (not used)
+         """
+         raise NotImplementedError(
+             f"{self.__class__.__name__} does not support URL transcription. "
+             f"Please download the file first and use transcribe_file()."
+         )
+
+     async def transcribe_file(self, media_file: Union[str, Path, AudioData], language: Optional[str] = None) -> Caption:
+         if self._transcriber is None:
+             from lattifai_core.transcription import LattifAITranscriber as CoreLattifAITranscriber
+
+             self._transcriber = CoreLattifAITranscriber.from_pretrained(model_config=self.config)
+
+         transcription, audio_events = self._transcriber.transcribe(media_file, language=language, num_workers=2)
+         caption = Caption.from_transcription_results(
+             transcription=transcription,
+             audio_events=audio_events,
+         )
+
+         return caption
+
+     def write(
+         self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_audio_events: bool = True
+     ) -> Path:
+         """
+         Persist transcript text to disk and return the file path.
+         """
+         transcript.write(
+             output_file,
+             include_speaker_in_text=False,
+         )
+         if cache_audio_events and transcript.audio_events:
+             from tgt import write_to_file
+
+             events_file = output_file.with_suffix(".AED")
+             write_to_file(transcript.audio_events, events_file, format="long")
+
+         return output_file
+
+     def _get_transcription_prompt(self) -> str:
+         """Get (and cache) transcription system prompt from prompts module."""
+         if self._system_prompt is not None:
+             return self._system_prompt
+
+         base_prompt = "" # TODO
+
+         self._system_prompt = base_prompt
+         return self._system_prompt
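A similar hedged sketch for the local transcriber added above; unlike `GeminiTranscriber` it returns a `Caption` rather than plain text, writes `.ass` output, and can cache detected audio events to an `.AED` sidecar via `tgt`. Default-constructing `TranscriptionConfig` is an assumption, and `lattifai_core` must be installed for the deferred import inside `transcribe_file` to succeed.

```python
# Hypothetical usage sketch of LattifAITranscriber, based on the diff above.
import asyncio
from pathlib import Path

from lattifai.config import TranscriptionConfig
from lattifai.transcription.lattifai import LattifAITranscriber


async def main() -> None:
    config = TranscriptionConfig()  # assumption: default-constructible
    transcriber = LattifAITranscriber(transcription_config=config)

    caption = await transcriber.transcribe_file("interview.mp3")
    # Writes interview.ass; also writes interview.AED if audio events were detected.
    transcriber.write(caption, Path("interview.ass"), cache_audio_events=True)


if __name__ == "__main__":
    asyncio.run(main())
```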
lattifai/types.py ADDED
@@ -0,0 +1,30 @@
+ """Common type definitions for LattifAI."""
+
+ from pathlib import Path
+ from typing import List, TypeAlias, Union
+
+ from lhotse.utils import Pathlike
+
+ from .caption import Supervision
+
+ # Path-like types
+ PathLike: TypeAlias = Pathlike # Re-export for convenience (str | Path)
+
+ # Caption types
+ SupervisionList: TypeAlias = List[Supervision]
+ """List of caption segments with timing and text information."""
+
+ # Media format types
+ MediaFormat: TypeAlias = str
+ """Media format string (e.g., 'mp3', 'wav', 'mp4')."""
+
+ # URL types
+ URL: TypeAlias = str
+ """String representing a URL."""
+
+ __all__ = [
+     "PathLike",
+     "SupervisionList",
+     "MediaFormat",
+     "URL",
+ ]
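Since these are plain `TypeAlias` re-exports, they can be used directly in annotations. A small, purely illustrative helper (not part of the package):

```python
# Illustrative only — summarize() is hypothetical, not part of lattifai.
from lattifai.types import MediaFormat, PathLike, SupervisionList


def summarize(segments: SupervisionList, source: PathLike, fmt: MediaFormat = "wav") -> str:
    """Return a one-line summary of caption segments for a media file."""
    return f"{source}: {len(segments)} segments ({fmt})"
```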
lattifai/utils.py CHANGED
@@ -6,8 +6,6 @@ from pathlib import Path
  from typing import Any, Optional, Type
 
  from lattifai.errors import ModelLoadError
- from lattifai.tokenizer import LatticeTokenizer
- from lattifai.workers import Lattice1AlphaWorker
 
 
  def _get_cache_marker_path(cache_dir: Path) -> Path:
@@ -55,8 +53,8 @@ def _create_cache_marker(cache_dir: Path) -> None:
 
  def _resolve_model_path(model_name_or_path: str) -> str:
      """Resolve model path, downloading from Hugging Face when necessary."""
-     if Path(model_name_or_path).exists():
-         return model_name_or_path
+     if Path(model_name_or_path).expanduser().exists():
+         return str(Path(model_name_or_path).expanduser())
 
      from huggingface_hub import snapshot_download
      from huggingface_hub.constants import HF_HUB_CACHE
@@ -94,7 +92,7 @@ def _resolve_model_path(model_name_or_path: str) -> str:
 
  def _select_device(device: Optional[str]) -> str:
      """Select best available torch device when not explicitly provided."""
-     if device:
+     if device and device != "auto":
          return device
 
      import torch
@@ -105,29 +103,3 @@ def _select_device(device: Optional[str]) -> str:
      elif torch.cuda.is_available():
          detected = "cuda"
      return detected
-
-
- def _load_tokenizer(
-     client_wrapper: Any,
-     model_path: str,
-     device: str,
-     *,
-     tokenizer_cls: Type[LatticeTokenizer] = LatticeTokenizer,
- ) -> LatticeTokenizer:
-     """Instantiate tokenizer with consistent error handling."""
-     try:
-         return tokenizer_cls.from_pretrained(
-             client_wrapper=client_wrapper,
-             model_path=model_path,
-             device=device,
-         )
-     except Exception as e:
-         raise ModelLoadError(f"tokenizer from {model_path}", original_error=e)
-
-
- def _load_worker(model_path: str, device: str) -> Lattice1AlphaWorker:
-     """Instantiate lattice worker with consistent error handling."""
-     try:
-         return Lattice1AlphaWorker(model_path, device=device, num_threads=8)
-     except Exception as e:
-         raise ModelLoadError(f"worker from {model_path}", original_error=e)
lattifai/workflow/__init__.py ADDED
@@ -0,0 +1,22 @@
+ """
+ LattifAI Agentic Workflows
+
+ This module provides agentic workflow capabilities for automated processing
+ of multimedia content through intelligent agent-based pipelines.
+ """
+
+ # Import transcript processing functionality
+
+
+ from .base import WorkflowAgent, WorkflowResult, WorkflowStep
+ from .file_manager import TRANSCRIBE_CHOICE, FileExistenceManager
+ from .youtube import YouTubeDownloader
+
+ __all__ = [
+     "WorkflowAgent",
+     "WorkflowStep",
+     "WorkflowResult",
+     "FileExistenceManager",
+     "YouTubeDownloader",
+     "TRANSCRIBE_CHOICE",
+ ]
lattifai/workflow/agents.py ADDED
@@ -0,0 +1,6 @@
+ """
+ Caption Agents
+
+ """
+
+ __all__ = []