PyPI - lattifai - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59)

lattifai/_init.py +20 -0
lattifai/alignment/__init__.py +9 -1
lattifai/alignment/lattice1_aligner.py +175 -54
lattifai/alignment/lattice1_worker.py +47 -4
lattifai/alignment/punctuation.py +38 -0
lattifai/alignment/segmenter.py +3 -2
lattifai/alignment/text_align.py +441 -0
lattifai/alignment/tokenizer.py +134 -65
lattifai/audio2.py +162 -183
lattifai/cli/__init__.py +2 -1
lattifai/cli/alignment.py +5 -0
lattifai/cli/caption.py +111 -4
lattifai/cli/transcribe.py +2 -6
lattifai/cli/youtube.py +7 -1
lattifai/client.py +72 -123
lattifai/config/__init__.py +28 -0
lattifai/config/alignment.py +14 -0
lattifai/config/caption.py +45 -31
lattifai/config/client.py +16 -0
lattifai/config/event.py +102 -0
lattifai/config/media.py +20 -0
lattifai/config/transcription.py +25 -1
lattifai/data/__init__.py +8 -0
lattifai/data/caption.py +228 -0
lattifai/diarization/__init__.py +41 -1
lattifai/errors.py +78 -53
lattifai/event/__init__.py +65 -0
lattifai/event/lattifai.py +166 -0
lattifai/mixin.py +49 -32
lattifai/transcription/base.py +8 -2
lattifai/transcription/gemini.py +147 -16
lattifai/transcription/lattifai.py +25 -63
lattifai/types.py +1 -1
lattifai/utils.py +7 -13
lattifai/workflow/__init__.py +28 -4
lattifai/workflow/file_manager.py +2 -5
lattifai/youtube/__init__.py +43 -0
lattifai/youtube/client.py +1265 -0
lattifai/youtube/types.py +23 -0
lattifai-1.3.0.dist-info/METADATA +678 -0
lattifai-1.3.0.dist-info/RECORD +57 -0
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
lattifai/__init__.py +0 -88
lattifai/alignment/sentence_splitter.py +0 -219
lattifai/caption/__init__.py +0 -20
lattifai/caption/caption.py +0 -1467
lattifai/caption/gemini_reader.py +0 -462
lattifai/caption/gemini_writer.py +0 -173
lattifai/caption/supervision.py +0 -34
lattifai/caption/text_parser.py +0 -145
lattifai/cli/app_installer.py +0 -142
lattifai/cli/server.py +0 -44
lattifai/server/app.py +0 -427
lattifai/workflow/youtube.py +0 -577
lattifai-1.2.1.dist-info/METADATA +0 -1134
lattifai-1.2.1.dist-info/RECORD +0 -58
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
{lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0

lattifai/transcription/base.py CHANGED Viewed

@@ -7,8 +7,9 @@ from typing import List, Optional, Union
 import numpy as np
 from lattifai.audio2 import AudioData
-from lattifai.caption import Caption, Supervision
+from lattifai.caption import Supervision
 from lattifai.config import TranscriptionConfig
+from lattifai.data import Caption
 from lattifai.logging import get_logger
@@ -41,8 +42,13 @@ class BaseTranscriber(ABC):
         self.logger = get_logger("transcription")
     @property
+    @abstractmethod
     def name(self) -> str:
-        """Human-readable name of the transcriber."""
+        """Human-readable name of the transcriber.
+        Returns:
+            str: Identifier for the transcriber (e.g., 'gemini', 'parakeet').
+        """
     @property
     def file_name(self) -> str:

lattifai/transcription/gemini.py CHANGED Viewed

@@ -11,6 +11,7 @@ from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
 from lattifai.audio2 import AudioData
 from lattifai.caption import Supervision
 from lattifai.config import TranscriptionConfig
+from lattifai.data import Caption
 from lattifai.transcription.base import BaseTranscriber
 from lattifai.transcription.prompts import get_prompt_loader
@@ -245,18 +246,41 @@ class GeminiTranscriber(BaseTranscriber):
         return transcript
     def _get_transcription_prompt(self) -> str:
-        """Get (and cache) transcription system prompt from prompts module."""
+        """Get (and cache) transcription system prompt.
+        Priority:
+        1. Custom prompt from config.prompt (file path or text)
+        2. Default prompt from prompts/gemini/transcription_gem.txt
+        """
         if self._system_prompt is not None:
             return self._system_prompt
-        # Load prompt from prompts/gemini/transcription_gem.txt
-        prompt_loader = get_prompt_loader()
-        base_prompt = prompt_loader.get_gemini_transcription_prompt()
+        # Check for custom prompt
+        if self.config.prompt:
+            prompt_path = Path(self.config.prompt)
+            if prompt_path.exists() and prompt_path.is_file():
+                # Load from file
+                base_prompt = prompt_path.read_text(encoding="utf-8").strip()
+                if self.config.verbose:
+                    self.logger.info(f"📝 Using custom prompt from file: {prompt_path}")
+            else:
+                # Use as direct text
+                base_prompt = self.config.prompt
+                if self.config.verbose:
+                    self.logger.info("📝 Using custom prompt text")
+        else:
+            # Load default prompt from prompts/gemini/transcription_gem.txt
+            prompt_loader = get_prompt_loader()
+            base_prompt = prompt_loader.get_gemini_transcription_prompt()
         # Add language-specific instruction if configured
         if self.config.language:
             base_prompt += f"\n\n* Use {self.config.language} language for transcription."
+        # Add media description context if available
+        if self.config.description:
+            base_prompt += f"\n\n## Media Context\n\n{self.config.description}"
         self._system_prompt = base_prompt
         return self._system_prompt
@@ -287,14 +311,21 @@ class GeminiTranscriber(BaseTranscriber):
     def _get_generation_config(self) -> GenerateContentConfig:
         """Lazily build the generation config since it rarely changes."""
         if self._generation_config is None:
+            # Only include thinking_config if thinking mode is enabled
+            thinking_config = None
+            if self.config.thinking:
+                thinking_config = ThinkingConfig(
+                    include_thoughts=self.config.include_thoughts,
+                    thinking_budget=-1,
+                )
             self._generation_config = GenerateContentConfig(
                 system_instruction=self._get_transcription_prompt(),
                 response_modalities=["TEXT"],
-                thinking_config=ThinkingConfig(
-                    include_thoughts=False,
-                    thinking_budget=-1,
-                    # thinking_level="high",  # "low", "medium"
-                ),
+                thinking_config=thinking_config,
+                temperature=self.config.temperature,
+                top_k=self.config.top_k,
+                top_p=self.config.top_p,
             )
         return self._generation_config
@@ -323,23 +354,123 @@ class GeminiTranscriber(BaseTranscriber):
             ),
         )
-        if not response.text:
-            raise RuntimeError("Empty response from Gemini API")
-        transcript = response.text.strip()
+        # Extract content based on include_thoughts setting
+        if self.config.include_thoughts:
+            transcript = self._extract_with_thoughts(response)
+        else:
+            if not response.text:
+                raise RuntimeError("Empty response from Gemini API")
+            transcript = response.text.strip()
         if self.config.verbose:
             self.logger.info(f"✅ Transcription completed ({source}): {len(transcript)} characters")
         return transcript
+    def _extract_with_thoughts(self, response) -> str:
+        """Extract response content including thinking process and metadata."""
+        output_parts = []
+        thoughts = []
+        text_parts = []
+        # Iterate through all parts in the response
+        for candidate in response.candidates:
+            for part in candidate.content.parts:
+                if hasattr(part, "thought") and part.thought:
+                    # This is a thinking part
+                    if hasattr(part, "text") and part.text:
+                        thoughts.append(part.text)
+                elif hasattr(part, "text") and part.text:
+                    # This is a regular text part
+                    text_parts.append(part.text)
+        # Extract metadata
+        metadata_lines = self._extract_response_metadata(response)
+        if metadata_lines:
+            output_parts.append("---")
+            output_parts.extend(metadata_lines)
+            output_parts.append("---\n")
+        # Format output with thoughts section if present
+        if thoughts:
+            output_parts.append("<thinking>")
+            output_parts.extend(thoughts)
+            output_parts.append("</thinking>\n")
+        output_parts.extend(text_parts)
+        result = "\n".join(output_parts).strip()
+        if not result:
+            raise RuntimeError("Empty response from Gemini API")
+        return result
+    def _extract_response_metadata(self, response) -> list:
+        """Extract useful metadata from Gemini response as YAML frontmatter."""
+        lines = []
+        # Model version
+        if hasattr(response, "model_version") and response.model_version:
+            lines.append(f"model_version: {response.model_version}")
+        # Usage metadata (token counts)
+        if hasattr(response, "usage_metadata") and response.usage_metadata:
+            usage = response.usage_metadata
+            if hasattr(usage, "prompt_token_count"):
+                lines.append(f"prompt_tokens: {usage.prompt_token_count}")
+            if hasattr(usage, "candidates_token_count"):
+                lines.append(f"output_tokens: {usage.candidates_token_count}")
+            if hasattr(usage, "total_token_count"):
+                lines.append(f"total_tokens: {usage.total_token_count}")
+            # Thinking tokens if available
+            if hasattr(usage, "thoughts_token_count") and usage.thoughts_token_count:
+                lines.append(f"thinking_tokens: {usage.thoughts_token_count}")
+        # Candidate-level metadata
+        if response.candidates:
+            candidate = response.candidates[0]
+            # Finish reason
+            if hasattr(candidate, "finish_reason") and candidate.finish_reason:
+                lines.append(f"finish_reason: {candidate.finish_reason}")
+            # Average log probability (confidence indicator)
+            if hasattr(candidate, "avg_logprobs") and candidate.avg_logprobs is not None:
+                lines.append(f"avg_logprobs: {candidate.avg_logprobs:.4f}")
+            # Citation metadata
+            if hasattr(candidate, "citation_metadata") and candidate.citation_metadata:
+                citations = getattr(candidate.citation_metadata, "citations", [])
+                if citations:
+                    lines.append("citations:")
+                    for cite in citations:
+                        uri = getattr(cite, "uri", "")
+                        start = getattr(cite, "start_index", "")
+                        end = getattr(cite, "end_index", "")
+                        if uri:
+                            lines.append(f"  - uri: {uri}")
+                            if start or end:
+                                lines.append(f"    range: [{start}, {end}]")
+        return lines
     def write(
-        self, transcript: str, output_file: Path, encoding: str = "utf-8", cache_audio_events: bool = True
+        self, transcript: Union[str, Caption], output_file: Path, encoding: str = "utf-8", cache_event: bool = True
     ) -> Path:
         """
-        Persist transcript text to disk and return the file path.
+        Persist transcript to disk and return the file path.
+        Supports both raw string (from transcribe_file) and Caption object
+        (after conversion in mixin._transcribe).
         """
         if isinstance(output_file, str):
             output_file = Path(output_file)
-        output_file.write_text(transcript, encoding=encoding)
+        if isinstance(transcript, Caption):
+            # Caption object - use its write method with gemini format
+            transcript.write(output_file, output_format="gemini")
+        else:
+            # Raw string from transcription
+            output_file.write_text(transcript, encoding=encoding)
         return output_file

lattifai/transcription/lattifai.py CHANGED Viewed

@@ -6,10 +6,10 @@ from typing import List, Optional, Union
 import numpy as np
 from lattifai.audio2 import AudioData
-from lattifai.caption import Caption, Supervision
+from lattifai.caption import Supervision
 from lattifai.config import TranscriptionConfig
+from lattifai.data import Caption
 from lattifai.transcription.base import BaseTranscriber
-from lattifai.transcription.prompts import get_prompt_loader  # noqa: F401
 class LattifAITranscriber(BaseTranscriber):
@@ -20,61 +20,42 @@ class LattifAITranscriber(BaseTranscriber):
     Note: This transcriber only supports local file transcription, not URLs.
     """
-    # Transcriber metadata
     file_suffix = ".ass"
     supports_url = False
-    def __init__(
-        self,
-        transcription_config: TranscriptionConfig,
-    ):
+    def __init__(self, transcription_config: TranscriptionConfig):
         """
-        Initialize Gemini transcriber.
+        Initialize LattifAI transcriber.
         Args:
-            transcription_config: Transcription configuration. If None, uses default.
+            transcription_config: Transcription configuration.
         """
-        super().__init__(
-            config=transcription_config,
-        )
-        self._system_prompt: Optional[str] = None
+        super().__init__(config=transcription_config)
         self._transcriber = None
     @property
     def name(self) -> str:
-        return f"{self.config.model_name}"
-    async def transcribe_url(self, url: str, language: Optional[str] = None) -> str:
-        """
-        URL transcription not supported for LattifAI local models.
-        This method exists to satisfy the BaseTranscriber interface but
-        will never be called because supports_url = False and the base
-        class checks this flag before calling this method.
+        return self.config.model_name
-        Args:
-            url: URL to transcribe (not supported)
-            language: Optional language code (not used)
-        """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} does not support URL transcription. "
-            f"Please download the file first and use transcribe_file()."
-        )
-    async def transcribe_file(self, media_file: Union[str, Path, AudioData], language: Optional[str] = None) -> Caption:
+    def _ensure_transcriber(self):
+        """Lazily initialize the core transcriber."""
         if self._transcriber is None:
             from lattifai_core.transcription import LattifAITranscriber as CoreLattifAITranscriber
             self._transcriber = CoreLattifAITranscriber.from_pretrained(model_config=self.config)
+        return self._transcriber
-        transcription, audio_events = self._transcriber.transcribe(media_file, language=language, num_workers=2)
-        caption = Caption.from_transcription_results(
-            transcription=transcription,
-            audio_events=audio_events,
+    async def transcribe_url(self, url: str, language: Optional[str] = None) -> str:
+        """URL transcription not supported for LattifAI local models."""
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not support URL transcription. "
+            "Please download the file first and use transcribe_file()."
         )
-        return caption
+    async def transcribe_file(self, media_file: Union[str, Path, AudioData], language: Optional[str] = None) -> Caption:
+        transcriber = self._ensure_transcriber()
+        transcription, event = transcriber.transcribe(media_file, language=language, num_workers=2)
+        return Caption.from_transcription_results(transcription=transcription, event=event)
     def transcribe_numpy(
         self,
@@ -92,19 +73,12 @@ class LattifAITranscriber(BaseTranscriber):
         Returns:
             Supervision object (or list of Supervision objects) with transcription and alignment info.
         """
-        if self._transcriber is None:
-            from lattifai_core.transcription import LattifAITranscriber as CoreLattifAITranscriber
-            self._transcriber = CoreLattifAITranscriber.from_pretrained(model_config=self.config)
-        # Delegate to core transcriber which handles both single arrays and lists
-        return self._transcriber.transcribe(
+        transcriber = self._ensure_transcriber()
+        return transcriber.transcribe(
             audio, language=language, return_hypotheses=True, progress_bar=False, timestamps=True
         )[0]
-    def write(
-        self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_audio_events: bool = True
-    ) -> Path:
+    def write(self, transcript: Caption, output_file: Path, encoding: str = "utf-8", cache_event: bool = True) -> Path:
         """
         Persist transcript text to disk and return the file path.
         """
@@ -112,20 +86,8 @@ class LattifAITranscriber(BaseTranscriber):
             output_file,
             include_speaker_in_text=False,
         )
-        if cache_audio_events and transcript.audio_events:
-            from tgt import write_to_file
-            events_file = output_file.with_suffix(".AED")
-            write_to_file(transcript.audio_events, events_file, format="long")
+        if cache_event and transcript.event:
+            events_file = output_file.with_suffix(".LED")
+            transcript.event.write(events_file)
         return output_file
-    def _get_transcription_prompt(self) -> str:
-        """Get (and cache) transcription system prompt from prompts module."""
-        if self._system_prompt is not None:
-            return self._system_prompt
-        base_prompt = ""  # TODO
-        self._system_prompt = base_prompt
-        return self._system_prompt

lattifai/types.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import List, TypeAlias, Union
 from lhotse.utils import Pathlike
-from .caption import Supervision
+from lattifai.caption import Supervision
 # Path-like types
 PathLike: TypeAlias = Pathlike  # Re-export for convenience (str | Path)

lattifai/utils.py CHANGED Viewed

@@ -94,19 +94,14 @@ def _resolve_model_path(model_name_or_path: str, model_hub: str = "huggingface")
         model_name_or_path: Local path or remote model identifier.
         model_hub: Which hub to use for downloads. Supported: "huggingface", "modelscope".
     """
-    if Path(model_name_or_path).expanduser().exists():
-        return str(Path(model_name_or_path).expanduser())
+    local_path = Path(model_name_or_path).expanduser()
+    if local_path.exists():
+        return str(local_path)
-    # Normalize hub name
     hub = (model_hub or "huggingface").lower()
     if hub not in ("huggingface", "modelscope"):
         raise ValueError(f"Unsupported model_hub: {model_hub}. Supported: 'huggingface', 'modelscope'.")
-    # If local path exists, return it regardless of hub
-    if Path(model_name_or_path).expanduser().exists():
-        return str(Path(model_name_or_path).expanduser())
     if hub == "huggingface":
         from huggingface_hub import HfApi, snapshot_download
         from huggingface_hub.constants import HF_HUB_CACHE
@@ -201,9 +196,8 @@ def _select_device(device: Optional[str]) -> str:
     import torch
-    detected = "cpu"
     if torch.backends.mps.is_available():
-        detected = "mps"
-    elif torch.cuda.is_available():
-        detected = "cuda"
-    return detected
+        return "mps"
+    if torch.cuda.is_available():
+        return "cuda"
+    return "cpu"

lattifai/workflow/__init__.py CHANGED Viewed

@@ -1,8 +1,34 @@
-"""
-LattifAI Agentic Workflows
+"""LattifAI Agentic Workflows.
 This module provides agentic workflow capabilities for automated processing
 of multimedia content through intelligent agent-based pipelines.
+Key Components:
+    WorkflowAgent: Abstract base class for implementing workflow agents.
+        Provides step-based execution with retry logic, state management,
+        and consistent logging.
+    WorkflowStep: Defines individual workflow steps with timing and
+        execution status tracking.
+    WorkflowResult: Encapsulates workflow execution results including
+        status, outputs, errors, and timing information.
+    FileExistenceManager: Handles file existence conflicts during workflows,
+        supporting interactive and automatic resolution modes.
+Example:
+    >>> from lattifai.workflow import WorkflowAgent, WorkflowStep, WorkflowResult
+    >>> class MyWorkflow(WorkflowAgent):
+    ...     def define_steps(self):
+    ...         return [WorkflowStep("download"), WorkflowStep("process")]
+    ...     def execute_step(self, step, context):
+    ...         # Implementation
+    ...         pass
+See Also:
+    - lattifai.client.LattifAI: Main client that orchestrates workflows
+    - lattifai.youtube: YouTube-specific workflow integration
 """
 # Import transcript processing functionality
@@ -10,13 +36,11 @@ of multimedia content through intelligent agent-based pipelines.
 from .base import WorkflowAgent, WorkflowResult, WorkflowStep
 from .file_manager import TRANSCRIBE_CHOICE, FileExistenceManager
-from .youtube import YouTubeDownloader
 __all__ = [
     "WorkflowAgent",
     "WorkflowStep",
     "WorkflowResult",
     "FileExistenceManager",
-    "YouTubeDownloader",
     "TRANSCRIBE_CHOICE",
 ]

lattifai/workflow/file_manager.py CHANGED Viewed

@@ -1,6 +1,4 @@
-""".
-File existence management utilities for video processing workflows
-"""
+"""File existence management utilities for video processing workflows."""
 import asyncio
 import os
@@ -187,8 +185,7 @@ class FileExistenceManager:
         if not files:
             return "proceed"
-        emoji, label = FileExistenceManager.FILE_TYPE_INFO.get(file_type, ("📄", file_type.capitalize()))
-        del emoji  # Unused variable
+        _, label = FileExistenceManager.FILE_TYPE_INFO.get(file_type, ("📄", file_type.capitalize()))
         # Header with warning color
         safe_print(f'\n{colorful.bold_yellow(f"⚠️  Existing {label} files found:")}')

lattifai/youtube/__init__.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""YouTube Data Acquisition Module.
+This module provides YouTube video metadata extraction, media download,
+and caption retrieval functionality powered by yt-dlp.
+Key Components:
+    YoutubeLoader: Lightweight loader for fetching video metadata and
+        caption content in memory. Use this for quick metadata lookups
+        or when you don't need to save files to disk.
+    YouTubeDownloader: Full-featured downloader for media files and
+        captions with disk persistence. Supports various output formats
+        and quality settings.
+    VideoMetadata: Dataclass containing video information (title, duration,
+        channel, upload date, available captions, etc.).
+    CaptionTrack: Represents a single caption track with language code,
+        format, and content retrieval methods.
+Features:
+    - Proxy and cookie support for geo-restricted content
+    - Automatic caption format detection (manual vs auto-generated)
+    - Multiple audio/video format options
+    - Async and sync download APIs
+Example:
+    >>> from lattifai.youtube import YoutubeLoader, VideoMetadata
+    >>> loader = YoutubeLoader()
+    >>> metadata = loader.get_metadata("https://youtube.com/watch?v=...")
+    >>> print(metadata.title, metadata.duration)
+Requirements:
+    yt-dlp must be installed: `pip install yt-dlp`
+See Also:
+    - lattifai.client.LattifAI.youtube: High-level YouTube workflow method
+"""
+from .client import YouTubeDownloader, YoutubeLoader
+from .types import CaptionTrack, VideoMetadata
+__all__ = ["YoutubeLoader", "YouTubeDownloader", "VideoMetadata", "CaptionTrack"]

lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl