lattifai 0.4.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. lattifai/__init__.py +42 -27
  2. lattifai/alignment/__init__.py +6 -0
  3. lattifai/alignment/lattice1_aligner.py +119 -0
  4. lattifai/{workers/lattice1_alpha.py → alignment/lattice1_worker.py} +33 -132
  5. lattifai/{tokenizer → alignment}/phonemizer.py +1 -1
  6. lattifai/alignment/segmenter.py +166 -0
  7. lattifai/{tokenizer → alignment}/tokenizer.py +186 -112
  8. lattifai/audio2.py +211 -0
  9. lattifai/caption/__init__.py +20 -0
  10. lattifai/caption/caption.py +1275 -0
  11. lattifai/{io → caption}/supervision.py +1 -0
  12. lattifai/{io → caption}/text_parser.py +53 -10
  13. lattifai/cli/__init__.py +17 -0
  14. lattifai/cli/alignment.py +153 -0
  15. lattifai/cli/caption.py +204 -0
  16. lattifai/cli/server.py +19 -0
  17. lattifai/cli/transcribe.py +197 -0
  18. lattifai/cli/youtube.py +128 -0
  19. lattifai/client.py +455 -246
  20. lattifai/config/__init__.py +20 -0
  21. lattifai/config/alignment.py +73 -0
  22. lattifai/config/caption.py +178 -0
  23. lattifai/config/client.py +46 -0
  24. lattifai/config/diarization.py +67 -0
  25. lattifai/config/media.py +335 -0
  26. lattifai/config/transcription.py +84 -0
  27. lattifai/diarization/__init__.py +5 -0
  28. lattifai/diarization/lattifai.py +89 -0
  29. lattifai/errors.py +41 -34
  30. lattifai/logging.py +116 -0
  31. lattifai/mixin.py +552 -0
  32. lattifai/server/app.py +420 -0
  33. lattifai/transcription/__init__.py +76 -0
  34. lattifai/transcription/base.py +108 -0
  35. lattifai/transcription/gemini.py +219 -0
  36. lattifai/transcription/lattifai.py +103 -0
  37. lattifai/types.py +30 -0
  38. lattifai/utils.py +3 -31
  39. lattifai/workflow/__init__.py +22 -0
  40. lattifai/workflow/agents.py +6 -0
  41. lattifai/{workflows → workflow}/file_manager.py +81 -57
  42. lattifai/workflow/youtube.py +564 -0
  43. lattifai-1.0.0.dist-info/METADATA +736 -0
  44. lattifai-1.0.0.dist-info/RECORD +52 -0
  45. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/WHEEL +1 -1
  46. lattifai-1.0.0.dist-info/entry_points.txt +13 -0
  47. lattifai/base_client.py +0 -126
  48. lattifai/bin/__init__.py +0 -3
  49. lattifai/bin/agent.py +0 -324
  50. lattifai/bin/align.py +0 -295
  51. lattifai/bin/cli_base.py +0 -25
  52. lattifai/bin/subtitle.py +0 -210
  53. lattifai/io/__init__.py +0 -43
  54. lattifai/io/reader.py +0 -86
  55. lattifai/io/utils.py +0 -15
  56. lattifai/io/writer.py +0 -102
  57. lattifai/tokenizer/__init__.py +0 -3
  58. lattifai/workers/__init__.py +0 -3
  59. lattifai/workflows/__init__.py +0 -34
  60. lattifai/workflows/agents.py +0 -12
  61. lattifai/workflows/gemini.py +0 -167
  62. lattifai/workflows/prompts/README.md +0 -22
  63. lattifai/workflows/prompts/gemini/README.md +0 -24
  64. lattifai/workflows/prompts/gemini/transcription_gem.txt +0 -81
  65. lattifai/workflows/youtube.py +0 -931
  66. lattifai-0.4.6.dist-info/METADATA +0 -806
  67. lattifai-0.4.6.dist-info/RECORD +0 -39
  68. lattifai-0.4.6.dist-info/entry_points.txt +0 -3
  69. /lattifai/{io → caption}/gemini_reader.py +0 -0
  70. /lattifai/{io → caption}/gemini_writer.py +0 -0
  71. /lattifai/{workflows → transcription}/prompts/__init__.py +0 -0
  72. /lattifai/{workflows → workflow}/base.py +0 -0
  73. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/licenses/LICENSE +0 -0
  74. {lattifai-0.4.6.dist-info → lattifai-1.0.0.dist-info}/top_level.txt +0 -0
lattifai/io/writer.py DELETED
@@ -1,102 +0,0 @@
1
- import json
2
- from abc import ABCMeta
3
- from typing import Any, List, Optional
4
-
5
- import pysubs2
6
- from lhotse.supervision import AlignmentItem
7
- from lhotse.utils import Pathlike
8
-
9
- from .reader import Supervision
10
-
11
-
12
- class SubtitleWriter(ABCMeta):
13
- """Class for writing subtitle files with optional word-level alignment."""
14
-
15
- @classmethod
16
- def write(cls, alignments: List[Supervision], output_path: Pathlike) -> Pathlike:
17
- if str(output_path)[-4:].lower() == ".txt":
18
- with open(output_path, "w", encoding="utf-8") as f:
19
- for sup in alignments:
20
- word_items = parse_alignment_from_supervision(sup)
21
- if word_items:
22
- for item in word_items:
23
- f.write(f"[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n")
24
- else:
25
- text = f"{sup.speaker} {sup.text}" if sup.speaker is not None else sup.text
26
- f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")
27
-
28
- elif str(output_path)[-5:].lower() == ".json":
29
- with open(output_path, "w", encoding="utf-8") as f:
30
- # Enhanced JSON export with word-level alignment
31
- json_data = []
32
- for sup in alignments:
33
- sup_dict = sup.to_dict()
34
- json_data.append(sup_dict)
35
- json.dump(json_data, f, ensure_ascii=False, indent=4)
36
- elif str(output_path).lower().endswith(".textgrid"):
37
- from tgt import Interval, IntervalTier, TextGrid, write_to_file
38
-
39
- tg = TextGrid()
40
- supervisions, words, scores = [], [], {"utterances": [], "words": []}
41
- for supervision in sorted(alignments, key=lambda x: x.start):
42
- text = (
43
- f"{supervision.speaker} {supervision.text}" if supervision.speaker is not None else supervision.text
44
- )
45
- supervisions.append(Interval(supervision.start, supervision.end, text or ""))
46
- # Extract word-level alignment using helper function
47
- word_items = parse_alignment_from_supervision(supervision)
48
- if word_items:
49
- for item in word_items:
50
- words.append(Interval(item.start, item.end, item.symbol))
51
- if item.score is not None:
52
- scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))
53
- if supervision.has_custom("score"):
54
- scores["utterances"].append(
55
- Interval(supervision.start, supervision.end, f"{supervision.score:.2f}")
56
- )
57
-
58
- tg.add_tier(IntervalTier(name="utterances", objects=supervisions))
59
- if words:
60
- tg.add_tier(IntervalTier(name="words", objects=words))
61
-
62
- if scores["utterances"]:
63
- tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
64
- if scores["words"]:
65
- tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
66
-
67
- write_to_file(tg, output_path, format="long")
68
- else:
69
- subs = pysubs2.SSAFile()
70
- for sup in alignments:
71
- # Add word-level timing as metadata in the subtitle text
72
- word_items = parse_alignment_from_supervision(sup)
73
- if word_items:
74
- for word in word_items:
75
- subs.append(
76
- pysubs2.SSAEvent(start=int(word.start * 1000), end=int(word.end * 1000), text=word.symbol)
77
- )
78
- else:
79
- text = f"{sup.speaker} {sup.text}" if sup.speaker is not None else sup.text
80
- subs.append(pysubs2.SSAEvent(start=int(sup.start * 1000), end=int(sup.end * 1000), text=text or ""))
81
- subs.save(output_path)
82
-
83
- return output_path
84
-
85
-
86
- def parse_alignment_from_supervision(supervision: Any) -> Optional[List[AlignmentItem]]:
87
- """
88
- Extract word-level alignment items from Supervision object.
89
-
90
- Args:
91
- supervision: Supervision object with potential alignment data
92
-
93
- Returns:
94
- List of AlignmentItem objects, or None if no alignment data present
95
- """
96
- if not hasattr(supervision, "alignment") or not supervision.alignment:
97
- return None
98
-
99
- if "word" not in supervision.alignment:
100
- return None
101
-
102
- return supervision.alignment["word"]
@@ -1,3 +0,0 @@
1
- from .tokenizer import AsyncLatticeTokenizer, LatticeTokenizer
2
-
3
- __all__ = ["LatticeTokenizer", "AsyncLatticeTokenizer"]
@@ -1,3 +0,0 @@
1
- from .lattice1_alpha import Lattice1AlphaWorker
2
-
3
- __all__ = ["Lattice1AlphaWorker"]
@@ -1,34 +0,0 @@
1
- """
2
- LattifAI Agentic Workflows
3
-
4
- This module provides agentic workflow capabilities for automated processing
5
- of multimedia content through intelligent agent-based pipelines.
6
- """
7
-
8
- # Import transcript processing functionality
9
- from lattifai.io import (
10
- ALL_SUBTITLE_FORMATS,
11
- INPUT_SUBTITLE_FORMATS,
12
- OUTPUT_SUBTITLE_FORMATS,
13
- SUBTITLE_FORMATS,
14
- GeminiReader,
15
- GeminiWriter,
16
- )
17
-
18
- from .agents import YouTubeSubtitleAgent
19
- from .base import WorkflowAgent, WorkflowResult, WorkflowStep
20
- from .file_manager import FileExistenceManager
21
-
22
- __all__ = [
23
- "WorkflowAgent",
24
- "WorkflowStep",
25
- "WorkflowResult",
26
- "YouTubeSubtitleAgent",
27
- "FileExistenceManager",
28
- "GeminiReader",
29
- "GeminiWriter",
30
- "SUBTITLE_FORMATS",
31
- "INPUT_SUBTITLE_FORMATS",
32
- "OUTPUT_SUBTITLE_FORMATS",
33
- "ALL_SUBTITLE_FORMATS",
34
- ]
@@ -1,12 +0,0 @@
1
- """
2
- Subtitle Agents
3
-
4
- An agentic workflow for processing YouTube(or more) videos through:
5
- 1. URL processing and audio download
6
- 2. Gemini 2.5 Pro transcription
7
- 3. LattifAI alignment
8
- """
9
-
10
- from .youtube import YouTubeSubtitleAgent
11
-
12
- __all__ = ["YouTubeSubtitleAgent"]
@@ -1,167 +0,0 @@
1
- """
2
- Gemini 2.5 Pro transcription module
3
- """
4
-
5
- import asyncio
6
- from typing import Optional
7
-
8
- # Import Google GenAI SDK
9
- from google import genai
10
- from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
11
-
12
- from .base import setup_workflow_logger
13
- from .prompts import get_prompt_loader
14
-
15
-
16
- class GeminiTranscriber:
17
- """Gemini 2.5 Pro audio transcription using the specified Gem
18
-
19
- Configuration (in __init__):
20
- - api_key: Gemini API key (required)
21
-
22
- Runtime parameters (in __call__):
23
- - youtube_url: YouTube URL to transcribe
24
- """
25
-
26
- # The specific Gem URL provided by the user
27
- GEM_URL = "https://gemini.google.com/gem/1870ly7xvW2hU_umtv-LedGsjywT0sQiN"
28
-
29
- def __init__(self, api_key: Optional[str] = None):
30
- self.api_key = api_key
31
- self.logger = setup_workflow_logger("gemini")
32
- self.prompt_loader = get_prompt_loader()
33
-
34
- if not self.api_key:
35
- self.logger.warning(
36
- "⚠️ Gemini API key not provided. API key will be required when calling transcription methods."
37
- )
38
-
39
- async def __call__(self, youtube_url: str) -> str:
40
- """Main entry point for transcription"""
41
- return await self.transcribe_url(youtube_url)
42
-
43
- async def transcribe_url(self, youtube_url: str) -> str:
44
- """
45
- Transcribe audio from YouTube URL using Gemini 2.5 Pro Gem
46
-
47
- Args:
48
- youtube_url: YouTube URL to transcribe
49
-
50
- Returns:
51
- Transcribed text
52
- """
53
- if not self.api_key:
54
- raise ValueError("Gemini API key is required for transcription")
55
-
56
- self.logger.info(f"🎤 Starting Gemini transcription for: {youtube_url}")
57
-
58
- try:
59
- # Initialize client
60
- client = genai.Client(api_key=self.api_key)
61
-
62
- # Load prompt from Gem configuration
63
- system_prompt = self.prompt_loader.get_gemini_transcription_prompt()
64
-
65
- # Generate transcription with extended thinking
66
- self.logger.info("🔄 Sending request to Gemini 2.5 Pro...")
67
- config = GenerateContentConfig(
68
- system_instruction=system_prompt,
69
- # Enable thinking by including it in response modalities
70
- response_modalities=["TEXT"],
71
- thinking_config=ThinkingConfig(
72
- include_thoughts=False,
73
- thinking_budget=-1,
74
- ),
75
- )
76
- response = await asyncio.get_event_loop().run_in_executor(
77
- None,
78
- lambda: client.models.generate_content(
79
- model="gemini-2.5-pro",
80
- contents=Part.from_uri(file_uri=youtube_url, mime_type="video/*"),
81
- config=config,
82
- ),
83
- )
84
-
85
- if not response.text:
86
- raise RuntimeError("Empty response from Gemini API")
87
-
88
- transcript = response.text.strip()
89
-
90
- self.logger.info(f"✅ Transcription completed: {len(transcript)} characters")
91
- return transcript
92
-
93
- except ImportError:
94
- raise RuntimeError("Google GenAI SDK not installed. Please install with: pip install google-genai")
95
- except Exception as e:
96
- self.logger.error(f"Gemini transcription failed: {str(e)}")
97
- raise RuntimeError(f"Gemini transcription failed: {str(e)}")
98
-
99
- async def transcribe_file(self, media_file_path: str) -> str:
100
- """
101
- Transcribe audio/video from local file using Gemini 2.5 Pro
102
-
103
- Args:
104
- media_file_path: Path to local audio file
105
-
106
- Returns:
107
- Transcribed text
108
- """
109
- if not self.api_key:
110
- raise ValueError("Gemini API key is required for transcription")
111
-
112
- self.logger.info(f"🎤 Starting Gemini transcription for file: {media_file_path}")
113
-
114
- try:
115
- # Initialize client
116
- client = genai.Client(api_key=self.api_key)
117
-
118
- # Load prompt from Gem configuration
119
- system_prompt = self.prompt_loader.get_gemini_transcription_prompt()
120
-
121
- # Upload audio file
122
- self.logger.info("📤 Uploading audio file to Gemini...")
123
- media_file = client.files.upload(path=media_file_path)
124
-
125
- # Generate transcription with extended thinking
126
- # Note: For thinking mode, you may want to use 'gemini-2.0-flash-thinking-exp' or similar models
127
- self.logger.info("🔄 Sending transcription request...")
128
- config = GenerateContentConfig(
129
- system_instruction=system_prompt,
130
- # Enable thinking by including it in response modalities
131
- response_modalities=["TEXT"],
132
- thinking_config=ThinkingConfig(
133
- include_thoughts=False,
134
- thinking_budget=-1,
135
- ),
136
- )
137
- response = await asyncio.get_event_loop().run_in_executor(
138
- None,
139
- lambda: client.models.generate_content(
140
- model="gemini-2.5-pro",
141
- contents=Part.from_uri(file_uri=media_file.uri, mime_type=media_file.mime_type),
142
- config=config,
143
- ),
144
- )
145
-
146
- if not response.text:
147
- raise RuntimeError("Empty response from Gemini API")
148
-
149
- transcript = response.text.strip()
150
-
151
- self.logger.info(f"✅ Transcription completed: {len(transcript)} characters")
152
- return transcript
153
-
154
- except ImportError:
155
- raise RuntimeError("Google GenAI SDK not installed. Please install with: pip install google-genai")
156
- except Exception as e:
157
- self.logger.error(f"Gemini transcription failed: {str(e)}")
158
- raise RuntimeError(f"Gemini transcription failed: {str(e)}")
159
-
160
- def get_gem_info(self) -> dict:
161
- """Get information about the Gem being used"""
162
- return {
163
- "gem_name": "Audio Transcription Gem",
164
- "gem_url": self.GEM_URL,
165
- "model": "Gemini 2.5 Pro",
166
- "description": "Specialized Gem for media content transcribe",
167
- }
@@ -1,22 +0,0 @@
1
- # Workflow Prompts
2
-
3
- This directory contains system prompts and instructions for various AI models used in workflows.
4
-
5
- ## Structure
6
-
7
- - `gemini/` - Prompts for Gemini models
8
- - `transcription_gem.txt` - System prompt from the Audio Transcription Gem
9
-
10
- ## Usage
11
-
12
- Prompts are loaded automatically by their respective workflow modules. To update a prompt:
13
-
14
- 1. Edit the corresponding `.txt` file
15
- 2. The changes will be picked up on the next workflow run
16
- 3. No code changes needed
17
-
18
- ## Adding New Prompts
19
-
20
- 1. Create a subdirectory for the model/service (e.g., `openai/`, `anthropic/`)
21
- 2. Add prompt files as `.txt` files with descriptive names
22
- 3. Update the loader in the corresponding workflow module
@@ -1,24 +0,0 @@
1
- # Gemini Workflow Prompts
2
-
3
- This directory contains system prompts for Gemini-based workflows.
4
-
5
- ## Files
6
-
7
- ### `transcription_gem.txt` [@dotey](https://x.com/dotey/status/1971810075867046131)
8
-
9
- System prompt extracted from the Audio/Video Transcription Gem:
10
- - **Gem URL**: https://gemini.google.com/gem/1870ly7xvW2hU_umtv-LedGsjywT0sQiN
11
- - **Model**: Gemini 2.5 Pro
12
- - **Purpose**: Audio/Video transcription with accuracy and natural formatting
13
-
14
- ## Updating the Prompt
15
-
16
- To update the transcription behavior:
17
-
18
- 1. Edit `transcription_gem.txt` directly
19
- 2. Changes take effect on the next workflow run
20
- 3. No code changes needed
21
-
22
- ## Gem Information
23
-
24
- The original Gem configuration is preserved in the `GeminiTranscriber.GEM_URL` constant and can be accessed via the `get_gem_info()` method.
@@ -1,81 +0,0 @@
1
- # Role
2
- You are an expert transcript specialist. Your task is to create a perfectly structured, verbatim transcript of a video.
3
-
4
- # Objective
5
- Produce a single, cohesive output containing the parts in this order:
6
- 1. A Video Title
7
- 2. A **Table of Contents (ToC)**
8
- 3. The **full, chapter-segmented transcript**
9
-
10
- * Use the same language as the transcription for the Title and ToC.
11
-
12
- # Critical Instructions
13
-
14
- ## 1. Transcription Fidelity: Verbatim & Untranslated
15
- * Transcribe every spoken word exactly as you hear it, including filler words (`um`, `uh`, `like`) and stutters.
16
- * **NEVER translate.** If the audio is in Chinese, transcribe in Chinese. If it mixes languages (e.g., "这个 feature 很酷"), your transcript must replicate that mix exactly.
17
-
18
- ## 2. Speaker Identification
19
- * **Priority 1: Use metadata.** Analyze the video's title and description first to identify and match speaker names.
20
- * **Priority 2: Use audio content.** If names are not in the metadata, listen for introductions or how speakers address each other.
21
- * **Fallback:** If a name remains unknown, use a generic but consistent label (`**Speaker 1:**`, `**Host:**`, etc.).
22
- * **Consistency is key:** If a speaker's name is revealed later, you must go back and update all previous labels for that speaker.
23
-
24
- ## 3. Chapter Generation Strategy
25
- * **For YouTube Links:** First, check if the video description contains a list of chapters. If so, use that as the primary basis for segmenting the transcript.
26
- * **For all other videos (or if no chapters exist on YouTube):** Create chapters based on significant shifts in topic or conversation flow.
27
-
28
- ## 4. Output Structure & Formatting
29
-
30
- * **Timestamp Format**
31
- * All timestamps throughout the entire output MUST use the exact `[HH:MM:SS]` format (e.g., `[00:01:23]`). Milliseconds are forbidden.
32
-
33
- * **Table of Contents (ToC)**
34
- * Must be the very first thing in your output, under a `## Table of Contents` heading.
35
- * Format for each entry: `* [HH:MM:SS] Chapter Title`
36
-
37
- * **Chapters**
38
- * Start each chapter with a heading in this format: `## [HH:MM:SS] Chapter Title`
39
- * Use two blank lines to separate the end of one chapter from the heading of the next.
40
-
41
- * **Dialogue Paragraphs (VERY IMPORTANT)**
42
- * **Speaker Turns:** The first paragraph of a speaker's turn must begin with `**Speaker Name:** `.
43
- * **Paragraph Splitting:** For a long continuous block of speech from a single speaker, split it into smaller, logical paragraphs (roughly 2-4 sentences). Separate these paragraphs with a single blank line. Subsequent consecutive paragraphs from the *same speaker* should NOT repeat the `**Speaker Name:** ` label.
44
- * **Timestamp Rule:** Every single paragraph MUST end with exactly one timestamp. The timestamp must be placed at the very end of the paragraph's text.
45
- * ❌ **WRONG:** `**Host:** Welcome back. [00:00:01] Today we have a guest. [00:00:02]`
46
- * ❌ **WRONG:** `**Jane Doe:** The study is complex. We tracked two groups over five years to see the effects. [00:00:18] And the results were surprising.`
47
- * ✅ **CORRECT:** `**Host:** Welcome back. Today we have a guest. [00:00:02]`
48
- * ✅ **CORRECT (for a long monologue):**
49
- `**Jane Doe:** The study is complex. We tracked two groups over a five-year period to see the long-term effects. [00:00:18]
50
-
51
- And the results, well, they were quite surprising to the entire team. [00:00:22]`
52
-
53
- * **Non-Speech Audio**
54
- * Describe significant sounds like `[Laughter]` or `[Music starts]`, each on its own line with its own timestamp: `[Event description] [HH:MM:SS]`
55
-
56
- ---
57
- ### Example of Correct Output
58
-
59
- ## Table of Contents
60
- * [00:00:00] Introduction and Welcome
61
- * [00:00:12] Overview of the New Research
62
-
63
- ## [00:00:00] Introduction and Welcome
64
-
65
- **Host:** Welcome back to the show. Today, we have a, uh, very special guest, Jane Doe. [00:00:01]
66
-
67
- **Jane Doe:** Thank you for having me. I'm excited to be here and discuss the findings. [00:00:05]
68
-
69
- ## [00:00:12] Overview of the New Research
70
-
71
- **Host:** So, Jane, before we get into the nitty-gritty, could you, you know, give us a brief overview for our audience? [00:00:14]
72
-
73
- **Jane Doe:** Of course. The study focuses on the long-term effects of specific dietary changes. It's a bit complicated but essentially we tracked two large groups over a five-year period. [00:00:21]
74
-
75
- The first group followed the new regimen, while the second group, our control, maintained a traditional diet. This allowed us to isolate variables effectively. [00:00:28]
76
-
77
- [Laughter] [00:00:29]
78
-
79
- **Host:** Fascinating. And what did you find? [00:00:31]
80
- ---
81
- Begin transcription now. Adhere to all rules with absolute precision.