lattifai 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/bin/agent.py +4 -3
- lattifai/io/reader.py +4 -1
- lattifai/io/text_parser.py +75 -0
- lattifai/tokenizer/tokenizer.py +10 -1
- lattifai/workflows/__init__.py +34 -0
- lattifai/workflows/agents.py +10 -0
- lattifai/workflows/base.py +192 -0
- lattifai/workflows/file_manager.py +812 -0
- lattifai/workflows/gemini.py +159 -0
- lattifai/workflows/youtube.py +931 -0
- {lattifai-0.4.0.dist-info → lattifai-0.4.2.dist-info}/METADATA +6 -7
- {lattifai-0.4.0.dist-info → lattifai-0.4.2.dist-info}/RECORD +16 -9
- {lattifai-0.4.0.dist-info → lattifai-0.4.2.dist-info}/top_level.txt +0 -1
- {lattifai-0.4.0.dist-info → lattifai-0.4.2.dist-info}/WHEEL +0 -0
- {lattifai-0.4.0.dist-info → lattifai-0.4.2.dist-info}/entry_points.txt +0 -0
- {lattifai-0.4.0.dist-info → lattifai-0.4.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gemini 2.5 Pro transcription module
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
# Import Google GenAI SDK
|
|
9
|
+
from google import genai
|
|
10
|
+
from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
|
|
11
|
+
|
|
12
|
+
from .base import setup_workflow_logger
|
|
13
|
+
from .prompts import get_prompt_loader
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GeminiTranscriber:
    """Transcribe media with Gemini 2.5 Pro via the Google GenAI SDK.

    Configuration (``__init__``):
        api_key: Gemini API key (required).

    Runtime (``__call__`` / ``transcribe_url``):
        youtube_url: YouTube URL to transcribe.

    Local files are handled by ``transcribe_file``, which uploads the file
    first and then transcribes the uploaded copy.
    """

    # The specific Gem URL provided by the user (informational only; the
    # requests below go to the model directly, not through this URL).
    GEM_URL = 'https://gemini.google.com/gem/1870ly7xvW2hU_umtv-LedGsjywT0sQiN'

    # Model identifier sent with every generate_content request.
    MODEL_ID = 'gemini-2.5-pro'

    def __init__(self, api_key: Optional[str] = None):
        """Store the API key and set up the logger and prompt loader.

        Args:
            api_key: Gemini API key.

        Raises:
            ValueError: If ``api_key`` is missing or empty.
        """
        # Validate first so a misconfigured caller fails before any logger or
        # prompt-loader setup is attempted.
        if not api_key:
            raise ValueError('Gemini API key is required')

        self.api_key = api_key
        self.logger = setup_workflow_logger('gemini')
        self.prompt_loader = get_prompt_loader()

    async def __call__(self, youtube_url: str) -> str:
        """Main entry point for transcription; delegates to ``transcribe_url``."""
        return await self.transcribe_url(youtube_url)

    def _build_config(self):
        """Build the generation config shared by URL and file transcription.

        The system instruction is the transcription prompt supplied by the
        prompt loader.
        """
        system_prompt = self.prompt_loader.get_gemini_transcription_prompt()
        return GenerateContentConfig(
            system_instruction=system_prompt,
            # Only text output is requested.
            response_modalities=['TEXT'],
            # Thinking is controlled here, not by the response modalities:
            # thoughts are not returned, and a budget of -1 presumably lets
            # the model pick its own budget (see the Gemini thinking docs).
            thinking_config=ThinkingConfig(
                include_thoughts=False,
                thinking_budget=-1,
            ),
        )

    async def _generate_transcript(self, client, content, config) -> str:
        """Run one generate_content request and return the stripped text.

        Args:
            client: ``genai.Client`` used for the request.
            content: ``Part`` describing the media to transcribe.
            config: Generation config from ``_build_config``.

        Raises:
            RuntimeError: If the response carries no text.
        """
        # The SDK call is synchronous; run it in the default executor so the
        # event loop stays responsive while the request is in flight.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: client.models.generate_content(
                model=self.MODEL_ID,
                contents=content,
                config=config,
            ),
        )

        if not response.text:
            raise RuntimeError('Empty response from Gemini API')

        transcript = response.text.strip()
        self.logger.info(f'✅ Transcription completed: {len(transcript)} characters')
        return transcript

    async def transcribe_url(self, youtube_url: str) -> str:
        """Transcribe audio from a YouTube URL using Gemini 2.5 Pro.

        Args:
            youtube_url: YouTube URL to transcribe.

        Returns:
            Transcribed text with surrounding whitespace stripped.

        Raises:
            RuntimeError: On any failure (SDK error, empty response, ...); the
                original exception is attached as ``__cause__``.
        """
        self.logger.info(f'🎤 Starting Gemini transcription for: {youtube_url}')

        try:
            client = genai.Client(api_key=self.api_key)
            config = self._build_config()

            self.logger.info('🔄 Sending request to Gemini 2.5 Pro...')
            content = Part.from_uri(file_uri=youtube_url, mime_type='video/*')
            return await self._generate_transcript(client, content, config)

        except ImportError as e:
            raise RuntimeError(
                'Google GenAI SDK not installed. Please install with: pip install google-genai'
            ) from e
        except Exception as e:
            self.logger.error(f'Gemini transcription failed: {str(e)}')
            raise RuntimeError(f'Gemini transcription failed: {str(e)}') from e

    async def transcribe_file(self, media_file_path: str) -> str:
        """Transcribe audio/video from a local file using Gemini 2.5 Pro.

        The file is uploaded through the SDK's Files API and the uploaded
        copy's URI and MIME type are passed to the model.

        Args:
            media_file_path: Path to the local media file.

        Returns:
            Transcribed text with surrounding whitespace stripped.

        Raises:
            RuntimeError: On any failure (upload error, SDK error, empty
                response, ...); the original exception is attached as
                ``__cause__``.
        """
        self.logger.info(f'🎤 Starting Gemini transcription for file: {media_file_path}')

        try:
            client = genai.Client(api_key=self.api_key)
            # Build the config before uploading so a prompt-loading failure
            # surfaces before any upload is attempted.
            config = self._build_config()

            self.logger.info('📤 Uploading audio file to Gemini...')
            # google-genai >= 1.0 names this keyword ``file`` (earlier
            # releases used ``path``). The upload is blocking, so it runs in
            # the executor instead of stalling the event loop.
            loop = asyncio.get_running_loop()
            media_file = await loop.run_in_executor(
                None,
                lambda: client.files.upload(file=media_file_path),
            )
            # NOTE(review): large uploads may need to finish server-side
            # processing before they can be referenced - confirm whether
            # polling the file state is required for the media sizes used.

            self.logger.info('🔄 Sending transcription request...')
            content = Part.from_uri(file_uri=media_file.uri, mime_type=media_file.mime_type)
            return await self._generate_transcript(client, content, config)

        except ImportError as e:
            raise RuntimeError(
                'Google GenAI SDK not installed. Please install with: pip install google-genai'
            ) from e
        except Exception as e:
            self.logger.error(f'Gemini transcription failed: {str(e)}')
            raise RuntimeError(f'Gemini transcription failed: {str(e)}') from e

    def get_gem_info(self) -> dict:
        """Return static descriptive metadata about the Gem being used."""
        return {
            'gem_name': 'Audio Transcription Gem',
            'gem_url': self.GEM_URL,
            'model': 'Gemini 2.5 Pro',
            'description': 'Specialized Gem for media content transcribe',
        }
|