lattifai 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,159 @@
1
+ """
2
+ Gemini 2.5 Pro transcription module
3
+ """
4
+
5
+ import asyncio
6
+ from typing import Optional
7
+
8
+ # Import Google GenAI SDK
9
+ from google import genai
10
+ from google.genai.types import GenerateContentConfig, Part, ThinkingConfig
11
+
12
+ from .base import setup_workflow_logger
13
+ from .prompts import get_prompt_loader
14
+
15
+
16
class GeminiTranscriber:
    """Gemini 2.5 Pro audio/video transcription using the configured Gem prompt.

    Configuration (in __init__):
        - api_key: Gemini API key (required)

    Runtime parameters (in __call__):
        - youtube_url: YouTube URL to transcribe
    """

    # The specific Gem URL provided by the user
    GEM_URL = 'https://gemini.google.com/gem/1870ly7xvW2hU_umtv-LedGsjywT0sQiN'

    # Model identifier sent with every generate_content request
    MODEL_NAME = 'gemini-2.5-pro'

    def __init__(self, api_key: Optional[str] = None):
        """Store the API key and set up the logger and prompt loader.

        Args:
            api_key: Gemini API key. Must be a non-empty string.

        Raises:
            ValueError: If ``api_key`` is missing or empty.
        """
        # Fail fast: reject a missing key before creating any collaborators.
        if not api_key:
            raise ValueError('Gemini API key is required')

        self.api_key = api_key
        self.logger = setup_workflow_logger('gemini')
        self.prompt_loader = get_prompt_loader()

    async def __call__(self, youtube_url: str) -> str:
        """Main entry point for transcription; delegates to ``transcribe_url``."""
        return await self.transcribe_url(youtube_url)

    async def _generate_transcript(self, client, media_part) -> str:
        """Send one transcription request and return the stripped response text.

        Shared by ``transcribe_url`` and ``transcribe_file`` so both build the
        request configuration and validate the response the same way.

        Args:
            client: A ``genai.Client`` instance.
            media_part: The ``Part`` referencing the media to transcribe.

        Returns:
            The response text with surrounding whitespace removed.

        Raises:
            RuntimeError: If the response contains no text.
        """
        system_prompt = self.prompt_loader.get_gemini_transcription_prompt()

        config = GenerateContentConfig(
            system_instruction=system_prompt,
            # Only text output is requested; this setting does not control thinking.
            response_modalities=['TEXT'],
            # Thinking is configured here. NOTE(review): -1 is documented by the
            # Gemini API as "dynamic" thinking budget - confirm against SDK docs.
            thinking_config=ThinkingConfig(
                include_thoughts=False,
                thinking_budget=-1,
            ),
        )

        # generate_content is a synchronous call; run it in the default executor
        # so the event loop is not blocked while waiting for the response.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: client.models.generate_content(
                model=self.MODEL_NAME,
                contents=media_part,
                config=config,
            ),
        )

        if not response.text:
            raise RuntimeError('Empty response from Gemini API')

        transcript = response.text.strip()
        self.logger.info(f'✅ Transcription completed: {len(transcript)} characters')
        return transcript

    async def transcribe_url(self, youtube_url: str) -> str:
        """
        Transcribe audio from YouTube URL using Gemini 2.5 Pro Gem

        Args:
            youtube_url: YouTube URL to transcribe

        Returns:
            Transcribed text

        Raises:
            RuntimeError: If the SDK is unavailable, the request fails, or the
                response is empty. The original exception is chained as the cause.
        """
        self.logger.info(f'🎤 Starting Gemini transcription for: {youtube_url}')

        try:
            # Initialize client
            client = genai.Client(api_key=self.api_key)

            self.logger.info('🔄 Sending request to Gemini 2.5 Pro...')
            media_part = Part.from_uri(file_uri=youtube_url, mime_type='video/*')
            return await self._generate_transcript(client, media_part)

        except ImportError as e:
            raise RuntimeError(
                'Google GenAI SDK not installed. Please install with: pip install google-genai'
            ) from e
        except Exception as e:
            self.logger.error(f'Gemini transcription failed: {e}')
            # Chain the cause so the underlying failure stays in the traceback.
            raise RuntimeError(f'Gemini transcription failed: {e}') from e

    async def transcribe_file(self, media_file_path: str) -> str:
        """
        Transcribe audio/video from local file using Gemini 2.5 Pro

        Args:
            media_file_path: Path to local audio/video file

        Returns:
            Transcribed text

        Raises:
            RuntimeError: If the SDK is unavailable, the upload or request fails,
                or the response is empty. The original exception is chained as
                the cause.
        """
        self.logger.info(f'🎤 Starting Gemini transcription for file: {media_file_path}')

        try:
            # Initialize client
            client = genai.Client(api_key=self.api_key)

            # Upload the media file. The upload is a blocking network call, so it
            # is offloaded to the executor like the generation request.
            self.logger.info('📤 Uploading audio file to Gemini...')
            loop = asyncio.get_running_loop()
            media_file = await loop.run_in_executor(
                None,
                # The SDK's upload keyword is ``file`` (``path`` is not accepted
                # by current google-genai releases).
                lambda: client.files.upload(file=media_file_path),
            )

            self.logger.info('🔄 Sending transcription request...')
            media_part = Part.from_uri(file_uri=media_file.uri, mime_type=media_file.mime_type)
            return await self._generate_transcript(client, media_part)

        except ImportError as e:
            raise RuntimeError(
                'Google GenAI SDK not installed. Please install with: pip install google-genai'
            ) from e
        except Exception as e:
            self.logger.error(f'Gemini transcription failed: {e}')
            # Chain the cause so the underlying failure stays in the traceback.
            raise RuntimeError(f'Gemini transcription failed: {e}') from e

    def get_gem_info(self) -> dict:
        """Get information about the Gem being used"""
        return {
            'gem_name': 'Audio Transcription Gem',
            'gem_url': self.GEM_URL,
            'model': 'Gemini 2.5 Pro',
            'description': 'Specialized Gem for media content transcribe',
        }