isa-model 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- isa_model/core/model_manager.py +69 -4
- isa_model/inference/ai_factory.py +335 -46
- isa_model/inference/billing_tracker.py +406 -0
- isa_model/inference/providers/base_provider.py +51 -4
- isa_model/inference/providers/ollama_provider.py +37 -18
- isa_model/inference/providers/openai_provider.py +65 -36
- isa_model/inference/providers/replicate_provider.py +42 -30
- isa_model/inference/services/audio/base_stt_service.py +21 -2
- isa_model/inference/services/audio/openai_realtime_service.py +353 -0
- isa_model/inference/services/audio/openai_stt_service.py +252 -0
- isa_model/inference/services/audio/openai_tts_service.py +48 -9
- isa_model/inference/services/audio/replicate_tts_service.py +239 -0
- isa_model/inference/services/base_service.py +36 -1
- isa_model/inference/services/embedding/openai_embed_service.py +223 -0
- isa_model/inference/services/llm/base_llm_service.py +88 -192
- isa_model/inference/services/llm/llm_adapter.py +459 -0
- isa_model/inference/services/llm/ollama_llm_service.py +111 -185
- isa_model/inference/services/llm/openai_llm_service.py +115 -360
- isa_model/inference/services/vision/helpers/image_utils.py +4 -3
- isa_model/inference/services/vision/ollama_vision_service.py +11 -3
- isa_model/inference/services/vision/openai_vision_service.py +275 -41
- isa_model/inference/services/vision/replicate_image_gen_service.py +233 -205
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/METADATA +1 -1
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/RECORD +26 -21
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/WHEEL +0 -0
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/openai_vision_service.py

@@ -1,80 +1,314 @@
-from typing import Dict, Any, Union
+from typing import Dict, Any, Union, List, Optional, BinaryIO
+import base64
+import aiohttp
 from openai import AsyncOpenAI
 from tenacity import retry, stop_after_attempt, wait_exponential
-from isa_model.inference.services.
+from isa_model.inference.services.vision.base_vision_service import BaseVisionService
 from isa_model.inference.providers.base_provider import BaseProvider
-from .
+from isa_model.inference.billing_tracker import ServiceType
 import logging
 
 logger = logging.getLogger(__name__)
 
-class OpenAIVisionService(
-    """Vision
+class OpenAIVisionService(BaseVisionService):
+    """OpenAI Vision service using gpt-4.1-nano with vision capabilities"""
 
-    def __init__(self, provider: 'BaseProvider', model_name: str):
+    def __init__(self, provider: 'BaseProvider', model_name: str = "gpt-4.1-nano"):
         super().__init__(provider, model_name)
-
-
-
-
-
-
-
+
+        # Get full configuration from provider (including sensitive data)
+        provider_config = provider.get_full_config()
+
+        # Initialize AsyncOpenAI client with provider configuration
+        try:
+            if not provider_config.get("api_key"):
+                raise ValueError("OpenAI API key not found in provider configuration")
+
+            self._client = AsyncOpenAI(
+                api_key=provider_config["api_key"],
+                base_url=provider_config.get("base_url", "https://api.openai.com/v1"),
+                organization=provider_config.get("organization")
+            )
+
+            logger.info(f"Initialized OpenAIVisionService with model {self.model_name}")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize OpenAI client: {e}")
+            raise ValueError(f"Failed to initialize OpenAI client. Check your API key configuration: {e}") from e
+
+        self.max_tokens = provider_config.get('max_tokens', 1000)
+        self.temperature = provider_config.get('temperature', 0.7)
 
     @property
     def client(self) -> AsyncOpenAI:
-        """
+        """Get the underlying OpenAI client"""
         return self._client
 
+    async def _download_image(self, image_url: str) -> bytes:
+        """Download image from URL"""
+        async with aiohttp.ClientSession() as session:
+            async with session.get(image_url) as response:
+                if response.status == 200:
+                    return await response.read()
+                else:
+                    raise ValueError(f"Failed to download image from {image_url}: {response.status}")
+
+    def _encode_image(self, image_path_or_data: Union[str, bytes, BinaryIO]) -> str:
+        """Encode image to base64"""
+        if isinstance(image_path_or_data, str):
+            # If it's a file path
+            with open(image_path_or_data, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        elif hasattr(image_path_or_data, 'read'):
+            # If it's a file-like object (BinaryIO)
+            data = image_path_or_data.read()  # type: ignore
+            if isinstance(data, bytes):
+                return base64.b64encode(data).decode("utf-8")
+            else:
+                raise ValueError("File-like object did not return bytes")
+        else:
+            # If it's bytes data
+            return base64.b64encode(image_path_or_data).decode("utf-8")  # type: ignore
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=4, max=10),
         reraise=True
     )
-    async def analyze_image(
-
+    async def analyze_image(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> Dict[str, Any]:
+        """
+        Analyze image and provide description or answer questions
 
         Args:
-
-
+            image: Path to image file, URL, or image data
+            prompt: Optional text prompt/question about the image
+            max_tokens: Maximum tokens in response
 
         Returns:
-
+            Dict containing analysis results
         """
         try:
-            #
-            if isinstance(
-
-
-
+            # Handle different input types
+            if isinstance(image, str):
+                if image.startswith(('http://', 'https://')):
+                    # Download image from URL
+                    image_bytes = await self._download_image(image)
+                    base64_image = self._encode_image(image_bytes)
+                else:
+                    # File path
+                    base64_image = self._encode_image(image)
             else:
-
-
-
-
-
-
-
-
-
-
+                # BinaryIO or bytes data
+                if hasattr(image, 'read'):
+                    image_data = image.read()
+                else:
+                    image_data = image
+                base64_image = self._encode_image(image_data)
+
+            # Use default prompt if none provided
+            if prompt is None:
+                prompt = "Please describe what you see in this image in detail."
+
+            # Use the standard chat completions API with vision
+            messages = [
+                {
                     "role": "user",
                     "content": [
-                        {"type": "text", "text":
+                        {"type": "text", "text": prompt},
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/jpeg;base64,{
+                                "url": f"data:image/jpeg;base64,{base64_image}",
+                                "detail": "auto"
                             }
-                        }
-                    ]
-                }
-
+                        },
+                    ],
+                }
+            ]
+
+            response = await self._client.chat.completions.create(  # type: ignore
+                model=self.model_name,
+                messages=messages,  # type: ignore
+                max_tokens=max_tokens,
                 temperature=self.temperature
             )
 
-
+            # Track usage for billing
+            if response.usage:
+                self._track_usage(
+                    service_type=ServiceType.VISION,
+                    operation="image_analysis",
+                    input_tokens=response.usage.prompt_tokens,
+                    output_tokens=response.usage.completion_tokens,
+                    metadata={"prompt": prompt[:100], "model": self.model_name}
+                )
+
+            content = response.choices[0].message.content or ""
+
+            return {
+                "text": content,
+                "confidence": 1.0,  # OpenAI doesn't provide confidence scores
+                "detected_objects": [],  # Would need separate object detection
+                "metadata": {
+                    "model": self.model_name,
+                    "prompt": prompt,
+                    "tokens_used": response.usage.total_tokens if response.usage else 0
+                }
+            }
 
         except Exception as e:
             logger.error(f"Error in image analysis: {e}")
             raise
+
+    async def analyze_images(
+        self,
+        images: List[Union[str, BinaryIO]],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> List[Dict[str, Any]]:
+        """Analyze multiple images"""
+        results = []
+        for image in images:
+            result = await self.analyze_image(image, prompt, max_tokens)
+            results.append(result)
+        return results
+
+    async def describe_image(
+        self,
+        image: Union[str, BinaryIO],
+        detail_level: str = "medium"
+    ) -> Dict[str, Any]:
+        """Generate detailed description of image"""
+        detail_prompts = {
+            "low": "Briefly describe what you see in this image.",
+            "medium": "Describe what you see in this image in detail, including objects, colors, and scene.",
+            "high": "Provide a comprehensive and detailed description of this image, including all visible objects, their positions, colors, textures, lighting, composition, and any text or symbols present."
+        }
+
+        prompt = detail_prompts.get(detail_level, detail_prompts["medium"])
+        result = await self.analyze_image(image, prompt, 1500)
+
+        return {
+            "description": result["text"],
+            "objects": [],  # Would need object detection API
+            "scene": result["text"],  # Use same description
+            "colors": [],  # Would need color analysis
+            "detail_level": detail_level,
+            "metadata": result["metadata"]
+        }
+
+    async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Extract text from image (OCR)"""
+        prompt = "Extract all text visible in this image. Provide only the text content, maintaining the original structure and formatting as much as possible."
+        result = await self.analyze_image(image, prompt, 1000)
+
+        return {
+            "text": result["text"],
+            "confidence": 1.0,
+            "bounding_boxes": [],  # OpenAI vision doesn't provide bounding boxes
+            "language": "unknown",  # Would need language detection
+            "metadata": result["metadata"]
+        }
+
+    async def detect_objects(
+        self,
+        image: Union[str, BinaryIO],
+        confidence_threshold: float = 0.5
+    ) -> Dict[str, Any]:
+        """Detect objects in image"""
+        prompt = "List all objects visible in this image. For each object, provide the object name and a brief description of its location in the image."
+        result = await self.analyze_image(image, prompt, 1000)
+
+        # Parse the response to extract object information
+        objects = []
+        lines = result["text"].split('\n')
+        for line in lines:
+            line = line.strip()
+            if line and not line.startswith(('In this image', 'The image shows', 'I can see')):
+                objects.append({
+                    "label": line,
+                    "confidence": 1.0  # OpenAI doesn't provide confidence scores
+                })
+
+        return {
+            "objects": objects,
+            "count": len(objects),
+            "bounding_boxes": [],  # Not available with current API
+            "metadata": result["metadata"]
+        }
+
+    async def classify_image(
+        self,
+        image: Union[str, BinaryIO],
+        categories: Optional[List[str]] = None
+    ) -> Dict[str, Any]:
+        """Classify image into categories"""
+        if categories:
+            category_list = ", ".join(categories)
+            prompt = f"Classify this image into one of these categories: {category_list}. Respond with only the most appropriate category name."
+        else:
+            prompt = "What category best describes this image? Provide a single category name."
+
+        result = await self.analyze_image(image, prompt, 100)
+        category = result["text"].strip()
+
+        return {
+            "category": category,
+            "confidence": 1.0,
+            "all_predictions": [{"category": category, "confidence": 1.0}],
+            "metadata": result["metadata"]
+        }
+
+    async def compare_images(
+        self,
+        image1: Union[str, BinaryIO],
+        image2: Union[str, BinaryIO]
+    ) -> Dict[str, Any]:
+        """Compare two images for similarity"""
+        # For now, analyze both images separately and compare descriptions
+        result1 = await self.analyze_image(image1, "Describe this image in detail.")
+        result2 = await self.analyze_image(image2, "Describe this image in detail.")
+
+        # Use LLM to compare the descriptions
+        comparison_prompt = f"Compare these two image descriptions and provide a similarity analysis:\n\nImage 1: {result1['text']}\n\nImage 2: {result2['text']}\n\nProvide: 1) A similarity score from 0.0 to 1.0, 2) Key differences, 3) Common elements."
+
+        comparison_result = await self._client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": comparison_prompt}],
+            max_tokens=500,
+            temperature=0.3
+        )
+
+        comparison_text = comparison_result.choices[0].message.content or ""
+
+        return {
+            "similarity_score": 0.5,  # Would need better parsing to extract actual score
+            "differences": comparison_text,
+            "common_elements": comparison_text,
+            "metadata": {
+                "model": self.model_name,
+                "comparison_method": "description_based"
+            }
+        }
+
+    def get_supported_formats(self) -> List[str]:
+        """Get list of supported image formats"""
+        return ['jpg', 'jpeg', 'png', 'gif', 'webp']
+
+    def get_max_image_size(self) -> Dict[str, int]:
+        """Get maximum supported image dimensions"""
+        return {
+            "width": 2048,
+            "height": 2048,
+            "file_size_mb": 20
+        }
+
+    async def close(self):
+        """Clean up resources"""
+        if hasattr(self._client, 'close'):
+            await self._client.close()