isa-model 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +30 -1
- isa_model/client.py +937 -0
- isa_model/core/config/__init__.py +16 -0
- isa_model/core/config/config_manager.py +514 -0
- isa_model/core/config.py +426 -0
- isa_model/core/models/model_billing_tracker.py +476 -0
- isa_model/core/models/model_manager.py +399 -0
- isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
- isa_model/core/pricing_manager.py +426 -0
- isa_model/core/services/__init__.py +19 -0
- isa_model/core/services/intelligent_model_selector.py +547 -0
- isa_model/core/types.py +291 -0
- isa_model/deployment/__init__.py +2 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
- isa_model/deployment/cloud/modal/register_models.py +321 -0
- isa_model/deployment/runtime/deployed_service.py +338 -0
- isa_model/deployment/services/__init__.py +9 -0
- isa_model/deployment/services/auto_deploy_vision_service.py +538 -0
- isa_model/deployment/services/model_service.py +332 -0
- isa_model/deployment/services/service_monitor.py +356 -0
- isa_model/deployment/services/service_registry.py +527 -0
- isa_model/deployment/services/simple_auto_deploy_vision_service.py +275 -0
- isa_model/eval/__init__.py +80 -44
- isa_model/eval/config/__init__.py +10 -0
- isa_model/eval/config/evaluation_config.py +108 -0
- isa_model/eval/evaluators/__init__.py +18 -0
- isa_model/eval/evaluators/base_evaluator.py +503 -0
- isa_model/eval/evaluators/llm_evaluator.py +472 -0
- isa_model/eval/factory.py +417 -709
- isa_model/eval/infrastructure/__init__.py +24 -0
- isa_model/eval/infrastructure/experiment_tracker.py +466 -0
- isa_model/eval/metrics.py +191 -21
- isa_model/inference/ai_factory.py +257 -601
- isa_model/inference/services/audio/base_stt_service.py +65 -1
- isa_model/inference/services/audio/base_tts_service.py +75 -1
- isa_model/inference/services/audio/openai_stt_service.py +189 -151
- isa_model/inference/services/audio/openai_tts_service.py +12 -10
- isa_model/inference/services/audio/replicate_tts_service.py +61 -56
- isa_model/inference/services/base_service.py +55 -17
- isa_model/inference/services/embedding/base_embed_service.py +65 -1
- isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
- isa_model/inference/services/embedding/openai_embed_service.py +8 -10
- isa_model/inference/services/helpers/stacked_config.py +148 -0
- isa_model/inference/services/img/__init__.py +18 -0
- isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
- isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
- isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
- isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
- isa_model/inference/services/llm/__init__.py +3 -3
- isa_model/inference/services/llm/base_llm_service.py +492 -40
- isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
- isa_model/inference/services/llm/ollama_llm_service.py +51 -17
- isa_model/inference/services/llm/openai_llm_service.py +70 -19
- isa_model/inference/services/llm/yyds_llm_service.py +24 -23
- isa_model/inference/services/vision/__init__.py +38 -4
- isa_model/inference/services/vision/base_vision_service.py +218 -117
- isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
- isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
- isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/vision/helpers/image_utils.py +272 -3
- isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
- isa_model/inference/services/vision/openai_vision_service.py +104 -307
- isa_model/inference/services/vision/replicate_vision_service.py +140 -325
- isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
- isa_model/scripts/register_models.py +370 -0
- isa_model/scripts/register_models_with_embeddings.py +510 -0
- isa_model/serving/api/fastapi_server.py +6 -1
- isa_model/serving/api/routes/unified.py +274 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/METADATA +4 -1
- {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/RECORD +78 -53
- isa_model/config/__init__.py +0 -9
- isa_model/config/config_manager.py +0 -213
- isa_model/core/model_manager.py +0 -213
- isa_model/core/model_registry.py +0 -375
- isa_model/core/vision_models_init.py +0 -116
- isa_model/inference/billing_tracker.py +0 -406
- isa_model/inference/services/llm/triton_llm_service.py +0 -481
- isa_model/inference/services/stacked/__init__.py +0 -26
- isa_model/inference/services/stacked/config.py +0 -426
- isa_model/inference/services/vision/ollama_vision_service.py +0 -194
- /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
- /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
- /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/WHEEL +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/openai_vision_service.py
@@ -1,32 +1,31 @@
 from typing import Dict, Any, Union, List, Optional, BinaryIO
-import base64
-import aiohttp
 from openai import AsyncOpenAI
 from tenacity import retry, stop_after_attempt, wait_exponential
 from isa_model.inference.services.vision.base_vision_service import BaseVisionService
-from isa_model.inference.
-from isa_model.inference.
+from isa_model.inference.services.vision.helpers.image_utils import prepare_image_base64
+from isa_model.inference.services.vision.helpers.vision_prompts import VisionPromptMixin
+from isa_model.core.types import ServiceType
 import logging

 logger = logging.getLogger(__name__)

-class OpenAIVisionService(BaseVisionService):
-    """OpenAI Vision service using
+class OpenAIVisionService(BaseVisionService, VisionPromptMixin):
+    """OpenAI Vision service using centralized config management"""

-    def __init__(self,
-        super().__init__(
+    def __init__(self, provider_name: str, model_name: str = "gpt-4o-mini", **kwargs):
+        super().__init__(provider_name, model_name, **kwargs)

-        # Get
-        provider_config =
+        # Get configuration from centralized config manager
+        provider_config = self.get_provider_config()

-        # Initialize AsyncOpenAI client with
+        # Initialize AsyncOpenAI client with centralized configuration
         try:
             if not provider_config.get("api_key"):
                 raise ValueError("OpenAI API key not found in provider configuration")

             self._client = AsyncOpenAI(
                 api_key=provider_config["api_key"],
-                base_url=provider_config.get("
+                base_url=provider_config.get("api_base_url", "https://api.openai.com/v1"),
                 organization=provider_config.get("organization")
             )

@@ -44,57 +43,7 @@ class OpenAIVisionService(BaseVisionService):
         """Get the underlying OpenAI client"""
         return self._client

-    async def _download_image(self, image_url: str) -> bytes:
-        """Download image from URL"""
-        async with aiohttp.ClientSession() as session:
-            async with session.get(image_url) as response:
-                if response.status == 200:
-                    return await response.read()
-                else:
-                    raise ValueError(f"Failed to download image from {image_url}: {response.status}")

-    def _encode_image(self, image_path_or_data: Union[str, bytes, BinaryIO]) -> str:
-        """Encode image to base64"""
-        if isinstance(image_path_or_data, str):
-            # If it's a file path
-            with open(image_path_or_data, "rb") as image_file:
-                return base64.b64encode(image_file.read()).decode("utf-8")
-        elif hasattr(image_path_or_data, 'read'):
-            # If it's a file-like object (BinaryIO)
-            data = image_path_or_data.read()  # type: ignore
-            if isinstance(data, bytes):
-                return base64.b64encode(data).decode("utf-8")
-            else:
-                raise ValueError("File-like object did not return bytes")
-        else:
-            # If it's bytes data
-            return base64.b64encode(image_path_or_data).decode("utf-8")  # type: ignore
-
-    async def invoke(
-        self,
-        image: Union[str, BinaryIO],
-        prompt: Optional[str] = None,
-        task: Optional[str] = None,
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Unified invoke method for all vision operations
-        """
-        task = task or "analyze"
-
-        if task == "analyze":
-            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
-        elif task == "describe":
-            return await self.describe_image(image, kwargs.get("detail_level", "medium"))
-        elif task == "extract_text":
-            return await self.extract_text(image)
-        elif task == "detect_objects":
-            return await self.detect_objects(image, kwargs.get("confidence_threshold", 0.5))
-        elif task == "classify":
-            return await self.classify_image(image, kwargs.get("categories"))
-        else:
-            # Default to analyze_image for unknown tasks
-            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))

     @retry(
         stop=stop_after_attempt(3),
@@ -119,22 +68,8 @@ class OpenAIVisionService(BaseVisionService):
             Dict containing analysis results
         """
         try:
-            #
-
-                if image.startswith(('http://', 'https://')):
-                    # Download image from URL
-                    image_bytes = await self._download_image(image)
-                    base64_image = self._encode_image(image_bytes)
-                else:
-                    # File path
-                    base64_image = self._encode_image(image)
-            else:
-                # BinaryIO or bytes data
-                if hasattr(image, 'read'):
-                    image_data = image.read()
-                else:
-                    image_data = image
-                base64_image = self._encode_image(image_data)
+            # Use unified image processing from image_utils
+            base64_image = prepare_image_base64(image)

             # Use default prompt if none provided
             if prompt is None:
@@ -166,7 +101,7 @@ class OpenAIVisionService(BaseVisionService):

             # Track usage for billing
             if response.usage:
-                self._track_usage(
+                await self._track_usage(
                     service_type=ServiceType.VISION,
                     operation="image_analysis",
                     input_tokens=response.usage.prompt_tokens,
@@ -176,14 +111,36 @@ class OpenAIVisionService(BaseVisionService):

             content = response.choices[0].message.content or ""

+            # Try to parse the response as JSON (for structured tasks)
+            try:
+                import json
+                # Check whether the response is in JSON format
+                if content.strip().startswith('{') and content.strip().endswith('}'):
+                    parsed_json = json.loads(content)
+                    return {
+                        "text": content,
+                        "parsed_data": parsed_json,
+                        "confidence": 1.0,
+                        "metadata": {
+                            "model": self.model_name,
+                            "prompt": prompt[:100],
+                            "tokens_used": response.usage.total_tokens if response.usage else 0,
+                            "response_format": "json"
+                        }
+                    }
+            except json.JSONDecodeError:
+                pass
+
+            # Standard text response
             return {
                 "text": content,
                 "confidence": 1.0,  # OpenAI doesn't provide confidence scores
-                "detected_objects": [],  #
+                "detected_objects": [],  # Populated by specific detection methods
                 "metadata": {
                     "model": self.model_name,
-                    "prompt": prompt,
-                    "tokens_used": response.usage.total_tokens if response.usage else 0
+                    "prompt": prompt[:100],
+                    "tokens_used": response.usage.total_tokens if response.usage else 0,
+                    "response_format": "text"
                 }
             }

@@ -191,264 +148,104 @@ class OpenAIVisionService(BaseVisionService):
             logger.error(f"Error in image analysis: {e}")
             raise

-
-
-
-        prompt: Optional[str] = None,
-        max_tokens: int = 1000
-    ) -> List[Dict[str, Any]]:
-        """Analyze multiple images"""
-        results = []
-        for image in images:
-            result = await self.analyze_image(image, prompt, max_tokens)
-            results.append(result)
-        return results
+    # ==================== Prompt-based task implementations ====================
+    # OpenAI can cover most vision tasks simply by changing the prompt
+    # The shared VisionPromptMixin supplies the standard task prompts

+    # Override the remaining methods to use the task-specific prompts
     async def describe_image(
         self,
         image: Union[str, BinaryIO],
         detail_level: str = "medium"
     ) -> Dict[str, Any]:
-        """
-
-
-
-
-        }
-
-        prompt = detail_prompts.get(detail_level, detail_prompts["medium"])
-        result = await self.analyze_image(image, prompt, 1500)
-
-        return {
-            "description": result["text"],
-            "objects": [],  # Would need object detection API
-            "scene": result["text"],  # Use same description
-            "colors": [],  # Would need color analysis
-            "detail_level": detail_level,
-            "metadata": result["metadata"]
-        }
+        """
+        Image description - uses a dedicated prompt
+        """
+        prompt = self.get_task_prompt("describe", detail_level=detail_level)
+        return await self.analyze_image(image, prompt)

     async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
-        """
-
-
+        """
+        Text extraction (OCR) - uses a dedicated prompt
+        """
+        prompt = self.get_task_prompt("extract_text")

-        return
-            "text": result["text"],
-            "confidence": 1.0,
-            "bounding_boxes": [],  # OpenAI vision doesn't provide bounding boxes
-            "language": "unknown",  # Would need language detection
-            "metadata": result["metadata"]
-        }
+        return await self.analyze_image(image, prompt)

     async def detect_objects(
         self,
         image: Union[str, BinaryIO],
         confidence_threshold: float = 0.5
     ) -> Dict[str, Any]:
-        """
-
-
-
-        3. Approximate size as percentages of image dimensions (width%, height%)
-        4. Brief description
-
-        Format each object as: "ObjectName: x=X%, y=Y%, width=W%, height=H% - Description"
-
-        Example: "Car: x=25%, y=40%, width=15%, height=12% - Red sedan in the center"
-        """
-        result = await self.analyze_image(image, prompt, 1500)
-
-        # Parse the response to extract object information with coordinates
-        objects = []
-        bounding_boxes = []
-        lines = result["text"].split('\n')
-
-        for line in lines:
-            line = line.strip()
-            if line and ':' in line and ('x=' in line or 'width=' in line):
-                try:
-                    # Extract object name and details
-                    parts = line.split(':', 1)
-                    if len(parts) == 2:
-                        object_name = parts[0].strip()
-                        details = parts[1].strip()
-
-                        # Extract coordinates using regex-like parsing
-                        coords = {}
-                        for param in ['x', 'y', 'width', 'height']:
-                            param_pattern = f"{param}="
-                            if param_pattern in details:
-                                start_idx = details.find(param_pattern) + len(param_pattern)
-                                end_idx = details.find('%', start_idx)
-                                if end_idx > start_idx:
-                                    try:
-                                        value = float(details[start_idx:end_idx])
-                                        coords[param] = value
-                                    except ValueError:
-                                        continue
-
-                        # Extract description (after the coordinates)
-                        desc_start = details.find(' - ')
-                        description = details[desc_start + 3:] if desc_start != -1 else details
-
-                        objects.append({
-                            "label": object_name,
-                            "confidence": 1.0,
-                            "coordinates": coords,
-                            "description": description
-                        })
-
-                        # Add bounding box if we have coordinates
-                        if all(k in coords for k in ['x', 'y', 'width', 'height']):
-                            bounding_boxes.append({
-                                "label": object_name,
-                                "x_percent": coords['x'],
-                                "y_percent": coords['y'],
-                                "width_percent": coords['width'],
-                                "height_percent": coords['height']
-                            })
-
-                except Exception:
-                    # Fallback for objects that don't match expected format
-                    objects.append({
-                        "label": line,
-                        "confidence": 1.0,
-                        "coordinates": {},
-                        "description": line
-                    })
+        """
+        Object detection - uses a dedicated prompt
+        """
+        prompt = self.get_task_prompt("detect_objects", confidence_threshold=confidence_threshold)

-        return
-            "objects": objects,
-            "count": len(objects),
-            "bounding_boxes": bounding_boxes,
-            "metadata": result["metadata"]
-        }
+        return await self.analyze_image(image, prompt)

-    async def
+    async def detect_ui_elements(
         self,
         image: Union[str, BinaryIO],
-
+        element_types: Optional[List[str]] = None,
+        confidence_threshold: float = 0.5
     ) -> Dict[str, Any]:
-        """
-
-
-
-        FOUND: YES/NO
-        CENTER: [x, y]
-        DESCRIPTION: [Brief description]
-
-        If found, provide the pixel coordinates of the center point.
-        If not found, explain why.
-
-        Example:
-        FOUND: YES
-        CENTER: [640, 360]
-        DESCRIPTION: Blue login button in the center-left area
-        """
-
-        result = await self.analyze_image(image, prompt, 300)
-        response_text = result["text"]
-
-        # Parse the structured response
-        found = False
-        center_coords = None
-        description = ""
+        """
+        UI element detection - uses a dedicated prompt
+        """
+        prompt = self.get_task_prompt("detect_ui_elements", element_types=element_types, confidence_threshold=confidence_threshold)

-
-
-
-
-
-
-
-
-
-
-
-
-                        x_str, y_str = coords_text.split(',')
-                        x = int(float(x_str.strip()))
-                        y = int(float(y_str.strip()))
-                        center_coords = [x, y]
-                    except (ValueError, IndexError):
-                        pass
-            elif line.startswith('DESCRIPTION:'):
-                description = line.replace('DESCRIPTION:', '').strip()
+        return await self.analyze_image(image, prompt)
+
+    async def detect_document_elements(
+        self,
+        image: Union[str, BinaryIO],
+        element_types: Optional[List[str]] = None,
+        confidence_threshold: float = 0.5
+    ) -> Dict[str, Any]:
+        """
+        Document element detection - uses a dedicated prompt
+        """
+        prompt = self.get_task_prompt("detect_document_elements", element_types=element_types, confidence_threshold=confidence_threshold)

-        return
-            "found": found,
-            "center_coordinates": center_coords,
-            "confidence": 1.0 if found else 0.0,
-            "description": description,
-            "metadata": result["metadata"]
-        }
+        return await self.analyze_image(image, prompt)

     async def classify_image(
         self,
         image: Union[str, BinaryIO],
         categories: Optional[List[str]] = None
     ) -> Dict[str, Any]:
-        """
-
-
-
-        else:
-            prompt = "What category best describes this image? Provide a single category name."
-
-        result = await self.analyze_image(image, prompt, 100)
-        category = result["text"].strip()
+        """
+        Image classification - uses a dedicated prompt
+        """
+        prompt = self.get_task_prompt("classify", categories=categories)

-        return
-            "category": category,
-            "confidence": 1.0,
-            "all_predictions": [{"category": category, "confidence": 1.0}],
-            "metadata": result["metadata"]
-        }
+        return await self.analyze_image(image, prompt)

-    async def
-        self,
-
-
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
     ) -> Dict[str, Any]:
-        """
-
-
-
-
-        # Use LLM to compare the descriptions
-        comparison_prompt = f"Compare these two image descriptions and provide a similarity analysis:\n\nImage 1: {result1['text']}\n\nImage 2: {result2['text']}\n\nProvide: 1) A similarity score from 0.0 to 1.0, 2) Key differences, 3) Common elements."
-
-        comparison_result = await self._client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": comparison_prompt}],
-            max_tokens=500,
-            temperature=0.3
-        )
-
-        comparison_text = comparison_result.choices[0].message.content or ""
+        """
+        Get object coordinates - uses a dedicated prompt
+        """
+        prompt = self.get_task_prompt("get_coordinates", object_name=object_name)

-        return
-            "similarity_score": 0.5,  # Would need better parsing to extract actual score
-            "differences": comparison_text,
-            "common_elements": comparison_text,
-            "metadata": {
-                "model": self.model_name,
-                "comparison_method": "description_based"
-            }
-        }
-
-    def get_supported_formats(self) -> List[str]:
-        """Get list of supported image formats"""
-        return ['jpg', 'jpeg', 'png', 'gif', 'webp']
+        return await self.analyze_image(image, prompt)

-    def
-
-
-
-
-
-
+    async def extract_table_data(
+        self,
+        image: Union[str, BinaryIO],
+        table_format: str = "json",
+        preserve_formatting: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Structured table data extraction - uses the dedicated table-extraction prompt
+        """
+        prompt = self.get_task_prompt("extract_table_data", table_format=table_format, preserve_formatting=preserve_formatting)
+
+        return await self.analyze_image(image, prompt)

     async def close(self):
         """Clean up resources"""
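
For orientation, a minimal usage sketch of the refactored vision service, based only on the method signatures visible in this diff. Direct instantiation, the "openai" provider name, and the image paths are assumptions for illustration; in practice the package resolves the API key and provider settings through its centralized config manager.

# Illustrative sketch only - not part of the diff above.
import asyncio

from isa_model.inference.services.vision.openai_vision_service import OpenAIVisionService

async def main() -> None:
    # provider_name and model_name follow the __init__ signature shown in the diff
    service = OpenAIVisionService(provider_name="openai", model_name="gpt-4o-mini")
    try:
        # Each task method builds its prompt via VisionPromptMixin and delegates to analyze_image()
        description = await service.describe_image("photo.jpg", detail_level="high")
        print(description["text"])

        # Structured tasks may include "parsed_data" when the model replies with JSON
        table = await service.extract_table_data("invoice.png", table_format="json")
        print(table.get("parsed_data", table["text"]))
    finally:
        await service.close()

asyncio.run(main())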