isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +30 -1
- isa_model/client.py +770 -0
- isa_model/core/config/__init__.py +16 -0
- isa_model/core/config/config_manager.py +514 -0
- isa_model/core/config.py +426 -0
- isa_model/core/models/model_billing_tracker.py +476 -0
- isa_model/core/models/model_manager.py +399 -0
- isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
- isa_model/core/pricing_manager.py +426 -0
- isa_model/core/services/__init__.py +19 -0
- isa_model/core/services/intelligent_model_selector.py +547 -0
- isa_model/core/types.py +291 -0
- isa_model/deployment/__init__.py +2 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
- isa_model/deployment/cloud/modal/register_models.py +321 -0
- isa_model/deployment/runtime/deployed_service.py +338 -0
- isa_model/deployment/services/__init__.py +9 -0
- isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
- isa_model/deployment/services/model_service.py +332 -0
- isa_model/deployment/services/service_monitor.py +356 -0
- isa_model/deployment/services/service_registry.py +527 -0
- isa_model/eval/__init__.py +80 -44
- isa_model/eval/config/__init__.py +10 -0
- isa_model/eval/config/evaluation_config.py +108 -0
- isa_model/eval/evaluators/__init__.py +18 -0
- isa_model/eval/evaluators/base_evaluator.py +503 -0
- isa_model/eval/evaluators/llm_evaluator.py +472 -0
- isa_model/eval/factory.py +417 -709
- isa_model/eval/infrastructure/__init__.py +24 -0
- isa_model/eval/infrastructure/experiment_tracker.py +466 -0
- isa_model/eval/metrics.py +191 -21
- isa_model/inference/ai_factory.py +181 -605
- isa_model/inference/services/audio/base_stt_service.py +65 -1
- isa_model/inference/services/audio/base_tts_service.py +75 -1
- isa_model/inference/services/audio/openai_stt_service.py +189 -151
- isa_model/inference/services/audio/openai_tts_service.py +12 -10
- isa_model/inference/services/audio/replicate_tts_service.py +61 -56
- isa_model/inference/services/base_service.py +55 -17
- isa_model/inference/services/embedding/base_embed_service.py +65 -1
- isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
- isa_model/inference/services/embedding/openai_embed_service.py +8 -10
- isa_model/inference/services/helpers/stacked_config.py +148 -0
- isa_model/inference/services/img/__init__.py +18 -0
- isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
- isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
- isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
- isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
- isa_model/inference/services/llm/__init__.py +3 -3
- isa_model/inference/services/llm/base_llm_service.py +492 -40
- isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
- isa_model/inference/services/llm/ollama_llm_service.py +51 -17
- isa_model/inference/services/llm/openai_llm_service.py +70 -19
- isa_model/inference/services/llm/yyds_llm_service.py +24 -23
- isa_model/inference/services/vision/__init__.py +38 -4
- isa_model/inference/services/vision/base_vision_service.py +218 -117
- isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
- isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
- isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/vision/helpers/image_utils.py +272 -3
- isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
- isa_model/inference/services/vision/openai_vision_service.py +104 -307
- isa_model/inference/services/vision/replicate_vision_service.py +140 -325
- isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
- isa_model/scripts/register_models.py +370 -0
- isa_model/scripts/register_models_with_embeddings.py +510 -0
- isa_model/serving/api/fastapi_server.py +6 -1
- isa_model/serving/api/routes/unified.py +202 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
- isa_model/config/__init__.py +0 -9
- isa_model/config/config_manager.py +0 -213
- isa_model/core/model_manager.py +0 -213
- isa_model/core/model_registry.py +0 -375
- isa_model/core/vision_models_init.py +0 -116
- isa_model/inference/billing_tracker.py +0 -406
- isa_model/inference/services/llm/triton_llm_service.py +0 -481
- isa_model/inference/services/stacked/__init__.py +0 -26
- isa_model/inference/services/stacked/config.py +0 -426
- isa_model/inference/services/vision/ollama_vision_service.py +0 -194
- /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
- /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
- /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/replicate_vision_service.py

@@ -5,13 +5,14 @@ import replicate
 import re
 import ast
 from isa_model.inference.services.vision.base_vision_service import BaseVisionService
-from isa_model.
-from isa_model.inference.
+from isa_model.core.types import ServiceType
+from isa_model.inference.services.vision.helpers.image_utils import prepare_image_data_url
+from isa_model.inference.services.vision.helpers.vision_prompts import VisionPromptMixin
 import logging

 logger = logging.getLogger(__name__)

-class ReplicateVisionService(BaseVisionService):
+class ReplicateVisionService(BaseVisionService, VisionPromptMixin):
     """Enhanced Replicate Vision service supporting multiple specialized models"""

     # Supported model configurations
@@ -19,17 +20,18 @@ class ReplicateVisionService(BaseVisionService):
         "cogvlm": "cjwbw/cogvlm:a5092d718ea77a073e6d8f6969d5c0fb87d0ac7e4cdb7175427331e1798a34ed",
         "florence-2": "microsoft/florence-2-large:fcdb54e52322b9e6dce7a35e5d8ad173dce30b46ef49a236c1a71bc6b78b5bed",
         "omniparser": "microsoft/omniparser-v2:49cf3d41b8d3aca1360514e83be4c97131ce8f0d99abfc365526d8384caa88df",
-        "yolov8": "adirik/yolov8:3b21ba0e5da47bb2c69a96f72894a31b7c1e77b3e8a7b6ba43b7eb93b7b2c4f4"
+        "yolov8": "adirik/yolov8:3b21ba0e5da47bb2c69a96f72894a31b7c1e77b3e8a7b6ba43b7eb93b7b2c4f4",
+        "qwen-vl-chat": "lucataco/qwen-vl-chat:50881b153b4d5f72b3db697e2bbad23bb1277ab741c5b52d80cd6ee17ea660e9"
     }

-    def __init__(self,
+    def __init__(self, provider_name: str, model_name: str = "cogvlm", **kwargs):
         # Resolve model name to full model path
         self.model_key = model_name
         resolved_model = self.MODELS.get(model_name, model_name)
-        super().__init__(
+        super().__init__(provider_name, resolved_model, **kwargs)

-        # Get
-        provider_config =
+        # Get configuration from centralized config manager
+        provider_config = self.get_provider_config()

         # Initialize Replicate client
         try:
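For orientation, here is a minimal usage sketch of the new constructor signature and the qwen-vl-chat model key added above. The provider name "replicate", an already-configured Replicate API token, and the "text" key on the result dict are assumptions inferred from the surrounding diff, not confirmed by it.

import asyncio
from isa_model.inference.services.vision.replicate_vision_service import ReplicateVisionService

async def main():
    # provider_name="replicate" is an assumption; the diff only shows
    # __init__(self, provider_name, model_name="cogvlm", **kwargs).
    service = ReplicateVisionService(provider_name="replicate", model_name="qwen-vl-chat")

    # analyze_image accepts a URL, a local path, or a file-like object.
    result = await service.analyze_image(
        "https://example.com/sample.png",
        prompt="What is shown in this image?",
    )
    # The "text" key is assumed from how other methods in this file read results.
    print(result.get("text"))

asyncio.run(main())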
@@ -52,72 +54,15 @@ class ReplicateVisionService(BaseVisionService):

     def _prepare_image(self, image: Union[str, BinaryIO]) -> str:
         """Prepare image for Replicate API - convert to URL or base64"""
-        if isinstance(image, str):
-
-
-                return image
-            else:
-                # Local file path - need to convert to base64 data URL
-                with open(image, "rb") as f:
-                    image_data = f.read()
-                image_b64 = base64.b64encode(image_data).decode()
-                # Determine file extension for MIME type
-                ext = os.path.splitext(image)[1].lower()
-                mime_type = {
-                    '.jpg': 'image/jpeg',
-                    '.jpeg': 'image/jpeg',
-                    '.png': 'image/png',
-                    '.gif': 'image/gif',
-                    '.webp': 'image/webp'
-                }.get(ext, 'image/jpeg')
-                return f"data:{mime_type};base64,{image_b64}"
+        if isinstance(image, str) and image.startswith(('http://', 'https://')):
+            # Already a URL
+            return image
         else:
-            #
-
-                image_data = image.read()
-                if isinstance(image_data, bytes):
-                    image_b64 = base64.b64encode(image_data).decode()
-                else:
-                    raise ValueError("File-like object did not return bytes")
-            else:
-                # Assume it's bytes
-                image_b64 = base64.b64encode(image).decode() # type: ignore
-            return f"data:image/jpeg;base64,{image_b64}"
+            # Use unified image processing from image_utils
+            return prepare_image_data_url(image)

-
-
-        image: Union[str, BinaryIO],
-        prompt: Optional[str] = None,
-        task: Optional[str] = None,
-        **kwargs
-    ) -> Dict[str, Any]:
-        """
-        Unified invoke method for all vision operations
-        """
-        task = task or "analyze"
-
-        if task == "analyze":
-            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
-        elif task == "element_detection":
-            if self.model_key == "omniparser":
-                return await self.run_omniparser(image, **kwargs)
-            elif self.model_key == "florence-2":
-                return await self.run_florence2(image, **kwargs)
-            elif self.model_key == "yolov8":
-                return await self.run_yolo(image, **kwargs)
-            else:
-                return await self.detect_objects(image, kwargs.get("confidence_threshold", 0.5))
-        elif task == "describe":
-            return await self.describe_image(image, kwargs.get("detail_level", "medium"))
-        elif task == "extract_text":
-            return await self.extract_text(image)
-        elif task == "detect_objects":
-            return await self.detect_objects(image, kwargs.get("confidence_threshold", 0.5))
-        elif task == "classify":
-            return await self.classify_image(image, kwargs.get("categories"))
-        else:
-            # Default to analyze_image for unknown tasks
-            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
+    # Replicate uses the base class invoke method; no override is needed here.
+    # Just implement the corresponding standard methods directly.

     async def analyze_image(
         self,
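The inline base64/MIME handling deleted above now lives in isa_model/inference/services/vision/helpers/image_utils.py as prepare_image_data_url. That helper's body is not part of this diff; the following is a minimal sketch reconstructed from the deleted logic, so the internal names and fallback choices are assumptions rather than the shipped implementation.

import base64
import os
from typing import BinaryIO, Union

# Extension-to-MIME map taken from the deleted inline code.
_MIME_TYPES = {
    '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png',
    '.gif': 'image/gif', '.webp': 'image/webp',
}

def prepare_image_data_url(image: Union[str, bytes, BinaryIO]) -> str:
    """Turn a local path, raw bytes, or file-like object into a base64 data URL."""
    if isinstance(image, str):
        # Local file path: read the bytes and pick a MIME type from the extension.
        mime_type = _MIME_TYPES.get(os.path.splitext(image)[1].lower(), 'image/jpeg')
        with open(image, "rb") as f:
            data = f.read()
    elif hasattr(image, "read"):
        # File-like object: read its bytes and default the MIME type.
        data = image.read()
        mime_type = 'image/jpeg'
    else:
        # Assume raw bytes.
        data = image
        mime_type = 'image/jpeg'
    return f"data:{mime_type};base64,{base64.b64encode(data).decode()}"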
@@ -129,28 +74,39 @@ class ReplicateVisionService(BaseVisionService):
         Analyze image and provide description or answer questions
         """
         try:
-            # Prepare image for API
+            # Prepare image for API using unified processing
             image_input = self._prepare_image(image)

             # Use default prompt if none provided
             if prompt is None:
                 prompt = "Describe this image in detail."

-            #
-
-
-
-
-
-
-
-
+            # Choose input format based on model type
+            if self.model_key == "qwen-vl-chat":
+                # Qwen-VL-Chat uses simple image + prompt format
+                output = replicate.run(
+                    self.model_name,
+                    input={
+                        "image": image_input,
+                        "prompt": prompt
+                    }
+                )
+            else:
+                # CogVLM and other models use VQA format
+                output = replicate.run(
+                    self.model_name,
+                    input={
+                        "vqa": True, # Visual Question Answering mode
+                        "image": image_input,
+                        "query": prompt
+                    }
+                )

             # CogVLM returns a string response
             response_text = str(output) if output else ""

             # Track usage for billing
-            self._track_usage(
+            await self._track_usage(
                 service_type=ServiceType.VISION,
                 operation="image_analysis",
                 input_tokens=len(prompt.split()) if prompt else 0,
@@ -173,272 +129,131 @@
             logger.error(f"Error in image analysis: {e}")
             raise

-
-        self,
-        images: List[Union[str, BinaryIO]],
-        prompt: Optional[str] = None,
-        max_tokens: int = 1000
-    ) -> List[Dict[str, Any]]:
-        """Analyze multiple images"""
-        results = []
-        for image in images:
-            result = await self.analyze_image(image, prompt, max_tokens)
-            results.append(result)
-        return results
+    # ==================== Standard interface: detection & extraction ====================

-    async def
-        self,
+    async def detect_ui_elements(
+        self,
         image: Union[str, BinaryIO],
-
+        element_types: Optional[List[str]] = None,
+        confidence_threshold: float = 0.5
     ) -> Dict[str, Any]:
-        """
-
-
-
-
-
-
-
-
-
-        return {
-            "description": result["text"],
-            "objects": [], # Would need object detection API
-            "scene": result["text"], # Use same description
-            "colors": [], # Would need color analysis
-            "detail_level": detail_level,
-            "metadata": result["metadata"]
-        }
+        """
+        UI element detection - implemented with specialized models
+        """
+        if self.model_key == "omniparser":
+            return await self.run_omniparser(image, box_threshold=confidence_threshold)
+        elif self.model_key == "florence-2":
+            return await self.run_florence2(image, task="<OPEN_VOCABULARY_DETECTION>")
+        else:
+            # Fall back to generic object detection
+            return await self.detect_objects(image, confidence_threshold)

-    async def
-
-
-
-
-
-
-
-
-
-
-
+    async def detect_document_elements(
+        self,
+        image: Union[str, BinaryIO],
+        element_types: Optional[List[str]] = None,
+        confidence_threshold: float = 0.5
+    ) -> Dict[str, Any]:
+        """
+        Document structure detection - implemented with specialized models
+        """
+        if self.model_key == "florence-2":
+            # Florence-2 can detect document structure
+            return await self.run_florence2(image, task="<DETAILED_CAPTION>")
+        else:
+            raise NotImplementedError(f"Document detection not supported for model {self.model_key}")

     async def detect_objects(
         self,
         image: Union[str, BinaryIO],
         confidence_threshold: float = 0.5
     ) -> Dict[str, Any]:
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-        - y% = vertical position from top edge (0-100%)
-        - width% = element width as percentage of image width (0-100%)
-        - height% = element height as percentage of image height (0-100%)
-
-        Be precise about the actual visual boundaries of each element.
-
-        Example: "Submit Button: x=25%, y=60%, width=15%, height=5% - Blue rectangular button with white text"
-        """
-        result = await self.analyze_image(image, prompt, 1500)
-
-        # Parse the response to extract object information with coordinates
-        objects = []
-        bounding_boxes = []
-        lines = result["text"].split('\n')
-
-        for line in lines:
-            line = line.strip()
-            if line and ':' in line and ('x=' in line or 'width=' in line):
-                try:
-                    # Extract object name and details
-                    parts = line.split(':', 1)
-                    if len(parts) == 2:
-                        object_name = parts[0].strip()
-                        details = parts[1].strip()
-
-                        # Extract coordinates using regex-like parsing
-                        coords = {}
-                        for param in ['x', 'y', 'width', 'height']:
-                            param_pattern = f"{param}="
-                            if param_pattern in details:
-                                start_idx = details.find(param_pattern) + len(param_pattern)
-                                end_idx = details.find('%', start_idx)
-                                if end_idx > start_idx:
-                                    try:
-                                        value = float(details[start_idx:end_idx])
-                                        coords[param] = value
-                                    except ValueError:
-                                        continue
-
-                        # Extract description (after the coordinates)
-                        desc_start = details.find(' - ')
-                        description = details[desc_start + 3:] if desc_start != -1 else details
-
-                        objects.append({
-                            "label": object_name,
-                            "confidence": 1.0,
-                            "coordinates": coords,
-                            "description": description
-                        })
-
-                        # Add bounding box if we have coordinates
-                        if all(k in coords for k in ['x', 'y', 'width', 'height']):
-                            bounding_boxes.append({
-                                "label": object_name,
-                                "x_percent": coords['x'],
-                                "y_percent": coords['y'],
-                                "width_percent": coords['width'],
-                                "height_percent": coords['height']
-                            })
-
-                except Exception:
-                    # Fallback for objects that don't match expected format
-                    objects.append({
-                        "label": line,
-                        "confidence": 1.0,
-                        "coordinates": {},
-                        "description": line
-                    })
-
-        return {
-            "objects": objects,
-            "count": len(objects),
-            "bounding_boxes": bounding_boxes,
-            "metadata": result["metadata"]
-        }
+        """
+        Generic object detection - implements the standard interface
+        """
+        if self.model_key == "yolov8":
+            return await self.run_yolo(image, confidence=confidence_threshold)
+        elif self.model_key == "florence-2":
+            return await self.run_florence2(image, task="<OD>")
+        elif self.model_key == "qwen-vl-chat":
+            # Qwen-VL-Chat can do object detection through prompting
+            prompt = self.get_task_prompt("detect_objects", confidence_threshold=confidence_threshold)
+            return await self.analyze_image(image, prompt)
+        else:
+            raise NotImplementedError(f"Object detection not supported for model {self.model_key}")

-
-
+    # ==================== QWEN-VL-CHAT prompt-based implementations ====================
+    # Like OpenAI, qwen-vl-chat implements all vision capabilities through prompts
+
+    async def describe_image(
+        self,
         image: Union[str, BinaryIO],
-
+        detail_level: str = "medium"
     ) -> Dict[str, Any]:
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ""
-
-        result = await self.analyze_image(image, prompt, 300)
-        response_text = result["text"]
-
-        # Parse the structured response
-        found = False
-        center_coords = None
-        description = ""
-
-        lines = response_text.split('\n')
-        for line in lines:
-            line = line.strip()
-            if line.startswith('FOUND:'):
-                found = 'YES' in line.upper()
-            elif line.startswith('CENTER:') and found:
-                # Extract center coordinates [x, y]
-                coords_text = line.replace('CENTER:', '').strip()
-                try:
-                    # Remove brackets and split
-                    coords_text = coords_text.replace('[', '').replace(']', '')
-                    if ',' in coords_text:
-                        x_str, y_str = coords_text.split(',')
-                        x = int(float(x_str.strip()))
-                        y = int(float(y_str.strip()))
-                        center_coords = [x, y]
-                except (ValueError, IndexError):
-                    pass
-            elif line.startswith('DESCRIPTION:'):
-                description = line.replace('DESCRIPTION:', '').strip()
-
-        return {
-            "found": found,
-            "center_coordinates": center_coords,
-            "confidence": 1.0 if found else 0.0,
-            "description": description,
-            "metadata": result["metadata"]
-        }
+        """
+        Image description - implemented via prompts for qwen-vl-chat
+        """
+        if self.model_key == "qwen-vl-chat":
+            prompt = self.get_task_prompt("describe", detail_level=detail_level)
+            return await self.analyze_image(image, prompt)
+        else:
+            raise NotImplementedError(f"describe_image not supported for model {self.model_key}")
+
+    async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """
+        Text extraction (OCR) - implemented via prompts for qwen-vl-chat
+        """
+        if self.model_key == "qwen-vl-chat":
+            prompt = self.get_task_prompt("extract_text")
+            return await self.analyze_image(image, prompt)
+        else:
+            raise NotImplementedError(f"extract_text not supported for model {self.model_key}")

     async def classify_image(
         self,
         image: Union[str, BinaryIO],
         categories: Optional[List[str]] = None
     ) -> Dict[str, Any]:
-        """
-
-
-
+        """
+        Image classification - implemented via prompts for qwen-vl-chat
+        """
+        if self.model_key == "qwen-vl-chat":
+            prompt = self.get_task_prompt("classify", categories=categories)
+            return await self.analyze_image(image, prompt)
         else:
-
-
-            result = await self.analyze_image(image, prompt, 100)
-            category = result["text"].strip()
-
-            return {
-                "category": category,
-                "confidence": 1.0,
-                "all_predictions": [{"category": category, "confidence": 1.0}],
-                "metadata": result["metadata"]
-            }
+            raise NotImplementedError(f"classify_image not supported for model {self.model_key}")

-    async def
-        self,
-
-
+    async def extract_table_data(
+        self,
+        image: Union[str, BinaryIO],
+        table_format: str = "json",
+        preserve_formatting: bool = True
     ) -> Dict[str, Any]:
-        """
-
-
-
-
-
-
-
-        # Create a simple text prompt for comparison
-        comparison_result = await self.analyze_image(image1, comparison_prompt)
-
-        comparison_text = comparison_result["text"]
-
-        return {
-            "similarity_score": 0.5, # Would need better parsing to extract actual score
-            "differences": comparison_text,
-            "common_elements": comparison_text,
-            "metadata": {
-                "model": self.model_name,
-                "comparison_method": "description_based"
-            }
-        }
+        """
+        Table data extraction - implemented via prompts for qwen-vl-chat
+        """
+        if self.model_key == "qwen-vl-chat":
+            prompt = self.get_task_prompt("extract_table_data", table_format=table_format, preserve_formatting=preserve_formatting)
+            return await self.analyze_image(image, prompt)
+        else:
+            raise NotImplementedError(f"extract_table_data not supported for model {self.model_key}")

-    def
-
-
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
+    ) -> Dict[str, Any]:
+        """
+        Get object coordinates - implemented via prompts for qwen-vl-chat
+        """
+        if self.model_key == "qwen-vl-chat":
+            prompt = self.get_task_prompt("get_coordinates", object_name=object_name)
+            return await self.analyze_image(image, prompt)
+        else:
+            raise NotImplementedError(f"get_object_coordinates not supported for model {self.model_key}")

-
-
-        return {
-            "width": 2048,
-            "height": 2048,
-            "file_size_mb": 10
-        }
+    # ==================== Replicate-specific model methods ====================
+    # The methods below are Replicate-specific specialized-model implementations, outside the standard interface

     # ==================== MODEL-SPECIFIC METHODS ====================

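To summarize the dispatch logic above: the standard interface methods route on self.model_key, so a call is served by run_omniparser, run_florence2, run_yolo, or by a prompt built through VisionPromptMixin.get_task_prompt when qwen-vl-chat is selected. A sketch of how a caller might pick a model per task; the model keys and method names come from the diff, while the provider name and image path are placeholders:

import asyncio
from isa_model.inference.services.vision.replicate_vision_service import ReplicateVisionService

async def run_tasks(image_path: str):
    # Object detection routes to YOLOv8 when that model key is selected...
    detector = ReplicateVisionService(provider_name="replicate", model_name="yolov8")
    objects = await detector.detect_objects(image_path, confidence_threshold=0.5)

    # ...while OCR-style extraction is prompt-driven and only wired up for qwen-vl-chat.
    reader = ReplicateVisionService(provider_name="replicate", model_name="qwen-vl-chat")
    text = await reader.extract_text(image_path)
    return objects, text

asyncio.run(run_tasks("screenshot.png"))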