isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
Files changed (87)
  1. isa_model/__init__.py +30 -1
  2. isa_model/client.py +770 -0
  3. isa_model/core/config/__init__.py +16 -0
  4. isa_model/core/config/config_manager.py +514 -0
  5. isa_model/core/config.py +426 -0
  6. isa_model/core/models/model_billing_tracker.py +476 -0
  7. isa_model/core/models/model_manager.py +399 -0
  8. isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
  9. isa_model/core/pricing_manager.py +426 -0
  10. isa_model/core/services/__init__.py +19 -0
  11. isa_model/core/services/intelligent_model_selector.py +547 -0
  12. isa_model/core/types.py +291 -0
  13. isa_model/deployment/__init__.py +2 -0
  14. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
  15. isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
  16. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
  17. isa_model/deployment/cloud/modal/register_models.py +321 -0
  18. isa_model/deployment/runtime/deployed_service.py +338 -0
  19. isa_model/deployment/services/__init__.py +9 -0
  20. isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
  21. isa_model/deployment/services/model_service.py +332 -0
  22. isa_model/deployment/services/service_monitor.py +356 -0
  23. isa_model/deployment/services/service_registry.py +527 -0
  24. isa_model/eval/__init__.py +80 -44
  25. isa_model/eval/config/__init__.py +10 -0
  26. isa_model/eval/config/evaluation_config.py +108 -0
  27. isa_model/eval/evaluators/__init__.py +18 -0
  28. isa_model/eval/evaluators/base_evaluator.py +503 -0
  29. isa_model/eval/evaluators/llm_evaluator.py +472 -0
  30. isa_model/eval/factory.py +417 -709
  31. isa_model/eval/infrastructure/__init__.py +24 -0
  32. isa_model/eval/infrastructure/experiment_tracker.py +466 -0
  33. isa_model/eval/metrics.py +191 -21
  34. isa_model/inference/ai_factory.py +181 -605
  35. isa_model/inference/services/audio/base_stt_service.py +65 -1
  36. isa_model/inference/services/audio/base_tts_service.py +75 -1
  37. isa_model/inference/services/audio/openai_stt_service.py +189 -151
  38. isa_model/inference/services/audio/openai_tts_service.py +12 -10
  39. isa_model/inference/services/audio/replicate_tts_service.py +61 -56
  40. isa_model/inference/services/base_service.py +55 -17
  41. isa_model/inference/services/embedding/base_embed_service.py +65 -1
  42. isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
  43. isa_model/inference/services/embedding/openai_embed_service.py +8 -10
  44. isa_model/inference/services/helpers/stacked_config.py +148 -0
  45. isa_model/inference/services/img/__init__.py +18 -0
  46. isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
  47. isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
  48. isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
  49. isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
  50. isa_model/inference/services/llm/__init__.py +3 -3
  51. isa_model/inference/services/llm/base_llm_service.py +492 -40
  52. isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
  53. isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
  54. isa_model/inference/services/llm/ollama_llm_service.py +51 -17
  55. isa_model/inference/services/llm/openai_llm_service.py +70 -19
  56. isa_model/inference/services/llm/yyds_llm_service.py +24 -23
  57. isa_model/inference/services/vision/__init__.py +38 -4
  58. isa_model/inference/services/vision/base_vision_service.py +218 -117
  59. isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
  60. isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
  61. isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
  62. isa_model/inference/services/vision/helpers/image_utils.py +272 -3
  63. isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
  64. isa_model/inference/services/vision/openai_vision_service.py +104 -307
  65. isa_model/inference/services/vision/replicate_vision_service.py +140 -325
  66. isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
  67. isa_model/scripts/register_models.py +370 -0
  68. isa_model/scripts/register_models_with_embeddings.py +510 -0
  69. isa_model/serving/api/fastapi_server.py +6 -1
  70. isa_model/serving/api/routes/unified.py +202 -0
  71. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
  72. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
  73. isa_model/config/__init__.py +0 -9
  74. isa_model/config/config_manager.py +0 -213
  75. isa_model/core/model_manager.py +0 -213
  76. isa_model/core/model_registry.py +0 -375
  77. isa_model/core/vision_models_init.py +0 -116
  78. isa_model/inference/billing_tracker.py +0 -406
  79. isa_model/inference/services/llm/triton_llm_service.py +0 -481
  80. isa_model/inference/services/stacked/__init__.py +0 -26
  81. isa_model/inference/services/stacked/config.py +0 -426
  82. isa_model/inference/services/vision/ollama_vision_service.py +0 -194
  83. /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
  84. /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
  85. /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
  86. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
  87. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
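Several helper modules were relocated in 0.3.6 (items 83-85 above). A minimal sketch of how downstream imports would change, assuming the new helpers packages are importable and no compatibility shims are re-exported from the old locations:

    # 0.3.5 locations (illustrative)
    # isa_model/inference/services/llm/llm_adapter.py
    # isa_model/inference/services/vision/helpers/text_splitter.py
    # isa_model/core/model_storage.py

    # 0.3.6 locations, per items 83-85
    from isa_model.inference.services.llm.helpers import llm_adapter
    from isa_model.inference.services.embedding.helpers import text_splitter
    from isa_model.core.models import model_storage

The hunks that follow are from isa_model/inference/services/vision/replicate_vision_service.py (item 65).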
@@ -5,13 +5,14 @@ import replicate
  import re
  import ast
  from isa_model.inference.services.vision.base_vision_service import BaseVisionService
- from isa_model.inference.providers.base_provider import BaseProvider
- from isa_model.inference.billing_tracker import ServiceType
+ from isa_model.core.types import ServiceType
+ from isa_model.inference.services.vision.helpers.image_utils import prepare_image_data_url
+ from isa_model.inference.services.vision.helpers.vision_prompts import VisionPromptMixin
  import logging

  logger = logging.getLogger(__name__)

- class ReplicateVisionService(BaseVisionService):
+ class ReplicateVisionService(BaseVisionService, VisionPromptMixin):
  """Enhanced Replicate Vision service supporting multiple specialized models"""

  # Supported model configurations
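For code that imported ServiceType from its 0.3.5 location, the change above implies an import update along these lines (a minimal sketch; it assumes nothing is re-exported from isa_model.inference.billing_tracker, which item 78 shows was removed):

    # 0.3.5
    # from isa_model.inference.billing_tracker import ServiceType
    # 0.3.6
    from isa_model.core.types import ServiceType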
@@ -19,17 +20,18 @@ class ReplicateVisionService(BaseVisionService):
  "cogvlm": "cjwbw/cogvlm:a5092d718ea77a073e6d8f6969d5c0fb87d0ac7e4cdb7175427331e1798a34ed",
  "florence-2": "microsoft/florence-2-large:fcdb54e52322b9e6dce7a35e5d8ad173dce30b46ef49a236c1a71bc6b78b5bed",
  "omniparser": "microsoft/omniparser-v2:49cf3d41b8d3aca1360514e83be4c97131ce8f0d99abfc365526d8384caa88df",
- "yolov8": "adirik/yolov8:3b21ba0e5da47bb2c69a96f72894a31b7c1e77b3e8a7b6ba43b7eb93b7b2c4f4"
+ "yolov8": "adirik/yolov8:3b21ba0e5da47bb2c69a96f72894a31b7c1e77b3e8a7b6ba43b7eb93b7b2c4f4",
+ "qwen-vl-chat": "lucataco/qwen-vl-chat:50881b153b4d5f72b3db697e2bbad23bb1277ab741c5b52d80cd6ee17ea660e9"
  }

- def __init__(self, provider: 'BaseProvider', model_name: str = "cogvlm"):
+ def __init__(self, provider_name: str, model_name: str = "cogvlm", **kwargs):
  # Resolve model name to full model path
  self.model_key = model_name
  resolved_model = self.MODELS.get(model_name, model_name)
- super().__init__(provider, resolved_model)
+ super().__init__(provider_name, resolved_model, **kwargs)

- # Get full configuration from provider
- provider_config = provider.get_full_config()
+ # Get configuration from centralized config manager
+ provider_config = self.get_provider_config()

  # Initialize Replicate client
  try:
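A minimal construction sketch under the new signature. It assumes the Replicate API token is supplied through the centralized config manager rather than a BaseProvider instance, and that "replicate" is the expected provider_name string (neither is shown in this hunk):

    from isa_model.inference.services.vision.replicate_vision_service import ReplicateVisionService

    # 0.3.5: ReplicateVisionService(provider, "cogvlm") with a BaseProvider object
    # 0.3.6: the provider is identified by name; extra settings pass through **kwargs
    service = ReplicateVisionService(provider_name="replicate", model_name="qwen-vl-chat")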
@@ -52,72 +54,15 @@ class ReplicateVisionService(BaseVisionService):

  def _prepare_image(self, image: Union[str, BinaryIO]) -> str:
  """Prepare image for Replicate API - convert to URL or base64"""
- if isinstance(image, str):
- if image.startswith(('http://', 'https://')):
- # Already a URL
- return image
- else:
- # Local file path - need to convert to base64 data URL
- with open(image, "rb") as f:
- image_data = f.read()
- image_b64 = base64.b64encode(image_data).decode()
- # Determine file extension for MIME type
- ext = os.path.splitext(image)[1].lower()
- mime_type = {
- '.jpg': 'image/jpeg',
- '.jpeg': 'image/jpeg',
- '.png': 'image/png',
- '.gif': 'image/gif',
- '.webp': 'image/webp'
- }.get(ext, 'image/jpeg')
- return f"data:{mime_type};base64,{image_b64}"
+ if isinstance(image, str) and image.startswith(('http://', 'https://')):
+ # Already a URL
+ return image
  else:
- # BinaryIO or bytes data - convert to base64 data URL
- if hasattr(image, 'read'):
- image_data = image.read()
- if isinstance(image_data, bytes):
- image_b64 = base64.b64encode(image_data).decode()
- else:
- raise ValueError("File-like object did not return bytes")
- else:
- # Assume it's bytes
- image_b64 = base64.b64encode(image).decode() # type: ignore
- return f"data:image/jpeg;base64,{image_b64}"
+ # Use unified image processing from image_utils
+ return prepare_image_data_url(image)

- async def invoke(
- self,
- image: Union[str, BinaryIO],
- prompt: Optional[str] = None,
- task: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """
- Unified invoke method for all vision operations
- """
- task = task or "analyze"
-
- if task == "analyze":
- return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
- elif task == "element_detection":
- if self.model_key == "omniparser":
- return await self.run_omniparser(image, **kwargs)
- elif self.model_key == "florence-2":
- return await self.run_florence2(image, **kwargs)
- elif self.model_key == "yolov8":
- return await self.run_yolo(image, **kwargs)
- else:
- return await self.detect_objects(image, kwargs.get("confidence_threshold", 0.5))
- elif task == "describe":
- return await self.describe_image(image, kwargs.get("detail_level", "medium"))
- elif task == "extract_text":
- return await self.extract_text(image)
- elif task == "detect_objects":
- return await self.detect_objects(image, kwargs.get("confidence_threshold", 0.5))
- elif task == "classify":
- return await self.classify_image(image, kwargs.get("categories"))
- else:
- # Default to analyze_image for unknown tasks
- return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
+ # Replicate uses the base class invoke method; no override is needed.
+ # The corresponding standard methods are implemented directly instead.

  async def analyze_image(
  self,
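The inline base64 handling removed above now lives in helpers/image_utils.py. A rough sketch of what prepare_image_data_url presumably does, reconstructed from the deleted code (the actual helper is not shown in this diff and may differ):

    import base64
    import os
    from typing import BinaryIO, Union

    def prepare_image_data_url(image: Union[str, bytes, BinaryIO]) -> str:
        """Return a base64 data URL for a local path, raw bytes, or file-like object."""
        if isinstance(image, str):
            # Local file path: read and infer the MIME type from the extension
            with open(image, "rb") as f:
                data = f.read()
            ext = os.path.splitext(image)[1].lower()
            mime = {".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png",
                    ".gif": "image/gif", ".webp": "image/webp"}.get(ext, "image/jpeg")
        else:
            # File-like object or raw bytes; default MIME type as in the old code
            data = image.read() if hasattr(image, "read") else image
            mime = "image/jpeg"
        return f"data:{mime};base64,{base64.b64encode(data).decode()}"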
@@ -129,28 +74,39 @@ class ReplicateVisionService(BaseVisionService):
  Analyze image and provide description or answer questions
  """
  try:
- # Prepare image for API
+ # Prepare image for API using unified processing
  image_input = self._prepare_image(image)

  # Use default prompt if none provided
  if prompt is None:
  prompt = "Describe this image in detail."

- # Run CogVLM model
- output = replicate.run(
- self.model_name,
- input={
- "vqa": True, # Visual Question Answering mode
- "image": image_input,
- "query": prompt
- }
- )
+ # Choose input format based on model type
+ if self.model_key == "qwen-vl-chat":
+ # Qwen-VL-Chat uses simple image + prompt format
+ output = replicate.run(
+ self.model_name,
+ input={
+ "image": image_input,
+ "prompt": prompt
+ }
+ )
+ else:
+ # CogVLM and other models use VQA format
+ output = replicate.run(
+ self.model_name,
+ input={
+ "vqa": True, # Visual Question Answering mode
+ "image": image_input,
+ "query": prompt
+ }
+ )

  # CogVLM returns a string response
  response_text = str(output) if output else ""

  # Track usage for billing
- self._track_usage(
+ await self._track_usage(
  service_type=ServiceType.VISION,
  operation="image_analysis",
  input_tokens=len(prompt.split()) if prompt else 0,
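The branch added above sends a different input payload per model. A usage sketch with the Replicate client, using the version hashes from the MODELS table earlier in this diff (the image URL is a placeholder):

    import replicate

    image_url = "https://example.com/sample.png"  # placeholder

    # qwen-vl-chat: plain image + prompt
    qwen_out = replicate.run(
        "lucataco/qwen-vl-chat:50881b153b4d5f72b3db697e2bbad23bb1277ab741c5b52d80cd6ee17ea660e9",
        input={"image": image_url, "prompt": "Describe this image in detail."},
    )

    # cogvlm: VQA-style input
    cogvlm_out = replicate.run(
        "cjwbw/cogvlm:a5092d718ea77a073e6d8f6969d5c0fb87d0ac7e4cdb7175427331e1798a34ed",
        input={"vqa": True, "image": image_url, "query": "Describe this image in detail."},
    )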
@@ -173,272 +129,131 @@ class ReplicateVisionService(BaseVisionService):
  logger.error(f"Error in image analysis: {e}")
  raise

- async def analyze_images(
- self,
- images: List[Union[str, BinaryIO]],
- prompt: Optional[str] = None,
- max_tokens: int = 1000
- ) -> List[Dict[str, Any]]:
- """Analyze multiple images"""
- results = []
- for image in images:
- result = await self.analyze_image(image, prompt, max_tokens)
- results.append(result)
- return results
+ # ==================== STANDARD INTERFACE: DETECTION & EXTRACTION ====================

- async def describe_image(
- self,
+ async def detect_ui_elements(
+ self,
  image: Union[str, BinaryIO],
- detail_level: str = "medium"
+ element_types: Optional[List[str]] = None,
+ confidence_threshold: float = 0.5
  ) -> Dict[str, Any]:
- """Generate detailed description of image"""
- detail_prompts = {
- "low": "Briefly describe what you see in this image.",
- "medium": "Describe what you see in this image in detail, including objects, colors, and scene.",
- "high": "Provide a comprehensive and detailed description of this image, including all visible objects, their positions, colors, textures, lighting, composition, and any text or symbols present."
- }
-
- prompt = detail_prompts.get(detail_level, detail_prompts["medium"])
- result = await self.analyze_image(image, prompt, 1500)
-
- return {
- "description": result["text"],
- "objects": [], # Would need object detection API
- "scene": result["text"], # Use same description
- "colors": [], # Would need color analysis
- "detail_level": detail_level,
- "metadata": result["metadata"]
- }
+ """
+ UI element detection - implemented with specialized models
+ """
+ if self.model_key == "omniparser":
+ return await self.run_omniparser(image, box_threshold=confidence_threshold)
+ elif self.model_key == "florence-2":
+ return await self.run_florence2(image, task="<OPEN_VOCABULARY_DETECTION>")
+ else:
+ # Fall back to generic object detection
+ return await self.detect_objects(image, confidence_threshold)

- async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
- """Extract text from image (OCR)"""
- prompt = "Extract all text visible in this image. Provide only the text content, maintaining the original structure and formatting as much as possible."
- result = await self.analyze_image(image, prompt, 1000)
-
- return {
- "text": result["text"],
- "confidence": 1.0,
- "bounding_boxes": [], # CogVLM doesn't provide bounding boxes
- "language": "unknown", # Would need language detection
- "metadata": result["metadata"]
- }
+ async def detect_document_elements(
+ self,
+ image: Union[str, BinaryIO],
+ element_types: Optional[List[str]] = None,
+ confidence_threshold: float = 0.5
+ ) -> Dict[str, Any]:
+ """
+ Document structure element detection - implemented with specialized models
+ """
+ if self.model_key == "florence-2":
+ # Florence-2 can detect document structure
+ return await self.run_florence2(image, task="<DETAILED_CAPTION>")
+ else:
+ raise NotImplementedError(f"Document detection not supported for model {self.model_key}")

  async def detect_objects(
  self,
  image: Union[str, BinaryIO],
  confidence_threshold: float = 0.5
  ) -> Dict[str, Any]:
- """Detect objects in image"""
- prompt = """Analyze this image and identify all distinct objects, UI elements, or regions. For each element you identify, provide its location and size as percentages.
-
- Look carefully at the image and identify distinct visual elements like:
- - Text regions, buttons, input fields, images
- - Distinct objects, shapes, or regions
- - Interactive elements like buttons or form controls
-
- For each element, respond in this EXACT format:
- ElementName: x=X%, y=Y%, width=W%, height=H% - Description
-
- Where:
- - x% = horizontal position from left edge (0-100%)
- - y% = vertical position from top edge (0-100%)
- - width% = element width as percentage of image width (0-100%)
- - height% = element height as percentage of image height (0-100%)
-
- Be precise about the actual visual boundaries of each element.
-
- Example: "Submit Button: x=25%, y=60%, width=15%, height=5% - Blue rectangular button with white text"
- """
- result = await self.analyze_image(image, prompt, 1500)
-
- # Parse the response to extract object information with coordinates
- objects = []
- bounding_boxes = []
- lines = result["text"].split('\n')
-
- for line in lines:
- line = line.strip()
- if line and ':' in line and ('x=' in line or 'width=' in line):
- try:
- # Extract object name and details
- parts = line.split(':', 1)
- if len(parts) == 2:
- object_name = parts[0].strip()
- details = parts[1].strip()
-
- # Extract coordinates using regex-like parsing
- coords = {}
- for param in ['x', 'y', 'width', 'height']:
- param_pattern = f"{param}="
- if param_pattern in details:
- start_idx = details.find(param_pattern) + len(param_pattern)
- end_idx = details.find('%', start_idx)
- if end_idx > start_idx:
- try:
- value = float(details[start_idx:end_idx])
- coords[param] = value
- except ValueError:
- continue
-
- # Extract description (after the coordinates)
- desc_start = details.find(' - ')
- description = details[desc_start + 3:] if desc_start != -1 else details
-
- objects.append({
- "label": object_name,
- "confidence": 1.0,
- "coordinates": coords,
- "description": description
- })
-
- # Add bounding box if we have coordinates
- if all(k in coords for k in ['x', 'y', 'width', 'height']):
- bounding_boxes.append({
- "label": object_name,
- "x_percent": coords['x'],
- "y_percent": coords['y'],
- "width_percent": coords['width'],
- "height_percent": coords['height']
- })
-
- except Exception:
- # Fallback for objects that don't match expected format
- objects.append({
- "label": line,
- "confidence": 1.0,
- "coordinates": {},
- "description": line
- })
-
- return {
- "objects": objects,
- "count": len(objects),
- "bounding_boxes": bounding_boxes,
- "metadata": result["metadata"]
- }
+ """
+ Generic object detection - implements the standard interface
+ """
+ if self.model_key == "yolov8":
+ return await self.run_yolo(image, confidence=confidence_threshold)
+ elif self.model_key == "florence-2":
+ return await self.run_florence2(image, task="<OD>")
+ elif self.model_key == "qwen-vl-chat":
+ # Qwen-VL-Chat can do object detection through prompting
+ prompt = self.get_task_prompt("detect_objects", confidence_threshold=confidence_threshold)
+ return await self.analyze_image(image, prompt)
+ else:
+ raise NotImplementedError(f"Object detection not supported for model {self.model_key}")

- async def get_object_coordinates(
- self,
+ # ==================== QWEN-VL-CHAT PROMPT-BASED IMPLEMENTATIONS ====================
+ # Like OpenAI, qwen-vl-chat implements all vision capabilities via prompting
+
+ async def describe_image(
+ self,
  image: Union[str, BinaryIO],
- object_name: str
+ detail_level: str = "medium"
  ) -> Dict[str, Any]:
- """Get coordinates of a specific object in the image"""
- prompt = f"""Locate the {object_name} in this image and return its center coordinates as [x, y] pixels.
-
- Look carefully at the image to find the exact element described. Be very precise about the location.
-
- Respond in this exact format:
- FOUND: YES/NO
- CENTER: [x, y]
- DESCRIPTION: [Brief description]
-
- If found, provide the pixel coordinates of the center point.
- If not found, explain why.
-
- Example:
- FOUND: YES
- CENTER: [640, 360]
- DESCRIPTION: Blue login button in the center-left area
- """
-
- result = await self.analyze_image(image, prompt, 300)
- response_text = result["text"]
-
- # Parse the structured response
- found = False
- center_coords = None
- description = ""
-
- lines = response_text.split('\n')
- for line in lines:
- line = line.strip()
- if line.startswith('FOUND:'):
- found = 'YES' in line.upper()
- elif line.startswith('CENTER:') and found:
- # Extract center coordinates [x, y]
- coords_text = line.replace('CENTER:', '').strip()
- try:
- # Remove brackets and split
- coords_text = coords_text.replace('[', '').replace(']', '')
- if ',' in coords_text:
- x_str, y_str = coords_text.split(',')
- x = int(float(x_str.strip()))
- y = int(float(y_str.strip()))
- center_coords = [x, y]
- except (ValueError, IndexError):
- pass
- elif line.startswith('DESCRIPTION:'):
- description = line.replace('DESCRIPTION:', '').strip()
-
- return {
- "found": found,
- "center_coordinates": center_coords,
- "confidence": 1.0 if found else 0.0,
- "description": description,
- "metadata": result["metadata"]
- }
+ """
+ Image description - implemented via prompting for qwen-vl-chat
+ """
+ if self.model_key == "qwen-vl-chat":
+ prompt = self.get_task_prompt("describe", detail_level=detail_level)
+ return await self.analyze_image(image, prompt)
+ else:
+ raise NotImplementedError(f"describe_image not supported for model {self.model_key}")
+
+ async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+ """
+ Text extraction (OCR) - implemented via prompting for qwen-vl-chat
+ """
+ if self.model_key == "qwen-vl-chat":
+ prompt = self.get_task_prompt("extract_text")
+ return await self.analyze_image(image, prompt)
+ else:
+ raise NotImplementedError(f"extract_text not supported for model {self.model_key}")

  async def classify_image(
  self,
  image: Union[str, BinaryIO],
  categories: Optional[List[str]] = None
  ) -> Dict[str, Any]:
- """Classify image into categories"""
- if categories:
- category_list = ", ".join(categories)
- prompt = f"Classify this image into one of these categories: {category_list}. Respond with only the most appropriate category name."
+ """
+ Image classification - implemented via prompting for qwen-vl-chat
+ """
+ if self.model_key == "qwen-vl-chat":
+ prompt = self.get_task_prompt("classify", categories=categories)
+ return await self.analyze_image(image, prompt)
  else:
- prompt = "What category best describes this image? Provide a single category name."
-
- result = await self.analyze_image(image, prompt, 100)
- category = result["text"].strip()
-
- return {
- "category": category,
- "confidence": 1.0,
- "all_predictions": [{"category": category, "confidence": 1.0}],
- "metadata": result["metadata"]
- }
+ raise NotImplementedError(f"classify_image not supported for model {self.model_key}")

- async def compare_images(
- self,
- image1: Union[str, BinaryIO],
- image2: Union[str, BinaryIO]
+ async def extract_table_data(
+ self,
+ image: Union[str, BinaryIO],
+ table_format: str = "json",
+ preserve_formatting: bool = True
  ) -> Dict[str, Any]:
- """Compare two images for similarity"""
- # For now, analyze both images separately and compare descriptions
- result1 = await self.analyze_image(image1, "Describe this image in detail.")
- result2 = await self.analyze_image(image2, "Describe this image in detail.")
-
- # Use another CogVLM call to compare the descriptions
- comparison_prompt = f"Compare these two image descriptions and provide a similarity analysis:\n\nImage 1: {result1['text']}\n\nImage 2: {result2['text']}\n\nProvide: 1) A similarity score from 0.0 to 1.0, 2) Key differences, 3) Common elements."
-
- # Create a simple text prompt for comparison
- comparison_result = await self.analyze_image(image1, comparison_prompt)
-
- comparison_text = comparison_result["text"]
-
- return {
- "similarity_score": 0.5, # Would need better parsing to extract actual score
- "differences": comparison_text,
- "common_elements": comparison_text,
- "metadata": {
- "model": self.model_name,
- "comparison_method": "description_based"
- }
- }
+ """
+ Table data extraction - implemented via prompting for qwen-vl-chat
+ """
+ if self.model_key == "qwen-vl-chat":
+ prompt = self.get_task_prompt("extract_table_data", table_format=table_format, preserve_formatting=preserve_formatting)
+ return await self.analyze_image(image, prompt)
+ else:
+ raise NotImplementedError(f"extract_table_data not supported for model {self.model_key}")

- def get_supported_formats(self) -> List[str]:
- """Get list of supported image formats"""
- return ['jpg', 'jpeg', 'png', 'gif', 'webp']
+ async def get_object_coordinates(
+ self,
+ image: Union[str, BinaryIO],
+ object_name: str
+ ) -> Dict[str, Any]:
+ """
+ Get object coordinates - implemented via prompting for qwen-vl-chat
+ """
+ if self.model_key == "qwen-vl-chat":
+ prompt = self.get_task_prompt("get_coordinates", object_name=object_name)
+ return await self.analyze_image(image, prompt)
+ else:
+ raise NotImplementedError(f"get_object_coordinates not supported for model {self.model_key}")

- def get_max_image_size(self) -> Dict[str, int]:
- """Get maximum supported image dimensions"""
- return {
- "width": 2048,
- "height": 2048,
- "file_size_mb": 10
- }
+ # ==================== REPLICATE MODEL-SPECIFIC METHODS ====================
+ # The methods below are Replicate-specific specialized model implementations, not part of the standard interface

  # ==================== MODEL-SPECIFIC METHODS ====================
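The qwen-vl-chat branches above all follow the same pattern: look up a task prompt via VisionPromptMixin, then delegate to analyze_image. A hypothetical end-to-end call, assuming a Replicate API token is configured and that analyze_image returns a dict with a "text" key as the earlier hunks suggest:

    import asyncio
    from isa_model.inference.services.vision.replicate_vision_service import ReplicateVisionService

    async def main() -> None:
        service = ReplicateVisionService(provider_name="replicate", model_name="qwen-vl-chat")
        # describe_image -> get_task_prompt("describe", ...) -> analyze_image(...)
        described = await service.describe_image("https://example.com/sample.png", detail_level="high")
        ocr = await service.extract_text("https://example.com/sample.png")
        print(described["text"], ocr["text"])

    asyncio.run(main())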