isa-model 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. isa_model/__init__.py +30 -1
  2. isa_model/client.py +937 -0
  3. isa_model/core/config/__init__.py +16 -0
  4. isa_model/core/config/config_manager.py +514 -0
  5. isa_model/core/config.py +426 -0
  6. isa_model/core/models/model_billing_tracker.py +476 -0
  7. isa_model/core/models/model_manager.py +399 -0
  8. isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
  9. isa_model/core/pricing_manager.py +426 -0
  10. isa_model/core/services/__init__.py +19 -0
  11. isa_model/core/services/intelligent_model_selector.py +547 -0
  12. isa_model/core/types.py +291 -0
  13. isa_model/deployment/__init__.py +2 -0
  14. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
  15. isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
  16. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
  17. isa_model/deployment/cloud/modal/register_models.py +321 -0
  18. isa_model/deployment/runtime/deployed_service.py +338 -0
  19. isa_model/deployment/services/__init__.py +9 -0
  20. isa_model/deployment/services/auto_deploy_vision_service.py +538 -0
  21. isa_model/deployment/services/model_service.py +332 -0
  22. isa_model/deployment/services/service_monitor.py +356 -0
  23. isa_model/deployment/services/service_registry.py +527 -0
  24. isa_model/deployment/services/simple_auto_deploy_vision_service.py +275 -0
  25. isa_model/eval/__init__.py +80 -44
  26. isa_model/eval/config/__init__.py +10 -0
  27. isa_model/eval/config/evaluation_config.py +108 -0
  28. isa_model/eval/evaluators/__init__.py +18 -0
  29. isa_model/eval/evaluators/base_evaluator.py +503 -0
  30. isa_model/eval/evaluators/llm_evaluator.py +472 -0
  31. isa_model/eval/factory.py +417 -709
  32. isa_model/eval/infrastructure/__init__.py +24 -0
  33. isa_model/eval/infrastructure/experiment_tracker.py +466 -0
  34. isa_model/eval/metrics.py +191 -21
  35. isa_model/inference/ai_factory.py +257 -601
  36. isa_model/inference/services/audio/base_stt_service.py +65 -1
  37. isa_model/inference/services/audio/base_tts_service.py +75 -1
  38. isa_model/inference/services/audio/openai_stt_service.py +189 -151
  39. isa_model/inference/services/audio/openai_tts_service.py +12 -10
  40. isa_model/inference/services/audio/replicate_tts_service.py +61 -56
  41. isa_model/inference/services/base_service.py +55 -17
  42. isa_model/inference/services/embedding/base_embed_service.py +65 -1
  43. isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
  44. isa_model/inference/services/embedding/openai_embed_service.py +8 -10
  45. isa_model/inference/services/helpers/stacked_config.py +148 -0
  46. isa_model/inference/services/img/__init__.py +18 -0
  47. isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
  48. isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
  49. isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
  50. isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
  51. isa_model/inference/services/llm/__init__.py +3 -3
  52. isa_model/inference/services/llm/base_llm_service.py +492 -40
  53. isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
  54. isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
  55. isa_model/inference/services/llm/ollama_llm_service.py +51 -17
  56. isa_model/inference/services/llm/openai_llm_service.py +70 -19
  57. isa_model/inference/services/llm/yyds_llm_service.py +24 -23
  58. isa_model/inference/services/vision/__init__.py +38 -4
  59. isa_model/inference/services/vision/base_vision_service.py +218 -117
  60. isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
  61. isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
  62. isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
  63. isa_model/inference/services/vision/helpers/image_utils.py +272 -3
  64. isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
  65. isa_model/inference/services/vision/openai_vision_service.py +104 -307
  66. isa_model/inference/services/vision/replicate_vision_service.py +140 -325
  67. isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
  68. isa_model/scripts/register_models.py +370 -0
  69. isa_model/scripts/register_models_with_embeddings.py +510 -0
  70. isa_model/serving/api/fastapi_server.py +6 -1
  71. isa_model/serving/api/routes/unified.py +274 -0
  72. {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/METADATA +4 -1
  73. {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/RECORD +78 -53
  74. isa_model/config/__init__.py +0 -9
  75. isa_model/config/config_manager.py +0 -213
  76. isa_model/core/model_manager.py +0 -213
  77. isa_model/core/model_registry.py +0 -375
  78. isa_model/core/vision_models_init.py +0 -116
  79. isa_model/inference/billing_tracker.py +0 -406
  80. isa_model/inference/services/llm/triton_llm_service.py +0 -481
  81. isa_model/inference/services/stacked/__init__.py +0 -26
  82. isa_model/inference/services/stacked/config.py +0 -426
  83. isa_model/inference/services/vision/ollama_vision_service.py +0 -194
  84. /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
  85. /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
  86. /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
  87. {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/WHEEL +0 -0
  88. {isa_model-0.3.5.dist-info → isa_model-0.3.7.dist-info}/top_level.txt +0 -0
isa_model/inference/services/stacked/config.py
@@ -1,426 +0,0 @@
- """
- Configuration system for stacked services
- """
-
- from typing import Dict, Any, List
- from dataclasses import dataclass, field
- from enum import Enum
-
- from .base_stacked_service import LayerConfig, LayerType
-
- class WorkflowType(Enum):
-     """Predefined workflow types"""
-     UI_ANALYSIS_FAST = "ui_analysis_fast"
-     UI_ANALYSIS_ACCURATE = "ui_analysis_accurate"
-     UI_ANALYSIS_COMPREHENSIVE = "ui_analysis_comprehensive"
-     SEARCH_PAGE_ANALYSIS = "search_page_analysis"
-     CONTENT_EXTRACTION = "content_extraction"
-     FORM_INTERACTION = "form_interaction"
-     NAVIGATION_ANALYSIS = "navigation_analysis"
-     CUSTOM = "custom"
-
- @dataclass
- class StackedServiceConfig:
-     """Configuration for a stacked service workflow"""
-     name: str
-     workflow_type: WorkflowType
-     layers: List[LayerConfig] = field(default_factory=list)
-     global_timeout: float = 120.0
-     parallel_execution: bool = False
-     fail_fast: bool = False
-     metadata: Dict[str, Any] = field(default_factory=dict)
-
- class ConfigManager:
-     """Manager for stacked service configurations"""
-
-     PREDEFINED_CONFIGS = {
-         WorkflowType.UI_ANALYSIS_FAST: {
-             "name": "Fast UI Analysis",
-             "layers": [
-                 LayerConfig(
-                     name="page_intelligence",
-                     layer_type=LayerType.INTELLIGENCE,
-                     service_type="vision",
-                     model_name="gpt-4.1-nano",
-                     parameters={"max_tokens": 300},
-                     depends_on=[],
-                     timeout=10.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="element_detection",
-                     layer_type=LayerType.DETECTION,
-                     service_type="vision",
-                     model_name="omniparser",
-                     parameters={
-                         "imgsz": 480,
-                         "box_threshold": 0.08,
-                         "iou_threshold": 0.2
-                     },
-                     depends_on=["page_intelligence"],
-                     timeout=15.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="element_classification",
-                     layer_type=LayerType.CLASSIFICATION,
-                     service_type="vision",
-                     model_name="gpt-4.1-nano",
-                     parameters={"max_tokens": 200},
-                     depends_on=["page_intelligence", "element_detection"],
-                     timeout=20.0,
-                     fallback_enabled=False
-                 )
-             ],
-             "global_timeout": 60.0,
-             "parallel_execution": False,
-             "fail_fast": False,
-             "metadata": {
-                 "description": "Fast UI analysis optimized for speed",
-                 "expected_time": "30-45 seconds",
-                 "accuracy": "medium"
-             }
-         },
-
-         WorkflowType.UI_ANALYSIS_ACCURATE: {
-             "name": "Accurate UI Analysis",
-             "layers": [
-                 LayerConfig(
-                     name="page_intelligence",
-                     layer_type=LayerType.INTELLIGENCE,
-                     service_type="vision",
-                     model_name="gpt-4-vision-preview",
-                     parameters={"max_tokens": 800},
-                     depends_on=[],
-                     timeout=20.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="element_detection",
-                     layer_type=LayerType.DETECTION,
-                     service_type="vision",
-                     model_name="omniparser",
-                     parameters={
-                         "imgsz": 640,
-                         "box_threshold": 0.05,
-                         "iou_threshold": 0.1
-                     },
-                     depends_on=["page_intelligence"],
-                     timeout=25.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="element_classification",
-                     layer_type=LayerType.CLASSIFICATION,
-                     service_type="vision",
-                     model_name="gpt-4-vision-preview",
-                     parameters={"max_tokens": 500},
-                     depends_on=["page_intelligence", "element_detection"],
-                     timeout=30.0,
-                     fallback_enabled=False
-                 )
-             ],
-             "global_timeout": 90.0,
-             "parallel_execution": False,
-             "fail_fast": False,
-             "metadata": {
-                 "description": "Balanced UI analysis for production use",
-                 "expected_time": "60-75 seconds",
-                 "accuracy": "high"
-             }
-         },
-
-         WorkflowType.SEARCH_PAGE_ANALYSIS: {
-             "name": "Search Page Analysis",
-             "layers": [
-                 LayerConfig(
-                     name="page_intelligence",
-                     layer_type=LayerType.INTELLIGENCE,
-                     service_type="vision",
-                     model_name="default",
-                     parameters={
-                         "task": "search_page_intelligence",
-                         "max_tokens": 400
-                     },
-                     depends_on=[],
-                     timeout=15.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="element_detection",
-                     layer_type=LayerType.DETECTION,
-                     service_type="vision",
-                     model_name="omniparser",
-                     parameters={
-                         "task": "element_detection",
-                         "imgsz": 640,
-                         "box_threshold": 0.05,
-                         "iou_threshold": 0.1
-                     },
-                     depends_on=["page_intelligence"],
-                     timeout=20.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="element_classification",
-                     layer_type=LayerType.CLASSIFICATION,
-                     service_type="vision",
-                     model_name="default",
-                     parameters={
-                         "task": "search_element_classification",
-                         "max_tokens": 300
-                     },
-                     depends_on=["page_intelligence", "element_detection"],
-                     timeout=25.0,
-                     fallback_enabled=False
-                 )
-             ],
-             "global_timeout": 80.0,
-             "parallel_execution": False,
-             "fail_fast": False,
-             "metadata": {
-                 "description": "Analysis for search pages (Google, Bing, etc.)",
-                 "expected_time": "45-60 seconds",
-                 "accuracy": "high",
-                 "page_types": ["search", "query", "results"]
-             }
-         },
-
-         WorkflowType.CONTENT_EXTRACTION: {
-             "name": "Content Extraction",
-             "layers": [
-                 LayerConfig(
-                     name="page_intelligence",
-                     layer_type=LayerType.INTELLIGENCE,
-                     service_type="vision",
-                     model_name="default",
-                     parameters={
-                         "task": "content_page_intelligence",
-                         "max_tokens": 500
-                     },
-                     depends_on=[],
-                     timeout=15.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="content_detection",
-                     layer_type=LayerType.DETECTION,
-                     service_type="vision",
-                     model_name="florence-2",
-                     parameters={
-                         "task": "<OPEN_VOCABULARY_DETECTION>",
-                         "text_input": "article content, text blocks, headings, paragraphs, links"
-                     },
-                     depends_on=["page_intelligence"],
-                     timeout=25.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="content_classification",
-                     layer_type=LayerType.CLASSIFICATION,
-                     service_type="vision",
-                     model_name="default",
-                     parameters={
-                         "task": "content_classification",
-                         "max_tokens": 400
-                     },
-                     depends_on=["page_intelligence", "content_detection"],
-                     timeout=30.0,
-                     fallback_enabled=False
-                 )
-             ],
-             "global_timeout": 90.0,
-             "parallel_execution": False,
-             "fail_fast": False,
-             "metadata": {
-                 "description": "Extract and analyze content from web pages",
-                 "expected_time": "60-75 seconds",
-                 "accuracy": "high",
-                 "page_types": ["article", "blog", "news", "documentation"]
-             }
-         },
-
-         WorkflowType.UI_ANALYSIS_COMPREHENSIVE: {
-             "name": "Comprehensive UI Analysis",
-             "layers": [
-                 LayerConfig(
-                     name="page_intelligence",
-                     layer_type=LayerType.INTELLIGENCE,
-                     service_type="vision",
-                     model_name="gpt-4-vision-preview",
-                     parameters={"max_tokens": 1000},
-                     depends_on=[],
-                     timeout=25.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="primary_detection",
-                     layer_type=LayerType.DETECTION,
-                     service_type="vision",
-                     model_name="omniparser",
-                     parameters={
-                         "imgsz": 1024,
-                         "box_threshold": 0.03,
-                         "iou_threshold": 0.1
-                     },
-                     depends_on=["page_intelligence"],
-                     timeout=30.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="secondary_detection",
-                     layer_type=LayerType.DETECTION,
-                     service_type="vision",
-                     model_name="florence-2",
-                     parameters={
-                         "task": "<OPEN_VOCABULARY_DETECTION>",
-                         "text_input": "login form elements, input fields, buttons, checkboxes"
-                     },
-                     depends_on=["page_intelligence"],
-                     timeout=25.0,
-                     fallback_enabled=True
-                 ),
-                 LayerConfig(
-                     name="detection_fusion",
-                     layer_type=LayerType.TRANSFORMATION,
-                     service_type="custom",
-                     model_name="fusion_algorithm",
-                     parameters={"fusion_method": "confidence_weighted"},
-                     depends_on=["primary_detection", "secondary_detection"],
-                     timeout=5.0,
-                     fallback_enabled=False
-                 ),
-                 LayerConfig(
-                     name="element_classification",
-                     layer_type=LayerType.CLASSIFICATION,
-                     service_type="vision",
-                     model_name="gpt-4-vision-preview",
-                     parameters={"max_tokens": 600},
-                     depends_on=["page_intelligence", "detection_fusion"],
-                     timeout=40.0,
-                     fallback_enabled=False
-                 ),
-                 LayerConfig(
-                     name="result_validation",
-                     layer_type=LayerType.VALIDATION,
-                     service_type="vision",
-                     model_name="gpt-4.1-nano",
-                     parameters={"validation_criteria": ["completeness", "consistency", "accuracy"]},
-                     depends_on=["element_classification"],
-                     timeout=15.0,
-                     fallback_enabled=True
-                 )
-             ],
-             "global_timeout": 180.0,
-             "parallel_execution": True,  # Enable parallel execution for detection layers
-             "fail_fast": False,
-             "metadata": {
-                 "description": "Most comprehensive UI analysis with multi-model fusion",
-                 "expected_time": "120-150 seconds",
-                 "accuracy": "very high"
-             }
-         }
-     }
-
-     @classmethod
-     def get_config(cls, workflow_type: WorkflowType) -> StackedServiceConfig:
-         """Get predefined configuration for a workflow type"""
-         if workflow_type not in cls.PREDEFINED_CONFIGS:
-             raise ValueError(f"Unknown workflow type: {workflow_type}")
-
-         config_data = cls.PREDEFINED_CONFIGS[workflow_type]
-
-         return StackedServiceConfig(
-             name=config_data["name"],
-             workflow_type=workflow_type,
-             layers=config_data["layers"],
-             global_timeout=config_data["global_timeout"],
-             parallel_execution=config_data["parallel_execution"],
-             fail_fast=config_data["fail_fast"],
-             metadata=config_data["metadata"]
-         )
-
-     @classmethod
-     def create_custom_config(
-         cls,
-         name: str,
-         layers: List[LayerConfig],
-         global_timeout: float = 120.0,
-         parallel_execution: bool = False,
-         fail_fast: bool = False,
-         metadata: Dict[str, Any] = None
-     ) -> StackedServiceConfig:
-         """Create a custom configuration"""
-         return StackedServiceConfig(
-             name=name,
-             workflow_type=WorkflowType.CUSTOM,
-             layers=layers,
-             global_timeout=global_timeout,
-             parallel_execution=parallel_execution,
-             fail_fast=fail_fast,
-             metadata=metadata or {}
-         )
-
-     @classmethod
-     def modify_config(
-         cls,
-         base_config: StackedServiceConfig,
-         modifications: Dict[str, Any]
-     ) -> StackedServiceConfig:
-         """Modify an existing configuration"""
-         # Create a copy
-         new_config = StackedServiceConfig(
-             name=base_config.name,
-             workflow_type=base_config.workflow_type,
-             layers=base_config.layers.copy(),
-             global_timeout=base_config.global_timeout,
-             parallel_execution=base_config.parallel_execution,
-             fail_fast=base_config.fail_fast,
-             metadata=base_config.metadata.copy()
-         )
-
-         # Apply modifications
-         for key, value in modifications.items():
-             if hasattr(new_config, key):
-                 setattr(new_config, key, value)
-             elif key == "layer_modifications":
-                 # Modify specific layers
-                 for layer_name, layer_mods in value.items():
-                     for layer in new_config.layers:
-                         if layer.name == layer_name:
-                             for mod_key, mod_value in layer_mods.items():
-                                 if hasattr(layer, mod_key):
-                                     setattr(layer, mod_key, mod_value)
-                                 elif mod_key == "parameters":
-                                     layer.parameters.update(mod_value)
-
-         return new_config
-
-     @classmethod
-     def get_available_workflows(cls) -> Dict[WorkflowType, Dict[str, Any]]:
-         """Get information about all available workflows"""
-         workflows = {}
-
-         for workflow_type in cls.PREDEFINED_CONFIGS:
-             config_data = cls.PREDEFINED_CONFIGS[workflow_type]
-             workflows[workflow_type] = {
-                 "name": config_data["name"],
-                 "layer_count": len(config_data["layers"]),
-                 "expected_time": config_data["metadata"].get("expected_time", "unknown"),
-                 "accuracy": config_data["metadata"].get("accuracy", "unknown"),
-                 "description": config_data["metadata"].get("description", "")
-             }
-
-         return workflows
-
- # Convenience function for quick access
- def get_ui_analysis_config(speed: str = "accurate") -> StackedServiceConfig:
-     """Get UI analysis configuration by speed preference"""
-     speed_mapping = {
-         "fast": WorkflowType.UI_ANALYSIS_FAST,
-         "accurate": WorkflowType.UI_ANALYSIS_ACCURATE,
-         "comprehensive": WorkflowType.UI_ANALYSIS_COMPREHENSIVE
-     }
-
-     workflow_type = speed_mapping.get(speed.lower(), WorkflowType.UI_ANALYSIS_ACCURATE)
-     return ConfigManager.get_config(workflow_type)
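
For callers, the removed module's public surface was ConfigManager, the WorkflowType enum, and the get_ui_analysis_config convenience function. Below is a minimal sketch of how that API was driven, assuming the pre-0.3.7 import path and that the stacked package's LayerConfig/LayerType dependencies resolve:

    from isa_model.inference.services.stacked.config import (
        ConfigManager,
        WorkflowType,
        get_ui_analysis_config,
    )

    # Select a predefined workflow by speed preference; values other than
    # "fast"/"accurate"/"comprehensive" fall back to the accurate profile.
    config = get_ui_analysis_config(speed="fast")

    # Derive a variant: modify_config copies the base config, then applies
    # top-level overrides and per-layer "layer_modifications".
    tuned = ConfigManager.modify_config(config, {
        "global_timeout": 45.0,
        "layer_modifications": {
            "element_detection": {"timeout": 10.0},
        },
    })

    # Enumerate everything in PREDEFINED_CONFIGS with its summary metadata.
    for workflow, info in ConfigManager.get_available_workflows().items():
        print(workflow.value, info["layer_count"], info["expected_time"])

Judging by the file list, the likely successor in 0.3.7 is the new isa_model/inference/services/helpers/stacked_config.py (entry 45 above), though its contents are not shown in this hunk.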
isa_model/inference/services/vision/ollama_vision_service.py
@@ -1,194 +0,0 @@
- import os
- import json
- import base64
- import ollama
- from typing import Dict, Any, Union, List, Optional, BinaryIO
- from tenacity import retry, stop_after_attempt, wait_exponential
- from isa_model.inference.services.vision.base_vision_service import BaseVisionService
- from isa_model.inference.providers.base_provider import BaseProvider
- import logging
- import requests
-
- logger = logging.getLogger(__name__)
-
- class OllamaVisionService(BaseVisionService):
-     """Vision model service wrapper for Ollama using base64 encoded images"""
-
-     def __init__(self, provider: 'BaseProvider', model_name: str = 'gemma3:4b'):
-         super().__init__(provider, model_name)
-         self.max_tokens = self.config.get('max_tokens', 1000)
-         self.temperature = self.config.get('temperature', 0.7)
-
-     def _get_image_data(self, image: Union[str, BinaryIO]) -> bytes:
-         """Get image data, supporting local files and URLs"""
-         if isinstance(image, str):
-             # Check if it's a URL
-             if image.startswith(('http://', 'https://')):
-                 response = requests.get(image)
-                 response.raise_for_status()
-                 return response.content
-             else:
-                 # Local file path
-                 with open(image, 'rb') as f:
-                     return f.read()
-         else:
-             return image.read()
-
-     @retry(
-         stop=stop_after_attempt(3),
-         wait=wait_exponential(multiplier=1, min=4, max=10),
-         reraise=True
-     )
-     async def analyze_image(
-         self,
-         image: Union[str, BinaryIO],
-         prompt: Optional[str] = None,
-         max_tokens: int = 1000
-     ) -> Dict[str, Any]:
-         """
-         Analyze image and provide description or answer questions
-         """
-         try:
-             # Get the image data
-             image_data = self._get_image_data(image)
-
-             # Convert to base64
-             image_base64 = base64.b64encode(image_data).decode('utf-8')
-
-             # Use a default prompt if none is provided
-             query = prompt or "Please describe the content of this image."
-
-             # Call the ollama library directly
-             response = ollama.chat(
-                 model=self.model_name,
-                 messages=[{
-                     'role': 'user',
-                     'content': query,
-                     'images': [image_base64]
-                 }]
-             )
-
-             content = response['message']['content']
-
-             return {
-                 "text": content,
-                 "confidence": 1.0,  # Ollama doesn't provide confidence scores
-                 "detected_objects": [],  # Basic implementation
-                 "metadata": {
-                     "model": self.model_name,
-                     "prompt": query
-                 }
-             }
-
-         except Exception as e:
-             logger.error(f"Error in image analysis: {e}")
-             raise
-
-     async def analyze_images(
-         self,
-         images: List[Union[str, BinaryIO]],
-         prompt: Optional[str] = None,
-         max_tokens: int = 1000
-     ) -> List[Dict[str, Any]]:
-         """Analyze multiple images"""
-         results = []
-         for image in images:
-             result = await self.analyze_image(image, prompt, max_tokens)
-             results.append(result)
-         return results
-
-     async def describe_image(
-         self,
-         image: Union[str, BinaryIO],
-         detail_level: str = "medium"
-     ) -> Dict[str, Any]:
-         """Generate detailed description of image"""
-         prompts = {
-             "low": "Briefly describe this image.",
-             "medium": "Describe this image in detail, including its content, colors, objects, and scene.",
-             "high": "Describe this image in great detail, including all visible objects, colors, textures, the scene, mood, and any other details."
-         }
-
-         prompt = prompts.get(detail_level, prompts["medium"])
-         result = await self.analyze_image(image, prompt)
-
-         return {
-             "description": result["text"],
-             "objects": [],  # Basic implementation
-             "scene": "unknown",  # Basic implementation
-             "colors": []  # Basic implementation
-         }
-
-     async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
-         """Extract text from image (OCR)"""
-         result = await self.analyze_image(image, "Extract all text content from this image.")
-
-         return {
-             "text": result["text"],
-             "confidence": 1.0,
-             "bounding_boxes": [],  # Basic implementation
-             "language": "unknown"  # Basic implementation
-         }
-
-     async def detect_objects(
-         self,
-         image: Union[str, BinaryIO],
-         confidence_threshold: float = 0.5
-     ) -> Dict[str, Any]:
-         """Detect objects in image"""
-         result = await self.analyze_image(image, "Identify and list all objects in this image.")
-
-         return {
-             "objects": [],  # Basic implementation - would need parsing
-             "count": 0,
-             "bounding_boxes": []
-         }
-
-     async def classify_image(
-         self,
-         image: Union[str, BinaryIO],
-         categories: Optional[List[str]] = None
-     ) -> Dict[str, Any]:
-         """Classify image into categories"""
-         if categories:
-             category_str = ", ".join(categories)
-             prompt = f"Classify this image into one of the following categories: {category_str}"
-         else:
-             prompt = "What category does this image belong to?"
-
-         result = await self.analyze_image(image, prompt)
-
-         return {
-             "category": result["text"],
-             "confidence": 1.0,
-             "all_predictions": [{"category": result["text"], "confidence": 1.0}]
-         }
-
-     async def compare_images(
-         self,
-         image1: Union[str, BinaryIO],
-         image2: Union[str, BinaryIO]
-     ) -> Dict[str, Any]:
-         """Compare two images for similarity"""
-         # For now, analyze each image separately and compare descriptions
-         result1 = await self.analyze_image(image1, "Describe this image.")
-         result2 = await self.analyze_image(image2, "Describe this image.")
-
-         return {
-             "similarity_score": 0.5,  # Basic implementation
-             "differences": "requires further analysis",
-             "common_elements": "requires further analysis"
-         }
-
-     def get_supported_formats(self) -> List[str]:
-         """Get list of supported image formats"""
-         return ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp']
-
-     def get_max_image_size(self) -> Dict[str, int]:
-         """Get maximum supported image dimensions"""
-         return {"width": 4096, "height": 4096}
-
-     async def close(self):
-         """Cleanup resources"""
-         pass
-
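
The deleted service was a thin wrapper over the ollama client: every method funneled into analyze_image, which base64-encodes the image and issues a single ollama.chat call under a tenacity retry policy. A self-contained sketch of that same pattern, assuming a local Ollama daemon with a gemma3:4b model pulled (the path "screenshot.png" and the prompt are placeholders):

    import base64

    import ollama
    from tenacity import retry, stop_after_attempt, wait_exponential

    @retry(stop=stop_after_attempt(3),
           wait=wait_exponential(multiplier=1, min=4, max=10),
           reraise=True)
    def analyze(path: str, prompt: str, model: str = "gemma3:4b") -> str:
        # Read the image and base64-encode it, as the removed service did.
        with open(path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode("utf-8")
        # One chat turn carrying the prompt plus the encoded image.
        response = ollama.chat(model=model, messages=[{
            "role": "user",
            "content": prompt,
            "images": [image_b64],
        }])
        return response["message"]["content"]

    print(analyze("screenshot.png", "Describe this image."))

Nothing in this release's file list looks like a direct replacement for ollama_vision_service.py; the vision services that remain are the reworked openai_vision_service.py and replicate_vision_service.py (entries 65-66 above).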