isa-model 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. isa_model/__init__.py +30 -1
  2. isa_model/client.py +770 -0
  3. isa_model/core/config/__init__.py +16 -0
  4. isa_model/core/config/config_manager.py +514 -0
  5. isa_model/core/config.py +426 -0
  6. isa_model/core/models/model_billing_tracker.py +476 -0
  7. isa_model/core/models/model_manager.py +399 -0
  8. isa_model/core/models/model_repo.py +343 -0
  9. isa_model/core/pricing_manager.py +426 -0
  10. isa_model/core/services/__init__.py +19 -0
  11. isa_model/core/services/intelligent_model_selector.py +547 -0
  12. isa_model/core/types.py +291 -0
  13. isa_model/deployment/__init__.py +2 -0
  14. isa_model/deployment/cloud/__init__.py +9 -0
  15. isa_model/deployment/cloud/modal/__init__.py +10 -0
  16. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +766 -0
  17. isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
  18. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +406 -0
  19. isa_model/deployment/cloud/modal/register_models.py +321 -0
  20. isa_model/deployment/runtime/deployed_service.py +338 -0
  21. isa_model/deployment/services/__init__.py +9 -0
  22. isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
  23. isa_model/deployment/services/model_service.py +332 -0
  24. isa_model/deployment/services/service_monitor.py +356 -0
  25. isa_model/deployment/services/service_registry.py +527 -0
  26. isa_model/eval/__init__.py +80 -44
  27. isa_model/eval/config/__init__.py +10 -0
  28. isa_model/eval/config/evaluation_config.py +108 -0
  29. isa_model/eval/evaluators/__init__.py +18 -0
  30. isa_model/eval/evaluators/base_evaluator.py +503 -0
  31. isa_model/eval/evaluators/llm_evaluator.py +472 -0
  32. isa_model/eval/factory.py +417 -709
  33. isa_model/eval/infrastructure/__init__.py +24 -0
  34. isa_model/eval/infrastructure/experiment_tracker.py +466 -0
  35. isa_model/eval/metrics.py +191 -21
  36. isa_model/inference/ai_factory.py +187 -387
  37. isa_model/inference/providers/modal_provider.py +109 -0
  38. isa_model/inference/providers/yyds_provider.py +108 -0
  39. isa_model/inference/services/__init__.py +2 -1
  40. isa_model/inference/services/audio/base_stt_service.py +65 -1
  41. isa_model/inference/services/audio/base_tts_service.py +75 -1
  42. isa_model/inference/services/audio/openai_stt_service.py +189 -151
  43. isa_model/inference/services/audio/openai_tts_service.py +12 -10
  44. isa_model/inference/services/audio/replicate_tts_service.py +61 -56
  45. isa_model/inference/services/base_service.py +55 -55
  46. isa_model/inference/services/embedding/base_embed_service.py +65 -1
  47. isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
  48. isa_model/inference/services/embedding/openai_embed_service.py +8 -10
  49. isa_model/inference/services/helpers/stacked_config.py +148 -0
  50. isa_model/inference/services/img/__init__.py +18 -0
  51. isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -35
  52. isa_model/inference/services/img/flux_professional_service.py +603 -0
  53. isa_model/inference/services/img/helpers/base_stacked_service.py +274 -0
  54. isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +210 -69
  55. isa_model/inference/services/llm/__init__.py +3 -3
  56. isa_model/inference/services/llm/base_llm_service.py +519 -35
  57. isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +40 -0
  58. isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
  59. isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
  60. isa_model/inference/services/llm/ollama_llm_service.py +150 -15
  61. isa_model/inference/services/llm/openai_llm_service.py +134 -31
  62. isa_model/inference/services/llm/yyds_llm_service.py +255 -0
  63. isa_model/inference/services/vision/__init__.py +38 -4
  64. isa_model/inference/services/vision/base_vision_service.py +241 -96
  65. isa_model/inference/services/vision/disabled/isA_vision_service.py +500 -0
  66. isa_model/inference/services/vision/doc_analysis_service.py +640 -0
  67. isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
  68. isa_model/inference/services/vision/helpers/image_utils.py +272 -3
  69. isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
  70. isa_model/inference/services/vision/openai_vision_service.py +109 -170
  71. isa_model/inference/services/vision/replicate_vision_service.py +508 -0
  72. isa_model/inference/services/vision/ui_analysis_service.py +823 -0
  73. isa_model/scripts/register_models.py +370 -0
  74. isa_model/scripts/register_models_with_embeddings.py +510 -0
  75. isa_model/serving/__init__.py +19 -0
  76. isa_model/serving/api/__init__.py +10 -0
  77. isa_model/serving/api/fastapi_server.py +89 -0
  78. isa_model/serving/api/middleware/__init__.py +9 -0
  79. isa_model/serving/api/middleware/request_logger.py +88 -0
  80. isa_model/serving/api/routes/__init__.py +5 -0
  81. isa_model/serving/api/routes/health.py +82 -0
  82. isa_model/serving/api/routes/llm.py +19 -0
  83. isa_model/serving/api/routes/ui_analysis.py +223 -0
  84. isa_model/serving/api/routes/unified.py +202 -0
  85. isa_model/serving/api/routes/vision.py +19 -0
  86. isa_model/serving/api/schemas/__init__.py +17 -0
  87. isa_model/serving/api/schemas/common.py +33 -0
  88. isa_model/serving/api/schemas/ui_analysis.py +78 -0
  89. {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
  90. isa_model-0.3.6.dist-info/RECORD +147 -0
  91. isa_model/core/model_manager.py +0 -208
  92. isa_model/core/model_registry.py +0 -342
  93. isa_model/inference/billing_tracker.py +0 -406
  94. isa_model/inference/services/llm/triton_llm_service.py +0 -481
  95. isa_model/inference/services/vision/ollama_vision_service.py +0 -194
  96. isa_model-0.3.4.dist-info/RECORD +0 -91
  97. /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
  98. /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
  99. {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
  100. {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/helpers/vision_prompts.py
@@ -0,0 +1,297 @@
+ """
+ Unified prompt service for vision tasks.
+ Provides standardized prompt templates for different vision models to avoid duplicated code.
+ """
+ 
+ from typing import List, Optional, Dict, Any
+ 
+ 
+ class VisionPromptService:
+     """Unified prompt-generation service for vision tasks"""
+ 
+     @staticmethod
+     def get_describe_prompt(detail_level: str = "medium") -> str:
+         """Generate an image-description prompt"""
+         detail_prompts = {
+             "brief": "Please provide a brief, one-sentence description of this image.",
+             "medium": "Please provide a detailed description of this image, including main objects, people, setting, and notable details.",
+             "detailed": "Please provide a comprehensive and detailed description of this image, including all visible objects, people, setting, colors, composition, style, mood, and any other notable details or context."
+         }
+         return detail_prompts.get(detail_level, detail_prompts["medium"])
+ 
+     @staticmethod
+     def get_extract_text_prompt() -> str:
+         """Generate a text-extraction (OCR) prompt"""
+         return """Please extract ALL text content from this image. Requirements:
+ 1. Extract text exactly as it appears
+ 2. Preserve formatting, line breaks, and structure
+ 3. If there are tables, maintain table structure
+ 4. Include headers, captions, and footnotes
+ 5. Return as structured JSON with extracted text and layout information
+ 
+ Format your response as JSON:
+ {
+     "extracted_text": "full text content",
+     "structured_content": {
+         "headers": [],
+         "paragraphs": [],
+         "tables": [],
+         "other": []
+     }
+ }"""
+ 
+     @staticmethod
+     def get_detect_objects_prompt(confidence_threshold: float = 0.5) -> str:
+         """Generate an object-detection prompt"""
+         return f"""Please identify and locate all objects in this image. For each object:
+ 1. Object name/type
+ 2. Approximate location (describe position: top-left, center, bottom-right, etc.)
+ 3. Size (small, medium, large)
+ 4. Confidence level (high, medium, low)
+ 
+ Only include objects you're confident about (confidence > {confidence_threshold}).
+ 
+ Format as JSON:
+ {{
+     "detected_objects": [
+         {{
+             "name": "object_name",
+             "location": "position_description",
+             "size": "relative_size",
+             "confidence": "confidence_level"
+         }}
+     ]
+ }}"""
+ 
+     @staticmethod
+     def get_detect_ui_elements_prompt(element_types: Optional[List[str]] = None) -> str:
+         """Generate a UI-element-detection prompt"""
+         element_filter = f"Focus on these element types: {', '.join(element_types)}" if element_types else "Identify all UI elements"
+ 
+         return f"""Please analyze this user interface image and identify all interactive elements. {element_filter}
+ 
+ For each UI element, provide:
+ 1. Element type (button, input field, dropdown, link, checkbox, radio button, text area, etc.)
+ 2. Text/label content
+ 3. Location description
+ 4. Interactive state (enabled, disabled, selected, etc.)
+ 
+ Format as JSON:
+ {{
+     "ui_elements": [
+         {{
+             "type": "element_type",
+             "text": "visible_text",
+             "location": "position_description",
+             "state": "element_state",
+             "confidence": "detection_confidence"
+         }}
+     ]
+ }}"""
+ 
+     @staticmethod
+     def get_detect_document_elements_prompt() -> str:
+         """Generate a document-element-detection prompt"""
+         return """Please analyze this document image and extract its structure and content.
+ 
+ Identify and extract:
+ 1. Headers and subheaders (with hierarchy level)
+ 2. Paragraphs and body text
+ 3. Tables (with rows and columns)
+ 4. Lists (ordered/unordered)
+ 5. Images and captions
+ 6. Footnotes and references
+ 
+ Format as JSON:
+ {
+     "document_structure": {
+         "title": "document_title",
+         "headers": [
+             {"level": 1, "text": "header_text", "position": "location"}
+         ],
+         "paragraphs": [
+             {"text": "paragraph_content", "position": "location"}
+         ],
+         "tables": [
+             {"rows": [["cell1", "cell2"]], "caption": "table_caption"}
+         ],
+         "lists": [
+             {"type": "ordered/unordered", "items": ["item1", "item2"]}
+         ]
+     }
+ }"""
+ 
+     @staticmethod
+     def get_extract_table_data_prompt(table_format: str = "json", preserve_formatting: bool = True) -> str:
+         """Generate a table-data-extraction prompt"""
+         format_instructions = {
+             "json": "Return the table data as a JSON structure with arrays for headers and rows",
+             "csv": "Return the table data in CSV format",
+             "markdown": "Return the table data in Markdown table format",
+             "html": "Return the table data as an HTML table"
+         }
+ 
+         format_instruction = format_instructions.get(table_format, format_instructions["json"])
+         formatting_note = "Preserve cell merging, formatting, and styling information" if preserve_formatting else "Extract data in simplified format"
+ 
+         return f"""Please extract ALL table data from this image with high precision. {formatting_note}
+ 
+ Requirements:
+ 1. Identify all tables in the image
+ 2. Extract headers, rows, and data accurately
+ 3. Maintain data relationships and structure
+ 4. Handle merged cells appropriately
+ 5. Include any table captions or titles
+ 6. {format_instruction}
+ 
+ For each table, provide:
+ - Table identifier/caption
+ - Column headers
+ - All row data
+ - Metadata about structure (row/column counts, merged cells)
+ 
+ Return as structured JSON:
+ {{
+     "tables": [
+         {{
+             "table_id": "table_1",
+             "caption": "table_title_if_any",
+             "headers": ["Column1", "Column2", "Column3"],
+             "rows": [
+                 ["data1", "data2", "data3"],
+                 ["data4", "data5", "data6"]
+             ],
+             "metadata": {{
+                 "row_count": 2,
+                 "column_count": 3,
+                 "has_headers": true,
+                 "merged_cells": [
+                     {{"row": 0, "col": 0, "rowspan": 1, "colspan": 2}}
+                 ],
+                 "data_types": ["text", "number", "text"]
+             }}
+         }}
+     ],
+     "extraction_metadata": {{
+         "total_tables": 1,
+         "extraction_confidence": "high",
+         "format": "{table_format}",
+         "preserve_formatting": {str(preserve_formatting).lower()}
+     }}
+ }}
+ 
+ Important:
+ - Be extremely accurate with data extraction
+ - Preserve numbers exactly as they appear
+ - Handle currency, percentages, and special characters correctly
+ - If cells are empty, represent them as empty strings or null
+ - For merged cells, include merge information in metadata"""
+ 
+     @staticmethod
+     def get_classify_image_prompt(categories: Optional[List[str]] = None) -> str:
+         """Generate an image-classification prompt"""
+         if categories:
+             return f"""Please classify this image into one of these categories: {', '.join(categories)}
+ 
+ Provide:
+ 1. The most appropriate category
+ 2. Confidence level (0.0-1.0)
+ 3. Brief reasoning
+ 
+ Format as JSON:
+ {{
+     "classification": "selected_category",
+     "confidence": 0.95,
+     "reasoning": "explanation"
+ }}"""
+         else:
+             return """Please classify this image by identifying its main category and subcategory.
+ 
+ Provide:
+ 1. Main category (e.g., nature, technology, people, etc.)
+ 2. Subcategory (more specific classification)
+ 3. Confidence level
+ 4. Key features that led to this classification
+ 
+ Format as JSON:
+ {
+     "main_category": "primary_category",
+     "subcategory": "specific_type",
+     "confidence": 0.95,
+     "key_features": ["feature1", "feature2"]
+ }"""
+ 
+     @staticmethod
+     def get_object_coordinates_prompt(object_name: str) -> str:
+         """Generate an object-coordinate-detection prompt"""
+         return f"""Please locate '{object_name}' in this image and provide detailed location information.
+ 
+ Provide:
+ 1. Whether the object was found
+ 2. Detailed position description
+ 3. Approximate coordinates (if possible, describe as percentages from top-left)
+ 4. Size and boundaries
+ 
+ Format as JSON:
+ {{
+     "found": true/false,
+     "object_name": "{object_name}",
+     "location": "detailed_position_description",
+     "coordinates": "approximate_position_as_percentages",
+     "size": "object_size_description",
+     "confidence": "detection_confidence"
+ }}"""
+ 
+     @staticmethod
+     def get_compare_images_prompt() -> str:
+         """Generate an image-comparison prompt"""
+         return """Please compare the objects, styles, and content in this image. Highlight similarities and differences.
+ 
+ Provide:
+ 1. Main similarities
+ 2. Key differences
+ 3. Style comparison
+ 4. Content analysis
+ 
+ Format as JSON:
+ {
+     "comparison": {
+         "similarities": ["similarity1", "similarity2"],
+         "differences": ["difference1", "difference2"],
+         "style_analysis": "style_comparison",
+         "content_analysis": "content_comparison"
+     }
+ }"""
+ 
+ 
+ class VisionPromptMixin:
+     """
+     Mixin class that provides unified prompt support for vision services.
+     Any vision service can inherit this mixin to get the standard prompts.
+     """
+ 
+     def get_task_prompt(self, task: str, **kwargs) -> str:
+         """Return the prompt for the given task type"""
+         if task == "describe":
+             return VisionPromptService.get_describe_prompt(kwargs.get("detail_level", "medium"))
+         elif task == "extract_text":
+             return VisionPromptService.get_extract_text_prompt()
+         elif task == "detect_objects":
+             return VisionPromptService.get_detect_objects_prompt(kwargs.get("confidence_threshold", 0.5))
+         elif task == "detect_ui_elements":
+             return VisionPromptService.get_detect_ui_elements_prompt(kwargs.get("element_types"))
+         elif task == "detect_document_elements":
+             return VisionPromptService.get_detect_document_elements_prompt()
+         elif task == "extract_table_data":
+             return VisionPromptService.get_extract_table_data_prompt(
+                 kwargs.get("table_format", "json"),
+                 kwargs.get("preserve_formatting", True)
+             )
+         elif task == "classify":
+             return VisionPromptService.get_classify_image_prompt(kwargs.get("categories"))
+         elif task == "get_coordinates":
+             return VisionPromptService.get_object_coordinates_prompt(kwargs.get("object_name", ""))
+         elif task == "compare":
+             return VisionPromptService.get_compare_images_prompt()
+         else:
+             return "Please analyze this image and provide detailed information."
isa_model/inference/services/vision/openai_vision_service.py
@@ -1,32 +1,31 @@
  from typing import Dict, Any, Union, List, Optional, BinaryIO
- import base64
- import aiohttp
  from openai import AsyncOpenAI
  from tenacity import retry, stop_after_attempt, wait_exponential
  from isa_model.inference.services.vision.base_vision_service import BaseVisionService
- from isa_model.inference.providers.base_provider import BaseProvider
- from isa_model.inference.billing_tracker import ServiceType
+ from isa_model.inference.services.vision.helpers.image_utils import prepare_image_base64
+ from isa_model.inference.services.vision.helpers.vision_prompts import VisionPromptMixin
+ from isa_model.core.types import ServiceType
  import logging
  
  logger = logging.getLogger(__name__)
  
- class OpenAIVisionService(BaseVisionService):
-     """OpenAI Vision service using gpt-4.1-nano with vision capabilities"""
+ class OpenAIVisionService(BaseVisionService, VisionPromptMixin):
+     """OpenAI Vision service using centralized config management"""
  
-     def __init__(self, provider: 'BaseProvider', model_name: str = "gpt-4.1-nano"):
-         super().__init__(provider, model_name)
+     def __init__(self, provider_name: str, model_name: str = "gpt-4o-mini", **kwargs):
+         super().__init__(provider_name, model_name, **kwargs)
  
-         # Get full configuration from provider (including sensitive data)
-         provider_config = provider.get_full_config()
+         # Get configuration from the centralized config manager
+         provider_config = self.get_provider_config()
  
-         # Initialize AsyncOpenAI client with provider configuration
+         # Initialize AsyncOpenAI client with centralized configuration
          try:
              if not provider_config.get("api_key"):
                  raise ValueError("OpenAI API key not found in provider configuration")
  
              self._client = AsyncOpenAI(
                  api_key=provider_config["api_key"],
-                 base_url=provider_config.get("base_url", "https://api.openai.com/v1"),
+                 base_url=provider_config.get("api_base_url", "https://api.openai.com/v1"),
                  organization=provider_config.get("organization")
              )
  
@@ -44,31 +43,7 @@ class OpenAIVisionService(BaseVisionService):
          """Get the underlying OpenAI client"""
          return self._client
  
-     async def _download_image(self, image_url: str) -> bytes:
-         """Download image from URL"""
-         async with aiohttp.ClientSession() as session:
-             async with session.get(image_url) as response:
-                 if response.status == 200:
-                     return await response.read()
-                 else:
-                     raise ValueError(f"Failed to download image from {image_url}: {response.status}")
  
-     def _encode_image(self, image_path_or_data: Union[str, bytes, BinaryIO]) -> str:
-         """Encode image to base64"""
-         if isinstance(image_path_or_data, str):
-             # If it's a file path
-             with open(image_path_or_data, "rb") as image_file:
-                 return base64.b64encode(image_file.read()).decode("utf-8")
-         elif hasattr(image_path_or_data, 'read'):
-             # If it's a file-like object (BinaryIO)
-             data = image_path_or_data.read()  # type: ignore
-             if isinstance(data, bytes):
-                 return base64.b64encode(data).decode("utf-8")
-             else:
-                 raise ValueError("File-like object did not return bytes")
-         else:
-             # If it's bytes data
-             return base64.b64encode(image_path_or_data).decode("utf-8")  # type: ignore
  
      @retry(
          stop=stop_after_attempt(3),
@@ -93,22 +68,8 @@ class OpenAIVisionService(BaseVisionService):
              Dict containing analysis results
          """
          try:
-             # Handle different input types
-             if isinstance(image, str):
-                 if image.startswith(('http://', 'https://')):
-                     # Download image from URL
-                     image_bytes = await self._download_image(image)
-                     base64_image = self._encode_image(image_bytes)
-                 else:
-                     # File path
-                     base64_image = self._encode_image(image)
-             else:
-                 # BinaryIO or bytes data
-                 if hasattr(image, 'read'):
-                     image_data = image.read()
-                 else:
-                     image_data = image
-                 base64_image = self._encode_image(image_data)
+             # Use unified image processing from image_utils
+             base64_image = prepare_image_base64(image)
  
              # Use default prompt if none provided
              if prompt is None:
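The helper that replaces the two deleted methods lives in isa_model/inference/services/vision/helpers/image_utils.py (+272 lines, not shown in this diff). A plausible sketch of its contract, assuming it mirrors the input handling it supersedes and fetches URLs synchronously (the call above is not awaited):

    # Hypothetical reconstruction of prepare_image_base64; the real
    # implementation is in image_utils.py and may differ.
    import base64
    from typing import BinaryIO, Union

    import requests  # assumption: a synchronous HTTP client

    def prepare_image_base64(image: Union[str, bytes, BinaryIO]) -> str:
        """Normalize a URL, file path, bytes, or file-like object into base64."""
        if isinstance(image, str):
            if image.startswith(("http://", "https://")):
                resp = requests.get(image)
                resp.raise_for_status()
                data = resp.content
            else:
                with open(image, "rb") as f:
                    data = f.read()
        elif hasattr(image, "read"):
            data = image.read()
        else:
            data = image
        return base64.b64encode(data).decode("utf-8")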
@@ -140,7 +101,7 @@
  
              # Track usage for billing
              if response.usage:
-                 self._track_usage(
+                 await self._track_usage(
                      service_type=ServiceType.VISION,
                      operation="image_analysis",
                      input_tokens=response.usage.prompt_tokens,
@@ -150,14 +111,36 @@
  
              content = response.choices[0].message.content or ""
  
+             # Try to parse the response as JSON (for structured tasks)
+             try:
+                 import json
+                 # Check whether the response looks like a JSON object
+                 if content.strip().startswith('{') and content.strip().endswith('}'):
+                     parsed_json = json.loads(content)
+                     return {
+                         "text": content,
+                         "parsed_data": parsed_json,
+                         "confidence": 1.0,
+                         "metadata": {
+                             "model": self.model_name,
+                             "prompt": prompt[:100],
+                             "tokens_used": response.usage.total_tokens if response.usage else 0,
+                             "response_format": "json"
+                         }
+                     }
+             except json.JSONDecodeError:
+                 pass
+ 
+             # Standard text response
              return {
                  "text": content,
                  "confidence": 1.0,  # OpenAI doesn't provide confidence scores
-                 "detected_objects": [],  # Would need separate object detection
+                 "detected_objects": [],  # Populated by specific detection methods
                  "metadata": {
                      "model": self.model_name,
-                     "prompt": prompt,
-                     "tokens_used": response.usage.total_tokens if response.usage else 0
+                     "prompt": prompt[:100],
+                     "tokens_used": response.usage.total_tokens if response.usage else 0,
+                     "response_format": "text"
                  }
              }
  
@@ -165,148 +148,104 @@
              logger.error(f"Error in image analysis: {e}")
              raise
  
-     async def analyze_images(
-         self,
-         images: List[Union[str, BinaryIO]],
-         prompt: Optional[str] = None,
-         max_tokens: int = 1000
-     ) -> List[Dict[str, Any]]:
-         """Analyze multiple images"""
-         results = []
-         for image in images:
-             result = await self.analyze_image(image, prompt, max_tokens)
-             results.append(result)
-         return results
+     # ==================== Prompt-based intelligent capabilities ====================
+     # OpenAI can cover most vision tasks simply by changing the prompt.
+     # The shared VisionPromptMixin supplies the standard prompts.
  
+     # Override the remaining methods to use the task-specific prompts
      async def describe_image(
          self,
          image: Union[str, BinaryIO],
          detail_level: str = "medium"
      ) -> Dict[str, Any]:
-         """Generate detailed description of image"""
-         detail_prompts = {
-             "low": "Briefly describe what you see in this image.",
-             "medium": "Describe what you see in this image in detail, including objects, colors, and scene.",
-             "high": "Provide a comprehensive and detailed description of this image, including all visible objects, their positions, colors, textures, lighting, composition, and any text or symbols present."
-         }
- 
-         prompt = detail_prompts.get(detail_level, detail_prompts["medium"])
-         result = await self.analyze_image(image, prompt, 1500)
- 
-         return {
-             "description": result["text"],
-             "objects": [],  # Would need object detection API
-             "scene": result["text"],  # Use same description
-             "colors": [],  # Would need color analysis
-             "detail_level": detail_level,
-             "metadata": result["metadata"]
-         }
+         """
+         Image description - uses a dedicated prompt
+         """
+         prompt = self.get_task_prompt("describe", detail_level=detail_level)
+         return await self.analyze_image(image, prompt)
  
      async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
-         """Extract text from image (OCR)"""
-         prompt = "Extract all text visible in this image. Provide only the text content, maintaining the original structure and formatting as much as possible."
-         result = await self.analyze_image(image, prompt, 1000)
+         """
+         Text extraction (OCR) - uses a dedicated prompt
+         """
+         prompt = self.get_task_prompt("extract_text")
  
-         return {
-             "text": result["text"],
-             "confidence": 1.0,
-             "bounding_boxes": [],  # OpenAI vision doesn't provide bounding boxes
-             "language": "unknown",  # Would need language detection
-             "metadata": result["metadata"]
-         }
+         return await self.analyze_image(image, prompt)
  
      async def detect_objects(
          self,
          image: Union[str, BinaryIO],
          confidence_threshold: float = 0.5
      ) -> Dict[str, Any]:
-         """Detect objects in image"""
-         prompt = "List all objects visible in this image. For each object, provide the object name and a brief description of its location in the image."
-         result = await self.analyze_image(image, prompt, 1000)
+         """
+         Object detection - uses a dedicated prompt
+         """
+         prompt = self.get_task_prompt("detect_objects", confidence_threshold=confidence_threshold)
+ 
+         return await self.analyze_image(image, prompt)
+ 
+     async def detect_ui_elements(
+         self,
+         image: Union[str, BinaryIO],
+         element_types: Optional[List[str]] = None,
+         confidence_threshold: float = 0.5
+     ) -> Dict[str, Any]:
+         """
+         UI element detection - uses a dedicated prompt
+         """
+         prompt = self.get_task_prompt("detect_ui_elements", element_types=element_types, confidence_threshold=confidence_threshold)
  
-         # Parse the response to extract object information
-         objects = []
-         lines = result["text"].split('\n')
-         for line in lines:
-             line = line.strip()
-             if line and not line.startswith(('In this image', 'The image shows', 'I can see')):
-                 objects.append({
-                     "label": line,
-                     "confidence": 1.0  # OpenAI doesn't provide confidence scores
-                 })
+         return await self.analyze_image(image, prompt)
+ 
+     async def detect_document_elements(
+         self,
+         image: Union[str, BinaryIO],
+         element_types: Optional[List[str]] = None,
+         confidence_threshold: float = 0.5
+     ) -> Dict[str, Any]:
+         """
+         Document element detection - uses a dedicated prompt
+         """
+         prompt = self.get_task_prompt("detect_document_elements", element_types=element_types, confidence_threshold=confidence_threshold)
  
-         return {
-             "objects": objects,
-             "count": len(objects),
-             "bounding_boxes": [],  # Not available with current API
-             "metadata": result["metadata"]
-         }
+         return await self.analyze_image(image, prompt)
  
      async def classify_image(
          self,
          image: Union[str, BinaryIO],
          categories: Optional[List[str]] = None
      ) -> Dict[str, Any]:
-         """Classify image into categories"""
-         if categories:
-             category_list = ", ".join(categories)
-             prompt = f"Classify this image into one of these categories: {category_list}. Respond with only the most appropriate category name."
-         else:
-             prompt = "What category best describes this image? Provide a single category name."
- 
-         result = await self.analyze_image(image, prompt, 100)
-         category = result["text"].strip()
+         """
+         Image classification - uses a dedicated prompt
+         """
+         prompt = self.get_task_prompt("classify", categories=categories)
  
-         return {
-             "category": category,
-             "confidence": 1.0,
-             "all_predictions": [{"category": category, "confidence": 1.0}],
-             "metadata": result["metadata"]
-         }
+         return await self.analyze_image(image, prompt)
  
-     async def compare_images(
-         self,
-         image1: Union[str, BinaryIO],
-         image2: Union[str, BinaryIO]
+     async def get_object_coordinates(
+         self,
+         image: Union[str, BinaryIO],
+         object_name: str
      ) -> Dict[str, Any]:
-         """Compare two images for similarity"""
-         # For now, analyze both images separately and compare descriptions
-         result1 = await self.analyze_image(image1, "Describe this image in detail.")
-         result2 = await self.analyze_image(image2, "Describe this image in detail.")
- 
-         # Use LLM to compare the descriptions
-         comparison_prompt = f"Compare these two image descriptions and provide a similarity analysis:\n\nImage 1: {result1['text']}\n\nImage 2: {result2['text']}\n\nProvide: 1) A similarity score from 0.0 to 1.0, 2) Key differences, 3) Common elements."
- 
-         comparison_result = await self._client.chat.completions.create(
-             model=self.model_name,
-             messages=[{"role": "user", "content": comparison_prompt}],
-             max_tokens=500,
-             temperature=0.3
-         )
- 
-         comparison_text = comparison_result.choices[0].message.content or ""
+         """
+         Object coordinate lookup - uses a dedicated prompt
+         """
+         prompt = self.get_task_prompt("get_coordinates", object_name=object_name)
  
-         return {
-             "similarity_score": 0.5,  # Would need better parsing to extract actual score
-             "differences": comparison_text,
-             "common_elements": comparison_text,
-             "metadata": {
-                 "model": self.model_name,
-                 "comparison_method": "description_based"
-             }
-         }
- 
-     def get_supported_formats(self) -> List[str]:
-         """Get list of supported image formats"""
-         return ['jpg', 'jpeg', 'png', 'gif', 'webp']
+         return await self.analyze_image(image, prompt)
  
-     def get_max_image_size(self) -> Dict[str, int]:
-         """Get maximum supported image dimensions"""
-         return {
-             "width": 2048,
-             "height": 2048,
-             "file_size_mb": 20
-         }
+     async def extract_table_data(
+         self,
+         image: Union[str, BinaryIO],
+         table_format: str = "json",
+         preserve_formatting: bool = True
+     ) -> Dict[str, Any]:
+         """
+         Structured table-data extraction - uses the dedicated table-extraction prompt
+         """
+         prompt = self.get_task_prompt("extract_table_data", table_format=table_format, preserve_formatting=preserve_formatting)
+ 
+         return await self.analyze_image(image, prompt)
  
      async def close(self):
          """Clean up resources"""