isa-model 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- isa_model/config/__init__.py +9 -0
- isa_model/config/config_manager.py +213 -0
- isa_model/core/model_manager.py +5 -0
- isa_model/core/model_registry.py +39 -6
- isa_model/core/storage/supabase_storage.py +344 -0
- isa_model/core/vision_models_init.py +116 -0
- isa_model/deployment/cloud/__init__.py +9 -0
- isa_model/deployment/cloud/modal/__init__.py +10 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +612 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +305 -0
- isa_model/inference/ai_factory.py +238 -14
- isa_model/inference/providers/modal_provider.py +109 -0
- isa_model/inference/providers/yyds_provider.py +108 -0
- isa_model/inference/services/__init__.py +2 -1
- isa_model/inference/services/base_service.py +0 -38
- isa_model/inference/services/llm/base_llm_service.py +32 -0
- isa_model/inference/services/llm/llm_adapter.py +73 -3
- isa_model/inference/services/llm/ollama_llm_service.py +104 -3
- isa_model/inference/services/llm/openai_llm_service.py +67 -15
- isa_model/inference/services/llm/yyds_llm_service.py +254 -0
- isa_model/inference/services/stacked/__init__.py +26 -0
- isa_model/inference/services/stacked/base_stacked_service.py +269 -0
- isa_model/inference/services/stacked/config.py +426 -0
- isa_model/inference/services/stacked/doc_analysis_service.py +640 -0
- isa_model/inference/services/stacked/flux_professional_service.py +579 -0
- isa_model/inference/services/stacked/ui_analysis_service.py +1319 -0
- isa_model/inference/services/vision/base_image_gen_service.py +0 -34
- isa_model/inference/services/vision/base_vision_service.py +46 -2
- isa_model/inference/services/vision/isA_vision_service.py +402 -0
- isa_model/inference/services/vision/openai_vision_service.py +151 -9
- isa_model/inference/services/vision/replicate_image_gen_service.py +166 -38
- isa_model/inference/services/vision/replicate_vision_service.py +693 -0
- isa_model/serving/__init__.py +19 -0
- isa_model/serving/api/__init__.py +10 -0
- isa_model/serving/api/fastapi_server.py +84 -0
- isa_model/serving/api/middleware/__init__.py +9 -0
- isa_model/serving/api/middleware/request_logger.py +88 -0
- isa_model/serving/api/routes/__init__.py +5 -0
- isa_model/serving/api/routes/health.py +82 -0
- isa_model/serving/api/routes/llm.py +19 -0
- isa_model/serving/api/routes/ui_analysis.py +223 -0
- isa_model/serving/api/routes/vision.py +19 -0
- isa_model/serving/api/schemas/__init__.py +17 -0
- isa_model/serving/api/schemas/common.py +33 -0
- isa_model/serving/api/schemas/ui_analysis.py +78 -0
- {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/METADATA +1 -1
- {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/RECORD +49 -17
- {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/WHEEL +0 -0
- {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/base_image_gen_service.py

@@ -68,40 +68,6 @@ class BaseImageGenService(BaseService):
         """
         pass
 
-    @abstractmethod
-    async def generate_image_to_file(
-        self,
-        prompt: str,
-        output_path: str,
-        negative_prompt: Optional[str] = None,
-        width: int = 512,
-        height: int = 512,
-        num_inference_steps: int = 20,
-        guidance_scale: float = 7.5,
-        seed: Optional[int] = None
-    ) -> Dict[str, Any]:
-        """
-        Generate image and save directly to file
-
-        Args:
-            prompt: Text description of the desired image
-            output_path: Path to save the generated image
-            negative_prompt: Text describing what to avoid in the image
-            width: Image width in pixels
-            height: Image height in pixels
-            num_inference_steps: Number of denoising steps
-            guidance_scale: How closely to follow the prompt
-            seed: Random seed for reproducible results
-
-        Returns:
-            Dict containing generation results with keys:
-            - file_path: Path to saved image
-            - width: Image width
-            - height: Image height
-            - seed: Seed used for generation
-        """
-        pass
-
     @abstractmethod
     async def image_to_image(
         self,
isa_model/inference/services/vision/base_vision_service.py

@@ -5,6 +5,28 @@ from isa_model.inference.services.base_service import BaseService
 class BaseVisionService(BaseService):
     """Base class for vision understanding services"""
 
+    @abstractmethod
+    async def invoke(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Unified invoke method for all vision operations
+
+        Args:
+            image: Path to image file or image data
+            prompt: Optional text prompt/question about the image
+            task: Task type (analyze, describe, extract_text, detect_objects, etc.)
+            **kwargs: Additional task-specific parameters
+
+        Returns:
+            Dict containing task results
+        """
+        pass
+
     @abstractmethod
     async def analyze_image(
         self,
@@ -103,9 +125,31 @@ class BaseVisionService(BaseService):
 
         Returns:
             Dict containing detection results with keys:
-            - objects: List of detected objects with labels and
+            - objects: List of detected objects with labels, confidence, and coordinates
             - count: Number of objects detected
-            - bounding_boxes: Object locations
+            - bounding_boxes: Object locations with coordinates
+        """
+        pass
+
+    @abstractmethod
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
+    ) -> Dict[str, Any]:
+        """
+        Get coordinates of a specific object in the image
+
+        Args:
+            image: Path to image file or image data
+            object_name: Name of the object to locate
+
+        Returns:
+            Dict containing coordinate results with keys:
+            - found: Boolean indicating if object was found
+            - center_coordinates: List [x, y] with pixel coordinates of center point
+            - confidence: Confidence score for the detection
+            - description: Description of the object location
         """
         pass
 
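These two abstract methods make a unified `invoke` entry point and `get_object_coordinates` part of the shared vision interface. A minimal caller-side sketch is below; the concrete class `MyVisionService`, its import path, and the constructor arguments are assumptions for illustration and are not part of this package.

```python
import asyncio

# Hypothetical concrete implementation of BaseVisionService; any provider
# service that implements the new abstract methods would be used the same way.
from my_provider import MyVisionService  # assumed import, not part of isa_model

async def main():
    # Constructor shape assumed; ISAVisionService below takes (provider, model_name)
    service = MyVisionService(provider=None, model_name="demo-model")

    # Route through the unified entry point added in this release
    result = await service.invoke("screenshot.png", task="detect_objects")
    print(result.get("count"), "objects detected")

    # Locate a single element by name using the new abstract method
    coords = await service.get_object_coordinates("screenshot.png", "login button")
    if coords.get("found"):
        x, y = coords["center_coordinates"]
        print(f"login button center at ({x}, {y}), confidence {coords['confidence']}")

asyncio.run(main())
```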
isa_model/inference/services/vision/isA_vision_service.py

@@ -0,0 +1,402 @@
+"""
+ISA Vision Service
+
+Connects to self-hosted Modal UI detection service
+Provides vision capabilities using our deployed models
+"""
+
+import modal
+import base64
+import io
+import logging
+from typing import Dict, Any, List, Union, Optional, BinaryIO
+from PIL import Image
+
+from .base_vision_service import BaseVisionService
+
+logger = logging.getLogger(__name__)
+
+class ISAVisionService(BaseVisionService):
+    """ISA Vision Service using Modal backend"""
+
+    def __init__(self, provider, model_name: str):
+        super().__init__(provider, model_name)
+        self.ui_app = None
+        self.doc_app = None
+        self._initialize_modal_connections()
+
+    def _initialize_modal_connections(self):
+        """Initialize connections to Modal services"""
+        try:
+            # Connect to UI detection service
+            self.ui_app = modal.App.lookup("isa-vision-ui", create_if_missing=False)
+            logger.info("Connected to UI detection service")
+        except Exception as e:
+            logger.warning(f"UI service not available: {e}")
+            self.ui_app = None
+
+        try:
+            # Connect to document analysis service (when deployed)
+            self.doc_app = modal.App.lookup("isa-vision-doc", create_if_missing=False)
+            logger.info("Connected to document analysis service")
+        except Exception as e:
+            logger.warning(f"Document service not available: {e}")
+            self.doc_app = None
+
+    async def invoke(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Unified invoke method for all vision operations
+        """
+        if task == "detect_ui" or task == "ui_analysis":
+            return await self.detect_objects(image, **kwargs)
+        elif task == "extract_text" or task == "ocr":
+            return await self.extract_text(image)
+        elif task == "analyze_document":
+            return await self._analyze_document(image)
+        else:
+            return await self.analyze_image(image, prompt, **kwargs)
+
+    async def analyze_image(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> Dict[str, Any]:
+        """Analyze image using UI detection service"""
+
+        if not self.ui_app:
+            return {
+                'error': 'UI detection service not available',
+                'success': False
+            }
+
+        try:
+            # Convert image to base64
+            image_b64 = self._encode_image(image)
+
+            # Call Modal UI detection service using from_name (new API)
+            ui_detector = modal.Cls.from_name("isa-vision-ui", "UIDetectionService")
+            result = ui_detector().detect_ui_elements.remote(image_b64)
+
+            if result.get('success'):
+                return {
+                    'success': True,
+                    'service': 'isa-vision',
+                    'text': f"Detected {result.get('element_count', 0)} UI elements",
+                    'detected_objects': result.get('ui_elements', []),
+                    'confidence': 0.9,
+                    'metadata': {
+                        'processing_time': result.get('processing_time'),
+                        'detection_method': result.get('detection_method'),
+                        'model_info': result.get('model_info')
+                    }
+                }
+            else:
+                return {
+                    'success': False,
+                    'error': result.get('error', 'Unknown error'),
+                    'service': 'isa-vision'
+                }
+
+        except Exception as e:
+            logger.error(f"Image analysis failed: {e}")
+            return {
+                'success': False,
+                'error': str(e),
+                'service': 'isa-vision'
+            }
+
+    async def analyze_images(
+        self,
+        images: List[Union[str, BinaryIO]],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> List[Dict[str, Any]]:
+        """Analyze multiple images"""
+        results = []
+        for image in images:
+            result = await self.analyze_image(image, prompt, max_tokens)
+            results.append(result)
+        return results
+
+    async def describe_image(
+        self,
+        image: Union[str, BinaryIO],
+        detail_level: str = "medium"
+    ) -> Dict[str, Any]:
+        """Generate description using UI detection"""
+        result = await self.analyze_image(image)
+
+        if result.get('success'):
+            objects = result.get('detected_objects', [])
+            description = f"This appears to be a user interface with {len(objects)} interactive elements. "
+
+            if objects:
+                element_types = list(set([obj.get('type', 'element') for obj in objects]))
+                description += f"The interface contains: {', '.join(element_types)}."
+
+            return {
+                'success': True,
+                'description': description,
+                'objects': objects,
+                'scene': 'User Interface',
+                'colors': ['unknown']  # Could be enhanced with color detection
+            }
+        else:
+            return result
+
+    async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Extract text using document analysis service"""
+
+        if not self.doc_app:
+            # Fallback to UI service for basic text detection
+            return await self._extract_text_fallback(image)
+
+        try:
+            # Convert image to base64
+            image_b64 = self._encode_image(image)
+
+            # Call Modal document analysis service using from_name (new API)
+            doc_analyzer = modal.Cls.from_name("isa-vision-doc", "DocumentAnalysisService")
+            result = doc_analyzer().extract_text.remote(image_b64)
+
+            if result.get('success'):
+                text_results = result.get('text_results', [])
+                all_text = ' '.join([item.get('text', '') for item in text_results])
+
+                return {
+                    'success': True,
+                    'service': 'isa-vision-doc',
+                    'text': all_text,
+                    'confidence': sum([item.get('confidence', 0) for item in text_results]) / len(text_results) if text_results else 0,
+                    'bounding_boxes': [item.get('bbox') for item in text_results],
+                    'language': 'auto-detected',
+                    'metadata': {
+                        'processing_time': result.get('processing_time'),
+                        'text_count': result.get('text_count')
+                    }
+                }
+            else:
+                return {
+                    'success': False,
+                    'error': result.get('error', 'OCR failed'),
+                    'service': 'isa-vision-doc'
+                }
+
+        except Exception as e:
+            logger.error(f"Text extraction failed: {e}")
+            return {
+                'success': False,
+                'error': str(e),
+                'service': 'isa-vision-doc'
+            }
+
+    async def detect_objects(
+        self,
+        image: Union[str, BinaryIO],
+        confidence_threshold: float = 0.5
+    ) -> Dict[str, Any]:
+        """Detect UI elements using UI detection service"""
+
+        result = await self.analyze_image(image)
+
+        if result.get('success'):
+            objects = result.get('detected_objects', [])
+            # Filter by confidence threshold
+            filtered_objects = [obj for obj in objects if obj.get('confidence', 0) >= confidence_threshold]
+
+            return {
+                'success': True,
+                'service': 'isa-vision-ui',
+                'objects': filtered_objects,
+                'count': len(filtered_objects),
+                'bounding_boxes': [obj.get('bbox') for obj in filtered_objects],
+                'metadata': result.get('metadata', {})
+            }
+        else:
+            return result
+
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
+    ) -> Dict[str, Any]:
+        """Get coordinates of specific UI element"""
+
+        detection_result = await self.detect_objects(image)
+
+        if not detection_result.get('success'):
+            return detection_result
+
+        objects = detection_result.get('objects', [])
+
+        # Look for object by name/type
+        for obj in objects:
+            obj_type = obj.get('type', '').lower()
+            obj_content = obj.get('content', '').lower()
+
+            if object_name.lower() in obj_type or object_name.lower() in obj_content:
+                return {
+                    'success': True,
+                    'found': True,
+                    'center_coordinates': obj.get('center', [0, 0]),
+                    'confidence': obj.get('confidence', 0),
+                    'description': f"Found {obj.get('type')} at center coordinates",
+                    'object_info': obj
+                }
+
+        return {
+            'success': True,
+            'found': False,
+            'center_coordinates': [0, 0],
+            'confidence': 0,
+            'description': f"Object '{object_name}' not found in image"
+        }
+
+    async def classify_image(
+        self,
+        image: Union[str, BinaryIO],
+        categories: Optional[List[str]] = None
+    ) -> Dict[str, Any]:
+        """Classify image type"""
+
+        result = await self.analyze_image(image)
+
+        if result.get('success'):
+            objects = result.get('detected_objects', [])
+
+            # Simple classification based on detected UI elements
+            if objects:
+                category = "user_interface"
+                confidence = 0.9
+            else:
+                category = "unknown"
+                confidence = 0.1
+
+            return {
+                'success': True,
+                'category': category,
+                'confidence': confidence,
+                'all_predictions': [
+                    {'category': category, 'confidence': confidence}
+                ]
+            }
+        else:
+            return result
+
+    async def compare_images(
+        self,
+        image1: Union[str, BinaryIO],
+        image2: Union[str, BinaryIO]
+    ) -> Dict[str, Any]:
+        """Compare two images based on UI elements"""
+
+        result1 = await self.analyze_image(image1)
+        result2 = await self.analyze_image(image2)
+
+        if not (result1.get('success') and result2.get('success')):
+            return {
+                'success': False,
+                'error': 'Failed to analyze one or both images'
+            }
+
+        objects1 = result1.get('detected_objects', [])
+        objects2 = result2.get('detected_objects', [])
+
+        # Simple comparison based on element counts and types
+        count_diff = abs(len(objects1) - len(objects2))
+        types1 = set([obj.get('type') for obj in objects1])
+        types2 = set([obj.get('type') for obj in objects2])
+
+        common_types = types1.intersection(types2)
+        unique_types = types1.symmetric_difference(types2)
+
+        similarity_score = len(common_types) / max(len(types1.union(types2)), 1)
+
+        return {
+            'success': True,
+            'similarity_score': similarity_score,
+            'differences': f"Different element types: {list(unique_types)}",
+            'common_elements': f"Common element types: {list(common_types)}",
+            'metadata': {
+                'elements_count_1': len(objects1),
+                'elements_count_2': len(objects2),
+                'count_difference': count_diff
+            }
+        }
+
+    def get_supported_formats(self) -> List[str]:
+        """Get supported image formats"""
+        return ['jpg', 'jpeg', 'png', 'bmp', 'gif', 'tiff']
+
+    def get_max_image_size(self) -> Dict[str, int]:
+        """Get maximum image dimensions"""
+        return {'width': 4096, 'height': 4096}
+
+    async def close(self):
+        """Cleanup resources"""
+        # Modal connections don't need explicit cleanup
+        pass
+
+    # Helper methods
+
+    def _encode_image(self, image: Union[str, BinaryIO]) -> str:
+        """Convert image to base64 string"""
+        if isinstance(image, str):
+            # File path
+            with open(image, 'rb') as f:
+                image_data = f.read()
+        else:
+            # Binary data
+            if hasattr(image, 'read'):
+                image_data = image.read()
+            else:
+                # Assume it's bytes
+                image_data = bytes(image) if not isinstance(image, bytes) else image
+
+        return base64.b64encode(image_data).decode('utf-8')
+
+    async def _extract_text_fallback(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Fallback OCR using UI service (basic text detection)"""
+        # For now, return placeholder
+        return {
+            'success': False,
+            'error': 'OCR service not available, deploy document analysis service',
+            'text': '',
+            'confidence': 0,
+            'service': 'isa-vision-fallback'
+        }
+
+    async def _analyze_document(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Analyze document with tables and OCR"""
+
+        if not self.doc_app:
+            return {
+                'success': False,
+                'error': 'Document analysis service not deployed',
+                'service': 'isa-vision-doc'
+            }
+
+        try:
+            # Convert image to base64
+            image_b64 = self._encode_image(image)
+
+            # Call Modal document analysis service using from_name (new API)
+            doc_analyzer = modal.Cls.from_name("isa-vision-doc", "DocumentAnalysisService")
+            result = doc_analyzer().analyze_document_complete.remote(image_b64)
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Document analysis failed: {e}")
+            return {
+                'success': False,
+                'error': str(e),
+                'service': 'isa-vision-doc'
+            }
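A rough usage sketch of the new service follows. It assumes the `isa-vision-ui` Modal app is already deployed (the module looks it up by name at init time) and that a placeholder provider argument is acceptable; in practice the instance would normally be obtained through the package's AI factory rather than constructed directly.

```python
import asyncio
from isa_model.inference.services.vision.isA_vision_service import ISAVisionService

async def main():
    # provider handling is assumed here; __init__ only forwards it to the base class
    service = ISAVisionService(provider=None, model_name="isa-vision")

    # UI element detection via the deployed Modal "isa-vision-ui" app
    ui = await service.invoke("screenshot.png", task="detect_ui")
    if ui.get("success"):
        for element in ui.get("objects", []):
            print(element.get("type"), element.get("center"))

    # OCR routes to the "isa-vision-doc" app, or to a placeholder fallback
    # response until that document analysis service is deployed
    text = await service.invoke("receipt.png", task="ocr")
    print(text.get("text", ""), text.get("error", ""))

asyncio.run(main())
```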
isa_model/inference/services/vision/openai_vision_service.py

@@ -70,6 +70,32 @@ class OpenAIVisionService(BaseVisionService):
         # If it's bytes data
         return base64.b64encode(image_path_or_data).decode("utf-8")  # type: ignore
 
+    async def invoke(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Unified invoke method for all vision operations
+        """
+        task = task or "analyze"
+
+        if task == "analyze":
+            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
+        elif task == "describe":
+            return await self.describe_image(image, kwargs.get("detail_level", "medium"))
+        elif task == "extract_text":
+            return await self.extract_text(image)
+        elif task == "detect_objects":
+            return await self.detect_objects(image, kwargs.get("confidence_threshold", 0.5))
+        elif task == "classify":
+            return await self.classify_image(image, kwargs.get("categories"))
+        else:
+            # Default to analyze_image for unknown tasks
+            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=4, max=10),
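The same task-string routing can be exercised directly against the OpenAI-backed service. A minimal sketch, assuming `vision_service` is an already-constructed OpenAIVisionService (for example obtained from the package's AI factory, with OpenAI credentials configured); construction details are outside this diff:

```python
# `vision_service` is assumed to be an existing OpenAIVisionService instance.
async def inspect(vision_service, path: str):
    # "describe" and "detect_objects" are dispatched by the task string added above
    summary = await vision_service.invoke(path, task="describe", detail_level="medium")
    objects = await vision_service.invoke(path, task="detect_objects", confidence_threshold=0.7)

    # Unknown task strings fall back to analyze_image with the given prompt
    fallback = await vision_service.invoke(path, prompt="What is shown here?", task="something_else")
    return summary, objects, fallback
```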
isa_model/inference/services/vision/openai_vision_service.py

@@ -221,24 +247,140 @@ class OpenAIVisionService(BaseVisionService):
         confidence_threshold: float = 0.5
     ) -> Dict[str, Any]:
         """Detect objects in image"""
-        prompt = "List all objects visible in this image. For each object, provide
-
+        prompt = """List all objects visible in this image. For each object, provide:
+        1. Object name
+        2. Approximate location as percentages from top-left corner (x%, y%)
+        3. Approximate size as percentages of image dimensions (width%, height%)
+        4. Brief description
+
+        Format each object as: "ObjectName: x=X%, y=Y%, width=W%, height=H% - Description"
+
+        Example: "Car: x=25%, y=40%, width=15%, height=12% - Red sedan in the center"
+        """
+        result = await self.analyze_image(image, prompt, 1500)
 
-        # Parse the response to extract object information
+        # Parse the response to extract object information with coordinates
         objects = []
+        bounding_boxes = []
         lines = result["text"].split('\n')
+
         for line in lines:
             line = line.strip()
-            if line and
-
-
-
-
+            if line and ':' in line and ('x=' in line or 'width=' in line):
+                try:
+                    # Extract object name and details
+                    parts = line.split(':', 1)
+                    if len(parts) == 2:
+                        object_name = parts[0].strip()
+                        details = parts[1].strip()
+
+                        # Extract coordinates using regex-like parsing
+                        coords = {}
+                        for param in ['x', 'y', 'width', 'height']:
+                            param_pattern = f"{param}="
+                            if param_pattern in details:
+                                start_idx = details.find(param_pattern) + len(param_pattern)
+                                end_idx = details.find('%', start_idx)
+                                if end_idx > start_idx:
+                                    try:
+                                        value = float(details[start_idx:end_idx])
+                                        coords[param] = value
+                                    except ValueError:
+                                        continue
+
+                        # Extract description (after the coordinates)
+                        desc_start = details.find(' - ')
+                        description = details[desc_start + 3:] if desc_start != -1 else details
+
+                        objects.append({
+                            "label": object_name,
+                            "confidence": 1.0,
+                            "coordinates": coords,
+                            "description": description
+                        })
+
+                        # Add bounding box if we have coordinates
+                        if all(k in coords for k in ['x', 'y', 'width', 'height']):
+                            bounding_boxes.append({
+                                "label": object_name,
+                                "x_percent": coords['x'],
+                                "y_percent": coords['y'],
+                                "width_percent": coords['width'],
+                                "height_percent": coords['height']
+                            })
+
+                except Exception:
+                    # Fallback for objects that don't match expected format
+                    objects.append({
+                        "label": line,
+                        "confidence": 1.0,
+                        "coordinates": {},
+                        "description": line
+                    })
 
         return {
             "objects": objects,
             "count": len(objects),
-            "bounding_boxes":
+            "bounding_boxes": bounding_boxes,
+            "metadata": result["metadata"]
+        }
+
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
+    ) -> Dict[str, Any]:
+        """Get coordinates of a specific object in the image"""
+        prompt = f"""Locate the {object_name} in this image and return its center coordinates as [x, y] pixels.
+
+        Respond in this exact format:
+        FOUND: YES/NO
+        CENTER: [x, y]
+        DESCRIPTION: [Brief description]
+
+        If found, provide the pixel coordinates of the center point.
+        If not found, explain why.
+
+        Example:
+        FOUND: YES
+        CENTER: [640, 360]
+        DESCRIPTION: Blue login button in the center-left area
+        """
+
+        result = await self.analyze_image(image, prompt, 300)
+        response_text = result["text"]
+
+        # Parse the structured response
+        found = False
+        center_coords = None
+        description = ""
+
+        lines = response_text.split('\n')
+        for line in lines:
+            line = line.strip()
+            if line.startswith('FOUND:'):
+                found = 'YES' in line.upper()
+            elif line.startswith('CENTER:') and found:
+                # Extract center coordinates [x, y]
+                coords_text = line.replace('CENTER:', '').strip()
+                try:
+                    # Remove brackets and split
+                    coords_text = coords_text.replace('[', '').replace(']', '')
+                    if ',' in coords_text:
+                        x_str, y_str = coords_text.split(',')
+                        x = int(float(x_str.strip()))
+                        y = int(float(y_str.strip()))
+                        center_coords = [x, y]
+                except (ValueError, IndexError):
+                    pass
+            elif line.startswith('DESCRIPTION:'):
+                description = line.replace('DESCRIPTION:', '').strip()
+
+        return {
+            "found": found,
+            "center_coordinates": center_coords,
+            "confidence": 1.0 if found else 0.0,
+            "description": description,
             "metadata": result["metadata"]
         }
 
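For orientation, a small sketch of consuming the structured result of `get_object_coordinates`, for example to drive a UI-automation click. The `click_at` callback is hypothetical and stands in for whatever automation layer is used; the result keys (`found`, `center_coordinates`, `description`) come from the method above.

```python
async def click_object(vision_service, screenshot_path: str, name: str, click_at):
    """Locate `name` in the screenshot and click its center if found."""
    result = await vision_service.get_object_coordinates(screenshot_path, name)

    if result["found"] and result["center_coordinates"] is not None:
        x, y = result["center_coordinates"]
        click_at(x, y)  # hypothetical automation callback
        return True

    # Not found: the model's explanation is returned in `description`
    print(f"Could not locate {name!r}: {result['description']}")
    return False
```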