isa-model 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. isa_model/config/__init__.py +9 -0
  2. isa_model/config/config_manager.py +213 -0
  3. isa_model/core/model_manager.py +5 -0
  4. isa_model/core/model_registry.py +39 -6
  5. isa_model/core/storage/supabase_storage.py +344 -0
  6. isa_model/core/vision_models_init.py +116 -0
  7. isa_model/deployment/cloud/__init__.py +9 -0
  8. isa_model/deployment/cloud/modal/__init__.py +10 -0
  9. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +612 -0
  10. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +305 -0
  11. isa_model/inference/ai_factory.py +238 -14
  12. isa_model/inference/providers/modal_provider.py +109 -0
  13. isa_model/inference/providers/yyds_provider.py +108 -0
  14. isa_model/inference/services/__init__.py +2 -1
  15. isa_model/inference/services/base_service.py +0 -38
  16. isa_model/inference/services/llm/base_llm_service.py +32 -0
  17. isa_model/inference/services/llm/llm_adapter.py +73 -3
  18. isa_model/inference/services/llm/ollama_llm_service.py +104 -3
  19. isa_model/inference/services/llm/openai_llm_service.py +67 -15
  20. isa_model/inference/services/llm/yyds_llm_service.py +254 -0
  21. isa_model/inference/services/stacked/__init__.py +26 -0
  22. isa_model/inference/services/stacked/base_stacked_service.py +269 -0
  23. isa_model/inference/services/stacked/config.py +426 -0
  24. isa_model/inference/services/stacked/doc_analysis_service.py +640 -0
  25. isa_model/inference/services/stacked/flux_professional_service.py +579 -0
  26. isa_model/inference/services/stacked/ui_analysis_service.py +1319 -0
  27. isa_model/inference/services/vision/base_image_gen_service.py +0 -34
  28. isa_model/inference/services/vision/base_vision_service.py +46 -2
  29. isa_model/inference/services/vision/isA_vision_service.py +402 -0
  30. isa_model/inference/services/vision/openai_vision_service.py +151 -9
  31. isa_model/inference/services/vision/replicate_image_gen_service.py +166 -38
  32. isa_model/inference/services/vision/replicate_vision_service.py +693 -0
  33. isa_model/serving/__init__.py +19 -0
  34. isa_model/serving/api/__init__.py +10 -0
  35. isa_model/serving/api/fastapi_server.py +84 -0
  36. isa_model/serving/api/middleware/__init__.py +9 -0
  37. isa_model/serving/api/middleware/request_logger.py +88 -0
  38. isa_model/serving/api/routes/__init__.py +5 -0
  39. isa_model/serving/api/routes/health.py +82 -0
  40. isa_model/serving/api/routes/llm.py +19 -0
  41. isa_model/serving/api/routes/ui_analysis.py +223 -0
  42. isa_model/serving/api/routes/vision.py +19 -0
  43. isa_model/serving/api/schemas/__init__.py +17 -0
  44. isa_model/serving/api/schemas/common.py +33 -0
  45. isa_model/serving/api/schemas/ui_analysis.py +78 -0
  46. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/METADATA +1 -1
  47. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/RECORD +49 -17
  48. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/WHEEL +0 -0
  49. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/base_image_gen_service.py
@@ -68,40 +68,6 @@ class BaseImageGenService(BaseService):
         """
         pass
 
-    @abstractmethod
-    async def generate_image_to_file(
-        self,
-        prompt: str,
-        output_path: str,
-        negative_prompt: Optional[str] = None,
-        width: int = 512,
-        height: int = 512,
-        num_inference_steps: int = 20,
-        guidance_scale: float = 7.5,
-        seed: Optional[int] = None
-    ) -> Dict[str, Any]:
-        """
-        Generate image and save directly to file
-
-        Args:
-            prompt: Text description of the desired image
-            output_path: Path to save the generated image
-            negative_prompt: Text describing what to avoid in the image
-            width: Image width in pixels
-            height: Image height in pixels
-            num_inference_steps: Number of denoising steps
-            guidance_scale: How closely to follow the prompt
-            seed: Random seed for reproducible results
-
-        Returns:
-            Dict containing generation results with keys:
-            - file_path: Path to saved image
-            - width: Image width
-            - height: Image height
-            - seed: Seed used for generation
-        """
-        pass
-
     @abstractmethod
     async def image_to_image(
         self,
isa_model/inference/services/vision/base_vision_service.py
@@ -5,6 +5,28 @@ from isa_model.inference.services.base_service import BaseService
 class BaseVisionService(BaseService):
     """Base class for vision understanding services"""
 
+    @abstractmethod
+    async def invoke(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Unified invoke method for all vision operations
+
+        Args:
+            image: Path to image file or image data
+            prompt: Optional text prompt/question about the image
+            task: Task type (analyze, describe, extract_text, detect_objects, etc.)
+            **kwargs: Additional task-specific parameters
+
+        Returns:
+            Dict containing task results
+        """
+        pass
+
     @abstractmethod
     async def analyze_image(
         self,
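
With this abstract contract, every vision service exposes one entry point and callers vary only the task string and keyword arguments. A minimal caller-side sketch, assuming "vision" is any already-constructed concrete subclass (construction is provider-specific and not shown in this hunk):

    async def run(vision):
        # No task given: implementations fall back to general image analysis
        caption = await vision.invoke("screenshot.png", prompt="What is on screen?")
        # Explicit task; extra keyword arguments pass through **kwargs
        found = await vision.invoke("screenshot.png", task="detect_objects",
                                    confidence_threshold=0.8)
        print(caption.get("text"), found.get("count"))
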
@@ -103,9 +125,31 @@ class BaseVisionService(BaseService):
 
         Returns:
             Dict containing detection results with keys:
-            - objects: List of detected objects with labels and confidence
+            - objects: List of detected objects with labels, confidence, and coordinates
             - count: Number of objects detected
-            - bounding_boxes: Object locations (if available)
+            - bounding_boxes: Object locations with coordinates
+        """
+        pass
+
+    @abstractmethod
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
+    ) -> Dict[str, Any]:
+        """
+        Get coordinates of a specific object in the image
+
+        Args:
+            image: Path to image file or image data
+            object_name: Name of the object to locate
+
+        Returns:
+            Dict containing coordinate results with keys:
+            - found: Boolean indicating if object was found
+            - center_coordinates: List [x, y] with pixel coordinates of center point
+            - confidence: Confidence score for the detection
+            - description: Description of the object location
         """
         pass
 
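
The keys promised here (found, center_coordinates, confidence, description) are what UI-automation callers rely on. A hedged sketch of such a caller; click() is a hypothetical automation helper, not part of this package:

    async def click_element(vision, screenshot: str, name: str) -> bool:
        result = await vision.get_object_coordinates(screenshot, name)
        if result["found"] and result["confidence"] >= 0.5:
            x, y = result["center_coordinates"]
            click(x, y)  # hypothetical helper
            return True
        print(result["description"])
        return False
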
isa_model/inference/services/vision/isA_vision_service.py
@@ -0,0 +1,402 @@
+"""
+ISA Vision Service
+
+Connects to self-hosted Modal UI detection service
+Provides vision capabilities using our deployed models
+"""
+
+import modal
+import base64
+import io
+import logging
+from typing import Dict, Any, List, Union, Optional, BinaryIO
+from PIL import Image
+
+from .base_vision_service import BaseVisionService
+
+logger = logging.getLogger(__name__)
+
+class ISAVisionService(BaseVisionService):
+    """ISA Vision Service using Modal backend"""
+
+    def __init__(self, provider, model_name: str):
+        super().__init__(provider, model_name)
+        self.ui_app = None
+        self.doc_app = None
+        self._initialize_modal_connections()
+
+    def _initialize_modal_connections(self):
+        """Initialize connections to Modal services"""
+        try:
+            # Connect to UI detection service
+            self.ui_app = modal.App.lookup("isa-vision-ui", create_if_missing=False)
+            logger.info("Connected to UI detection service")
+        except Exception as e:
+            logger.warning(f"UI service not available: {e}")
+            self.ui_app = None
+
+        try:
+            # Connect to document analysis service (when deployed)
+            self.doc_app = modal.App.lookup("isa-vision-doc", create_if_missing=False)
+            logger.info("Connected to document analysis service")
+        except Exception as e:
+            logger.warning(f"Document service not available: {e}")
+            self.doc_app = None
+
+    async def invoke(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Unified invoke method for all vision operations
+        """
+        if task == "detect_ui" or task == "ui_analysis":
+            return await self.detect_objects(image, **kwargs)
+        elif task == "extract_text" or task == "ocr":
+            return await self.extract_text(image)
+        elif task == "analyze_document":
+            return await self._analyze_document(image)
+        else:
+            return await self.analyze_image(image, prompt, **kwargs)
+
+    async def analyze_image(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> Dict[str, Any]:
+        """Analyze image using UI detection service"""
+
+        if not self.ui_app:
+            return {
+                'error': 'UI detection service not available',
+                'success': False
+            }
+
+        try:
+            # Convert image to base64
+            image_b64 = self._encode_image(image)
+
+            # Call Modal UI detection service using from_name (new API)
+            ui_detector = modal.Cls.from_name("isa-vision-ui", "UIDetectionService")
+            result = ui_detector().detect_ui_elements.remote(image_b64)
+
+            if result.get('success'):
+                return {
+                    'success': True,
+                    'service': 'isa-vision',
+                    'text': f"Detected {result.get('element_count', 0)} UI elements",
+                    'detected_objects': result.get('ui_elements', []),
+                    'confidence': 0.9,
+                    'metadata': {
+                        'processing_time': result.get('processing_time'),
+                        'detection_method': result.get('detection_method'),
+                        'model_info': result.get('model_info')
+                    }
+                }
+            else:
+                return {
+                    'success': False,
+                    'error': result.get('error', 'Unknown error'),
+                    'service': 'isa-vision'
+                }
+
+        except Exception as e:
+            logger.error(f"Image analysis failed: {e}")
+            return {
+                'success': False,
+                'error': str(e),
+                'service': 'isa-vision'
+            }
+
+    async def analyze_images(
+        self,
+        images: List[Union[str, BinaryIO]],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> List[Dict[str, Any]]:
+        """Analyze multiple images"""
+        results = []
+        for image in images:
+            result = await self.analyze_image(image, prompt, max_tokens)
+            results.append(result)
+        return results
+
+    async def describe_image(
+        self,
+        image: Union[str, BinaryIO],
+        detail_level: str = "medium"
+    ) -> Dict[str, Any]:
+        """Generate description using UI detection"""
+        result = await self.analyze_image(image)
+
+        if result.get('success'):
+            objects = result.get('detected_objects', [])
+            description = f"This appears to be a user interface with {len(objects)} interactive elements. "
+
+            if objects:
+                element_types = list(set([obj.get('type', 'element') for obj in objects]))
+                description += f"The interface contains: {', '.join(element_types)}."
+
+            return {
+                'success': True,
+                'description': description,
+                'objects': objects,
+                'scene': 'User Interface',
+                'colors': ['unknown']  # Could be enhanced with color detection
+            }
+        else:
+            return result
+
+    async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Extract text using document analysis service"""
+
+        if not self.doc_app:
+            # Fallback to UI service for basic text detection
+            return await self._extract_text_fallback(image)
+
+        try:
+            # Convert image to base64
+            image_b64 = self._encode_image(image)
+
+            # Call Modal document analysis service using from_name (new API)
+            doc_analyzer = modal.Cls.from_name("isa-vision-doc", "DocumentAnalysisService")
+            result = doc_analyzer().extract_text.remote(image_b64)
+
+            if result.get('success'):
+                text_results = result.get('text_results', [])
+                all_text = ' '.join([item.get('text', '') for item in text_results])
+
+                return {
+                    'success': True,
+                    'service': 'isa-vision-doc',
+                    'text': all_text,
+                    'confidence': sum([item.get('confidence', 0) for item in text_results]) / len(text_results) if text_results else 0,
+                    'bounding_boxes': [item.get('bbox') for item in text_results],
+                    'language': 'auto-detected',
+                    'metadata': {
+                        'processing_time': result.get('processing_time'),
+                        'text_count': result.get('text_count')
+                    }
+                }
+            else:
+                return {
+                    'success': False,
+                    'error': result.get('error', 'OCR failed'),
+                    'service': 'isa-vision-doc'
+                }
+
+        except Exception as e:
+            logger.error(f"Text extraction failed: {e}")
+            return {
+                'success': False,
+                'error': str(e),
+                'service': 'isa-vision-doc'
+            }
+
+    async def detect_objects(
+        self,
+        image: Union[str, BinaryIO],
+        confidence_threshold: float = 0.5
+    ) -> Dict[str, Any]:
+        """Detect UI elements using UI detection service"""
+
+        result = await self.analyze_image(image)
+
+        if result.get('success'):
+            objects = result.get('detected_objects', [])
+            # Filter by confidence threshold
+            filtered_objects = [obj for obj in objects if obj.get('confidence', 0) >= confidence_threshold]
+
+            return {
+                'success': True,
+                'service': 'isa-vision-ui',
+                'objects': filtered_objects,
+                'count': len(filtered_objects),
+                'bounding_boxes': [obj.get('bbox') for obj in filtered_objects],
+                'metadata': result.get('metadata', {})
+            }
+        else:
+            return result
+
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
+    ) -> Dict[str, Any]:
+        """Get coordinates of specific UI element"""
+
+        detection_result = await self.detect_objects(image)
+
+        if not detection_result.get('success'):
+            return detection_result
+
+        objects = detection_result.get('objects', [])
+
+        # Look for object by name/type
+        for obj in objects:
+            obj_type = obj.get('type', '').lower()
+            obj_content = obj.get('content', '').lower()
+
+            if object_name.lower() in obj_type or object_name.lower() in obj_content:
+                return {
+                    'success': True,
+                    'found': True,
+                    'center_coordinates': obj.get('center', [0, 0]),
+                    'confidence': obj.get('confidence', 0),
+                    'description': f"Found {obj.get('type')} at center coordinates",
+                    'object_info': obj
+                }
+
+        return {
+            'success': True,
+            'found': False,
+            'center_coordinates': [0, 0],
+            'confidence': 0,
+            'description': f"Object '{object_name}' not found in image"
+        }
+
+    async def classify_image(
+        self,
+        image: Union[str, BinaryIO],
+        categories: Optional[List[str]] = None
+    ) -> Dict[str, Any]:
+        """Classify image type"""
+
+        result = await self.analyze_image(image)
+
+        if result.get('success'):
+            objects = result.get('detected_objects', [])
+
+            # Simple classification based on detected UI elements
+            if objects:
+                category = "user_interface"
+                confidence = 0.9
+            else:
+                category = "unknown"
+                confidence = 0.1
+
+            return {
+                'success': True,
+                'category': category,
+                'confidence': confidence,
+                'all_predictions': [
+                    {'category': category, 'confidence': confidence}
+                ]
+            }
+        else:
+            return result
+
+    async def compare_images(
+        self,
+        image1: Union[str, BinaryIO],
+        image2: Union[str, BinaryIO]
+    ) -> Dict[str, Any]:
+        """Compare two images based on UI elements"""
+
+        result1 = await self.analyze_image(image1)
+        result2 = await self.analyze_image(image2)
+
+        if not (result1.get('success') and result2.get('success')):
+            return {
+                'success': False,
+                'error': 'Failed to analyze one or both images'
+            }
+
+        objects1 = result1.get('detected_objects', [])
+        objects2 = result2.get('detected_objects', [])
+
+        # Simple comparison based on element counts and types
+        count_diff = abs(len(objects1) - len(objects2))
+        types1 = set([obj.get('type') for obj in objects1])
+        types2 = set([obj.get('type') for obj in objects2])
+
+        common_types = types1.intersection(types2)
+        unique_types = types1.symmetric_difference(types2)
+
+        similarity_score = len(common_types) / max(len(types1.union(types2)), 1)
+
+        return {
+            'success': True,
+            'similarity_score': similarity_score,
+            'differences': f"Different element types: {list(unique_types)}",
+            'common_elements': f"Common element types: {list(common_types)}",
+            'metadata': {
+                'elements_count_1': len(objects1),
+                'elements_count_2': len(objects2),
+                'count_difference': count_diff
+            }
+        }
+
+    def get_supported_formats(self) -> List[str]:
+        """Get supported image formats"""
+        return ['jpg', 'jpeg', 'png', 'bmp', 'gif', 'tiff']
+
+    def get_max_image_size(self) -> Dict[str, int]:
+        """Get maximum image dimensions"""
+        return {'width': 4096, 'height': 4096}
+
+    async def close(self):
+        """Cleanup resources"""
+        # Modal connections don't need explicit cleanup
+        pass
+
+    # Helper methods
+
+    def _encode_image(self, image: Union[str, BinaryIO]) -> str:
+        """Convert image to base64 string"""
+        if isinstance(image, str):
+            # File path
+            with open(image, 'rb') as f:
+                image_data = f.read()
+        else:
+            # Binary data
+            if hasattr(image, 'read'):
+                image_data = image.read()
+            else:
+                # Assume it's bytes
+                image_data = bytes(image) if not isinstance(image, bytes) else image
+
+        return base64.b64encode(image_data).decode('utf-8')
+
+    async def _extract_text_fallback(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Fallback OCR using UI service (basic text detection)"""
+        # For now, return placeholder
+        return {
+            'success': False,
+            'error': 'OCR service not available, deploy document analysis service',
+            'text': '',
+            'confidence': 0,
+            'service': 'isa-vision-fallback'
+        }
+
+    async def _analyze_document(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Analyze document with tables and OCR"""
+
+        if not self.doc_app:
+            return {
+                'success': False,
+                'error': 'Document analysis service not deployed',
+                'service': 'isa-vision-doc'
+            }
+
+        try:
+            # Convert image to base64
+            image_b64 = self._encode_image(image)
+
+            # Call Modal document analysis service using from_name (new API)
+            doc_analyzer = modal.Cls.from_name("isa-vision-doc", "DocumentAnalysisService")
+            result = doc_analyzer().analyze_document_complete.remote(image_b64)
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Document analysis failed: {e}")
+            return {
+                'success': False,
+                'error': str(e),
+                'service': 'isa-vision-doc'
+            }
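
Taken together, the new service resolves its Modal apps at construction time and fails soft when one is not deployed. A usage sketch under stated assumptions: in practice the provider argument comes from the package's AIFactory (not shown in this diff), so passing None here is purely illustrative:

    import asyncio

    async def main():
        service = ISAVisionService(provider=None, model_name="isa-vision-ui")  # illustrative args
        result = await service.invoke("screenshot.png", task="detect_ui")
        if result.get("success"):
            for element in result.get("objects", []):
                print(element.get("type"), element.get("center"))
        else:
            print(result.get("error"))

    asyncio.run(main())
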
isa_model/inference/services/vision/openai_vision_service.py
@@ -70,6 +70,32 @@ class OpenAIVisionService(BaseVisionService):
         # If it's bytes data
         return base64.b64encode(image_path_or_data).decode("utf-8")  # type: ignore
 
+    async def invoke(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        task: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Unified invoke method for all vision operations
+        """
+        task = task or "analyze"
+
+        if task == "analyze":
+            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
+        elif task == "describe":
+            return await self.describe_image(image, kwargs.get("detail_level", "medium"))
+        elif task == "extract_text":
+            return await self.extract_text(image)
+        elif task == "detect_objects":
+            return await self.detect_objects(image, kwargs.get("confidence_threshold", 0.5))
+        elif task == "classify":
+            return await self.classify_image(image, kwargs.get("categories"))
+        else:
+            # Default to analyze_image for unknown tasks
+            return await self.analyze_image(image, prompt, kwargs.get("max_tokens", 1000))
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=4, max=10),
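
Each branch of this dispatcher pulls a different keyword out of **kwargs, so callers change only the task string plus the matching parameter. Illustrative calls (inside an async context, with service an already-constructed OpenAIVisionService):

    await service.invoke("photo.jpg", prompt="Describe the scene")          # default task: analyze
    await service.invoke("photo.jpg", task="describe", detail_level="high")
    await service.invoke("scan.png", task="extract_text")
    await service.invoke("photo.jpg", task="detect_objects", confidence_threshold=0.7)
    await service.invoke("photo.jpg", task="classify", categories=["indoor", "outdoor"])
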
@@ -221,24 +247,140 @@ class OpenAIVisionService(BaseVisionService):
         confidence_threshold: float = 0.5
     ) -> Dict[str, Any]:
         """Detect objects in image"""
-        prompt = "List all objects visible in this image. For each object, provide the object name and a brief description of its location in the image."
-        result = await self.analyze_image(image, prompt, 1000)
+        prompt = """List all objects visible in this image. For each object, provide:
+        1. Object name
+        2. Approximate location as percentages from top-left corner (x%, y%)
+        3. Approximate size as percentages of image dimensions (width%, height%)
+        4. Brief description
+
+        Format each object as: "ObjectName: x=X%, y=Y%, width=W%, height=H% - Description"
+
+        Example: "Car: x=25%, y=40%, width=15%, height=12% - Red sedan in the center"
+        """
+        result = await self.analyze_image(image, prompt, 1500)
 
-        # Parse the response to extract object information
+        # Parse the response to extract object information with coordinates
         objects = []
+        bounding_boxes = []
         lines = result["text"].split('\n')
+
         for line in lines:
             line = line.strip()
-            if line and not line.startswith(('In this image', 'The image shows', 'I can see')):
-                objects.append({
-                    "label": line,
-                    "confidence": 1.0  # OpenAI doesn't provide confidence scores
-                })
+            if line and ':' in line and ('x=' in line or 'width=' in line):
+                try:
+                    # Extract object name and details
+                    parts = line.split(':', 1)
+                    if len(parts) == 2:
+                        object_name = parts[0].strip()
+                        details = parts[1].strip()
+
+                        # Extract coordinates using regex-like parsing
+                        coords = {}
+                        for param in ['x', 'y', 'width', 'height']:
+                            param_pattern = f"{param}="
+                            if param_pattern in details:
+                                start_idx = details.find(param_pattern) + len(param_pattern)
+                                end_idx = details.find('%', start_idx)
+                                if end_idx > start_idx:
+                                    try:
+                                        value = float(details[start_idx:end_idx])
+                                        coords[param] = value
+                                    except ValueError:
+                                        continue
+
+                        # Extract description (after the coordinates)
+                        desc_start = details.find(' - ')
+                        description = details[desc_start + 3:] if desc_start != -1 else details
+
+                        objects.append({
+                            "label": object_name,
+                            "confidence": 1.0,
+                            "coordinates": coords,
+                            "description": description
+                        })
+
+                        # Add bounding box if we have coordinates
+                        if all(k in coords for k in ['x', 'y', 'width', 'height']):
+                            bounding_boxes.append({
+                                "label": object_name,
+                                "x_percent": coords['x'],
+                                "y_percent": coords['y'],
+                                "width_percent": coords['width'],
+                                "height_percent": coords['height']
+                            })
+
+                except Exception:
+                    # Fallback for objects that don't match expected format
+                    objects.append({
+                        "label": line,
+                        "confidence": 1.0,
+                        "coordinates": {},
+                        "description": line
+                    })
 
         return {
             "objects": objects,
             "count": len(objects),
-            "bounding_boxes": [],  # Not available with current API
+            "bounding_boxes": bounding_boxes,
+            "metadata": result["metadata"]
+        }
+
+    async def get_object_coordinates(
+        self,
+        image: Union[str, BinaryIO],
+        object_name: str
+    ) -> Dict[str, Any]:
+        """Get coordinates of a specific object in the image"""
+        prompt = f"""Locate the {object_name} in this image and return its center coordinates as [x, y] pixels.
+
+        Respond in this exact format:
+        FOUND: YES/NO
+        CENTER: [x, y]
+        DESCRIPTION: [Brief description]
+
+        If found, provide the pixel coordinates of the center point.
+        If not found, explain why.
+
+        Example:
+        FOUND: YES
+        CENTER: [640, 360]
+        DESCRIPTION: Blue login button in the center-left area
+        """
+
+        result = await self.analyze_image(image, prompt, 300)
+        response_text = result["text"]
+
+        # Parse the structured response
+        found = False
+        center_coords = None
+        description = ""
+
+        lines = response_text.split('\n')
+        for line in lines:
+            line = line.strip()
+            if line.startswith('FOUND:'):
+                found = 'YES' in line.upper()
+            elif line.startswith('CENTER:') and found:
+                # Extract center coordinates [x, y]
+                coords_text = line.replace('CENTER:', '').strip()
+                try:
+                    # Remove brackets and split
+                    coords_text = coords_text.replace('[', '').replace(']', '')
+                    if ',' in coords_text:
+                        x_str, y_str = coords_text.split(',')
+                        x = int(float(x_str.strip()))
+                        y = int(float(y_str.strip()))
+                        center_coords = [x, y]
+                except (ValueError, IndexError):
+                    pass
+            elif line.startswith('DESCRIPTION:'):
+                description = line.replace('DESCRIPTION:', '').strip()
+
+        return {
+            "found": found,
+            "center_coordinates": center_coords,
+            "confidence": 1.0 if found else 0.0,
+            "description": description,
             "metadata": result["metadata"]
         }
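
Because the coordinate extraction above is plain string scanning rather than a real regex, it can be sanity-checked offline. A standalone sketch of the same parsing logic applied to one invented response line:

    line = "Car: x=25%, y=40%, width=15%, height=12% - Red sedan in the center"

    name, details = [part.strip() for part in line.split(':', 1)]
    coords = {}
    for param in ['x', 'y', 'width', 'height']:
        marker = f"{param}="
        if marker in details:
            start = details.find(marker) + len(marker)
            end = details.find('%', start)
            if end > start:
                coords[param] = float(details[start:end])

    desc_start = details.find(' - ')
    description = details[desc_start + 3:] if desc_start != -1 else details

    print(name, coords, description)
    # Car {'x': 25.0, 'y': 40.0, 'width': 15.0, 'height': 12.0} Red sedan in the center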