isa-model 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff compares the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
Files changed (100)
  1. isa_model/__init__.py +30 -1
  2. isa_model/client.py +770 -0
  3. isa_model/core/config/__init__.py +16 -0
  4. isa_model/core/config/config_manager.py +514 -0
  5. isa_model/core/config.py +426 -0
  6. isa_model/core/models/model_billing_tracker.py +476 -0
  7. isa_model/core/models/model_manager.py +399 -0
  8. isa_model/core/models/model_repo.py +343 -0
  9. isa_model/core/pricing_manager.py +426 -0
  10. isa_model/core/services/__init__.py +19 -0
  11. isa_model/core/services/intelligent_model_selector.py +547 -0
  12. isa_model/core/types.py +291 -0
  13. isa_model/deployment/__init__.py +2 -0
  14. isa_model/deployment/cloud/__init__.py +9 -0
  15. isa_model/deployment/cloud/modal/__init__.py +10 -0
  16. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +766 -0
  17. isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
  18. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +406 -0
  19. isa_model/deployment/cloud/modal/register_models.py +321 -0
  20. isa_model/deployment/runtime/deployed_service.py +338 -0
  21. isa_model/deployment/services/__init__.py +9 -0
  22. isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
  23. isa_model/deployment/services/model_service.py +332 -0
  24. isa_model/deployment/services/service_monitor.py +356 -0
  25. isa_model/deployment/services/service_registry.py +527 -0
  26. isa_model/eval/__init__.py +80 -44
  27. isa_model/eval/config/__init__.py +10 -0
  28. isa_model/eval/config/evaluation_config.py +108 -0
  29. isa_model/eval/evaluators/__init__.py +18 -0
  30. isa_model/eval/evaluators/base_evaluator.py +503 -0
  31. isa_model/eval/evaluators/llm_evaluator.py +472 -0
  32. isa_model/eval/factory.py +417 -709
  33. isa_model/eval/infrastructure/__init__.py +24 -0
  34. isa_model/eval/infrastructure/experiment_tracker.py +466 -0
  35. isa_model/eval/metrics.py +191 -21
  36. isa_model/inference/ai_factory.py +187 -387
  37. isa_model/inference/providers/modal_provider.py +109 -0
  38. isa_model/inference/providers/yyds_provider.py +108 -0
  39. isa_model/inference/services/__init__.py +2 -1
  40. isa_model/inference/services/audio/base_stt_service.py +65 -1
  41. isa_model/inference/services/audio/base_tts_service.py +75 -1
  42. isa_model/inference/services/audio/openai_stt_service.py +189 -151
  43. isa_model/inference/services/audio/openai_tts_service.py +12 -10
  44. isa_model/inference/services/audio/replicate_tts_service.py +61 -56
  45. isa_model/inference/services/base_service.py +55 -55
  46. isa_model/inference/services/embedding/base_embed_service.py +65 -1
  47. isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
  48. isa_model/inference/services/embedding/openai_embed_service.py +8 -10
  49. isa_model/inference/services/helpers/stacked_config.py +148 -0
  50. isa_model/inference/services/img/__init__.py +18 -0
  51. isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -35
  52. isa_model/inference/services/img/flux_professional_service.py +603 -0
  53. isa_model/inference/services/img/helpers/base_stacked_service.py +274 -0
  54. isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +210 -69
  55. isa_model/inference/services/llm/__init__.py +3 -3
  56. isa_model/inference/services/llm/base_llm_service.py +519 -35
  57. isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +40 -0
  58. isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
  59. isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
  60. isa_model/inference/services/llm/ollama_llm_service.py +150 -15
  61. isa_model/inference/services/llm/openai_llm_service.py +134 -31
  62. isa_model/inference/services/llm/yyds_llm_service.py +255 -0
  63. isa_model/inference/services/vision/__init__.py +38 -4
  64. isa_model/inference/services/vision/base_vision_service.py +241 -96
  65. isa_model/inference/services/vision/disabled/isA_vision_service.py +500 -0
  66. isa_model/inference/services/vision/doc_analysis_service.py +640 -0
  67. isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
  68. isa_model/inference/services/vision/helpers/image_utils.py +272 -3
  69. isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
  70. isa_model/inference/services/vision/openai_vision_service.py +109 -170
  71. isa_model/inference/services/vision/replicate_vision_service.py +508 -0
  72. isa_model/inference/services/vision/ui_analysis_service.py +823 -0
  73. isa_model/scripts/register_models.py +370 -0
  74. isa_model/scripts/register_models_with_embeddings.py +510 -0
  75. isa_model/serving/__init__.py +19 -0
  76. isa_model/serving/api/__init__.py +10 -0
  77. isa_model/serving/api/fastapi_server.py +89 -0
  78. isa_model/serving/api/middleware/__init__.py +9 -0
  79. isa_model/serving/api/middleware/request_logger.py +88 -0
  80. isa_model/serving/api/routes/__init__.py +5 -0
  81. isa_model/serving/api/routes/health.py +82 -0
  82. isa_model/serving/api/routes/llm.py +19 -0
  83. isa_model/serving/api/routes/ui_analysis.py +223 -0
  84. isa_model/serving/api/routes/unified.py +202 -0
  85. isa_model/serving/api/routes/vision.py +19 -0
  86. isa_model/serving/api/schemas/__init__.py +17 -0
  87. isa_model/serving/api/schemas/common.py +33 -0
  88. isa_model/serving/api/schemas/ui_analysis.py +78 -0
  89. {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
  90. isa_model-0.3.6.dist-info/RECORD +147 -0
  91. isa_model/core/model_manager.py +0 -208
  92. isa_model/core/model_registry.py +0 -342
  93. isa_model/inference/billing_tracker.py +0 -406
  94. isa_model/inference/services/llm/triton_llm_service.py +0 -481
  95. isa_model/inference/services/vision/ollama_vision_service.py +0 -194
  96. isa_model-0.3.4.dist-info/RECORD +0 -91
  97. /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
  98. /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
  99. {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
  100. {isa_model-0.3.4.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
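The headline additions are a unified client, a FastAPI serving layer, Modal deployment services, and several stacked vision services; the largest new file, isa_model/inference/services/vision/ui_analysis_service.py (item 72), is shown in full below. As orientation, here is a minimal sketch of the two-layer wiring that file sets up. The LayerConfig fields mirror the diff below, and the import path comes from the new vision helpers module (item 67); BaseStackedService itself is not part of this hunk, so how these configs are executed is not shown here.

# Sketch (not part of the diff): the two-layer pipeline wired by UIAnalysisService
from isa_model.inference.services.vision.helpers.base_stacked_service import (
    LayerConfig,
    LayerType,
)

# Layer 1: OmniParser element detection -- no dependencies, fallback enabled
detection = LayerConfig(
    name="ui_detection",
    layer_type=LayerType.DETECTION,
    service_type="vision",
    model_name="omniparser",
    parameters={"task": "ui_detection", "imgsz": 640,
                "box_threshold": 0.05, "iou_threshold": 0.1},
    depends_on=[],
    timeout=30.0,
    fallback_enabled=True,
)

# Layer 2: vision-LLM action planning -- consumes layer 1's detected elements
planning = LayerConfig(
    name="action_planning",
    layer_type=LayerType.INTELLIGENCE,
    service_type="vision",
    model_name="default",
    parameters={"task": "action_planning", "max_tokens": 500},
    depends_on=["ui_detection"],
    timeout=15.0,
    fallback_enabled=False,
)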
isa_model/inference/services/vision/ui_analysis_service.py (new file)
@@ -0,0 +1,823 @@
+ """
+ UI Analysis Stacked Service
+
+ A stacked service for analyzing UI screenshots with a simplified two-layer pipeline:
+ - Layer 1: Element Detection (OmniParser)
+ - Layer 2: Action Planning (vision LLM, e.g. GPT-4V)
+ """
+
+ from typing import Dict, Any, List, Optional, Union, BinaryIO
+ import json
+ from PIL import Image, ImageDraw, ImageFont
+
+ from .helpers.base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult
+
+ class UIAnalysisService(BaseStackedService):
+     """
+     Generic UI Analysis service using stacked AI models for comprehensive UI understanding.
+     Can handle different page types: login, search, content extraction, navigation, etc.
+     """
+
+     def __init__(self, ai_factory, config=None):
+         super().__init__(ai_factory, "ui_analysis")
+         self.ai_factory = ai_factory
+         self.config = config or {}
+         self.task_type = self.config.get("task_type", "login")  # Default to login for backward compatibility
+         self._setup_layers_by_task()
+
+     def _setup_layers_by_task(self):
+         """Setup layer configuration based on task type"""
+
+         if self.task_type == "search":
+             self._setup_search_layers()
+         elif self.task_type == "content":
+             self._setup_content_layers()
+         elif self.task_type == "navigation":
+             self._setup_navigation_layers()
+         else:
+             # Default to login/form analysis
+             self._setup_default_layers()
+
+     def _setup_default_layers(self):
+         """Setup simplified two-layer architecture for UI analysis"""
+
+         # Layer 1: OmniParser element detection
+         self.add_layer(LayerConfig(
+             name="ui_detection",
+             layer_type=LayerType.DETECTION,
+             service_type="vision",
+             model_name="omniparser",
+             parameters={
+                 "task": "ui_detection",
+                 "imgsz": 640,
+                 "box_threshold": 0.05,
+                 "iou_threshold": 0.1
+             },
+             depends_on=[],
+             timeout=30.0,
+             fallback_enabled=True
+         ))
+
+         # Layer 2: GPT-4.1-nano intelligent decision-making
+         self.add_layer(LayerConfig(
+             name="action_planning",
+             layer_type=LayerType.INTELLIGENCE,
+             service_type="vision",
+             model_name="default",
+             parameters={
+                 "task": "action_planning",
+                 "max_tokens": 500
+             },
+             depends_on=["ui_detection"],
+             timeout=15.0,
+             fallback_enabled=False
+         ))
+
+     def _setup_search_layers(self):
+         """Setup simplified two-layer architecture for search page analysis"""
+
+         self.add_layer(LayerConfig(
+             name="ui_detection",
+             layer_type=LayerType.DETECTION,
+             service_type="vision",
+             model_name="omniparser",
+             parameters={
+                 "task": "ui_detection",
+                 "imgsz": 640,
+                 "box_threshold": 0.05,
+                 "iou_threshold": 0.1
+             },
+             depends_on=[],
+             timeout=30.0,
+             fallback_enabled=True
+         ))
+
+         self.add_layer(LayerConfig(
+             name="action_planning",
+             layer_type=LayerType.INTELLIGENCE,
+             service_type="vision",
+             model_name="default",
+             parameters={
+                 "task": "search_action_planning",
+                 "max_tokens": 500
+             },
+             depends_on=["ui_detection"],
+             timeout=15.0,
+             fallback_enabled=False
+         ))
+
+     def _setup_content_layers(self):
+         """Setup simplified two-layer architecture for content extraction"""
+
+         self.add_layer(LayerConfig(
+             name="ui_detection",
+             layer_type=LayerType.DETECTION,
+             service_type="vision",
+             model_name="omniparser",
+             parameters={
+                 "task": "ui_detection",
+                 "imgsz": 640,
+                 "box_threshold": 0.05,
+                 "iou_threshold": 0.1
+             },
+             depends_on=[],
+             timeout=30.0,
+             fallback_enabled=True
+         ))
+
+         self.add_layer(LayerConfig(
+             name="action_planning",
+             layer_type=LayerType.INTELLIGENCE,
+             service_type="vision",
+             model_name="default",
+             parameters={
+                 "task": "content_action_planning",
+                 "max_tokens": 500
+             },
+             depends_on=["ui_detection"],
+             timeout=15.0,
+             fallback_enabled=False
+         ))
+
+     def _setup_navigation_layers(self):
+         """Setup simplified two-layer architecture for navigation analysis"""
+
+         self.add_layer(LayerConfig(
+             name="ui_detection",
+             layer_type=LayerType.DETECTION,
+             service_type="vision",
+             model_name="omniparser",
+             parameters={
+                 "task": "ui_detection",
+                 "imgsz": 640,
+                 "box_threshold": 0.05,
+                 "iou_threshold": 0.1
+             },
+             depends_on=[],
+             timeout=30.0,
+             fallback_enabled=True
+         ))
+
+         self.add_layer(LayerConfig(
+             name="action_planning",
+             layer_type=LayerType.INTELLIGENCE,
+             service_type="vision",
+             model_name="default",
+             parameters={
+                 "task": "navigation_action_planning",
+                 "max_tokens": 500
+             },
+             depends_on=["ui_detection"],
+             timeout=15.0,
+             fallback_enabled=False
+         ))
+
+     def configure_detection_model(self, model_name: str, parameters: Dict[str, Any]):
+         """Configure the detection model and parameters"""
+         for layer in self.layers:
+             if layer.name == "ui_detection":
+                 layer.model_name = model_name
+                 layer.parameters.update(parameters)
+                 break
+
+     def configure_intelligence_model(self, model_name: str, parameters: Dict[str, Any]):
+         """Configure the action-planning (intelligence) model"""
+         for layer in self.layers:
+             if layer.name == "action_planning":
+                 layer.model_name = model_name
+                 layer.parameters.update(parameters)
+                 break
+
+     async def execute_layer_logic(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Any:
+         """Execute specific logic for each layer type using unified invoke method"""
+
+         task = layer.parameters.get("task")
+         image_path = context["input"]["image_path"]
+
+         if task == "ui_detection":
+             return await self._invoke_ui_detection(service, image_path, layer.parameters)
+
+         elif task in ["action_planning", "search_action_planning", "content_action_planning", "navigation_action_planning"]:
+             ui_elements = context["results"]["ui_detection"].data
+             return await self._invoke_action_planning(service, image_path, ui_elements, layer.parameters)
+
+         else:
+             raise ValueError(f"Unsupported task: {task}")
+
+     # ==================== SIMPLIFIED TWO-LAYER METHODS ====================
+
+     async def _invoke_ui_detection(self, service: Any, image_path: str, parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
+         """Run UI element detection (layer 1)"""
+
+         if hasattr(service, 'run_omniparser'):
+             result = await service.run_omniparser(
+                 image=image_path,
+                 **{k: v for k, v in parameters.items() if k != "task"}
+             )
+
+             # Return all elements (both text and interactive elements)
+             elements = result.get("parsed_elements", [])
+
+             # Open the image once to map normalized bboxes to pixel coordinates
+             img = Image.open(image_path)
+             img_width, img_height = img.size
+
+             # Attach additional useful information to each element
+             for i, element in enumerate(elements):
+                 element['element_id'] = i
+                 element['element_index'] = i
+
+                 # Compute center-point coordinates
+                 bbox = element.get('bbox', [0, 0, 0, 0])
+                 if len(bbox) == 4:
+                     # Convert normalized coordinates to pixel coordinates
+                     x1, y1, x2, y2 = bbox
+                     pixel_x1 = int(x1 * img_width)
+                     pixel_y1 = int(y1 * img_height)
+                     pixel_x2 = int(x2 * img_width)
+                     pixel_y2 = int(y2 * img_height)
+
+                     element['pixel_bbox'] = [pixel_x1, pixel_y1, pixel_x2, pixel_y2]
+                     element['center'] = [
+                         (pixel_x1 + pixel_x2) // 2,
+                         (pixel_y1 + pixel_y2) // 2
+                     ]
+                     element['size'] = [pixel_x2 - pixel_x1, pixel_y2 - pixel_y1]
+
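+             # Worked example (hypothetical numbers): on a 1280x800 screenshot, a
+             # normalized bbox of [0.25, 0.10, 0.75, 0.15] maps to pixel_bbox
+             # [320, 80, 960, 120], center [640, 100], and size [640, 40].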
+             return elements
+         else:
+             raise ValueError("OmniParser service not available")
+
+     async def _invoke_action_planning(self, service: Any, image_path: str, ui_elements: List[Dict[str, Any]], parameters: Dict[str, Any]) -> Dict[str, Any]:
+         """Run action planning (layer 2)"""
+
+         task_type = parameters.get("task", "action_planning")
+
+         # Build element summaries
+         elements_summary = []
+         interactive_elements = []
+
+         for element in ui_elements:
+             summary = {
+                 "id": element.get('element_id'),
+                 "type": element.get('type'),
+                 "center": element.get('center'),
+                 "content": element.get('content', ''),
+                 "interactivity": element.get('interactivity', False),
+                 "size": element.get('size')
+             }
+             elements_summary.append(summary)
+
+             if element.get('interactivity', False):
+                 interactive_elements.append(summary)
+
+         # Build the decision-making prompt
+         prompt = self._build_action_planning_prompt(
+             task_type=task_type,
+             elements_summary=elements_summary,
+             interactive_elements=interactive_elements
+         )
+
+         # Call GPT to make the decision
+         result = await service.invoke(
+             image=image_path,
+             prompt=prompt,
+             task="analyze",
+             max_tokens=parameters.get("max_tokens", 500)
+         )
+
+         # Parse the decision result
+         decision = self._parse_action_plan(result.get('text', ''))
+
+         # Match the decision against the actual elements
+         action_plan = self._match_actions_to_elements(decision, ui_elements)
+
+         return action_plan
+
+     # ==================== UNIFIED INVOKE METHODS ====================
+
+     async def _invoke_page_intelligence(self, service: Any, image_path: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
+         """Invoke page intelligence analysis using unified interface"""
+
+         task = parameters.get("task", "page_intelligence")
+         prompt = self._get_intelligence_prompt(task)
+
+         # Use unified invoke method
+         result = await service.invoke(
+             image=image_path,
+             prompt=prompt,
+             task="analyze",
+             max_tokens=parameters.get("max_tokens", 500)
+         )
+
+         # Parse JSON response
+         response_text = result['text'].strip()
+         json_start = response_text.find('{')
+         json_end = response_text.rfind('}') + 1
+
+         if json_start >= 0 and json_end > json_start:
+             json_text = response_text[json_start:json_end]
+             try:
+                 intelligence_data = json.loads(json_text)
+             except json.JSONDecodeError:
+                 intelligence_data = self._parse_intelligence_fallback(response_text)
+         else:
+             intelligence_data = self._parse_intelligence_fallback(response_text)
+
+         return intelligence_data
+
+     def _get_intelligence_prompt(self, task: str) -> str:
+         """Get task-specific prompt for page intelligence"""
+
+         if task == "search_page_intelligence":
+             return '''Analyze this webpage screenshot to understand the search interface structure.
+
+ Identify:
+ 1. Page type (search_engine, search_results, query_page, homepage)
+ 2. Search elements (search box, search button, filters, suggestions)
+ 3. Layout pattern (header_search, center_search, sidebar_search)
+ 4. Language used in the interface
+ 5. Additional features (voice search, image search, advanced options)
+ 6. Visible text elements and labels
+
+ Return analysis as JSON with this exact structure:
+ {
+     "page_type": "search_engine|search_results|query_page|homepage|other",
+     "layout_pattern": "header_search|center_search|sidebar_search|embedded",
+     "language": "en|zh|es|fr|de|other",
+     "search_features": ["voice_search", "image_search", "advanced_options", "suggestions"],
+     "complexity_score": 0.1-1.0,
+     "visible_text_elements": ["Search", "Google Search", "I'm Feeling Lucky"],
+     "search_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
+     "confidence": 0.1-1.0,
+     "analysis_notes": "brief description of what you observe"
+ }
+
+ Be precise and only include elements you can clearly see.'''
+
+         elif task == "content_page_intelligence":
+             return '''Analyze this webpage screenshot to understand the content structure.
+
+ Identify:
+ 1. Page type (article, blog, news, documentation, product_page)
+ 2. Content layout (single_column, multi_column, grid, sidebar)
+ 3. Content elements (headings, paragraphs, images, videos, links)
+ 4. Language used in the interface
+ 5. Navigation elements (menu, breadcrumbs, pagination)
+ 6. Visible text content and structure
+
+ Return analysis as JSON with this exact structure:
+ {
+     "page_type": "article|blog|news|documentation|product_page|other",
+     "layout_pattern": "single_column|multi_column|grid|sidebar",
+     "language": "en|zh|es|fr|de|other",
+     "content_features": ["headings", "paragraphs", "images", "videos", "links", "navigation"],
+     "complexity_score": 0.1-1.0,
+     "visible_text_elements": ["Title", "Content", "Read More"],
+     "content_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
+     "confidence": 0.1-1.0,
+     "analysis_notes": "brief description of what you observe"
+ }
+
+ Be precise and only include elements you can clearly see.'''
+
+         elif task == "navigation_page_intelligence":
+             return '''Analyze this webpage screenshot to understand the navigation structure.
+
+ Identify:
+ 1. Page type (homepage, category_page, landing_page, dashboard)
+ 2. Navigation elements (menu, toolbar, sidebar, footer)
+ 3. Layout pattern (horizontal_nav, vertical_nav, dropdown_nav, mega_menu)
+ 4. Language used in the interface
+ 5. Interactive elements (buttons, links, icons, search)
+ 6. Visible navigation labels and structure
+
+ Return analysis as JSON with this exact structure:
+ {
+     "page_type": "homepage|category_page|landing_page|dashboard|other",
+     "layout_pattern": "horizontal_nav|vertical_nav|dropdown_nav|mega_menu",
+     "language": "en|zh|es|fr|de|other",
+     "navigation_features": ["main_menu", "sidebar", "footer", "breadcrumbs", "search"],
+     "complexity_score": 0.1-1.0,
+     "visible_text_elements": ["Home", "About", "Contact", "Products"],
+     "navigation_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
+     "confidence": 0.1-1.0,
+     "analysis_notes": "brief description of what you observe"
+ }
+
+ Be precise and only include elements you can clearly see.'''
+
+         else:
+             # Default login/form analysis prompt
+             return '''Analyze this webpage screenshot to understand the login interface structure.
+
+ Identify:
+ 1. Page type (login, register, multi-step auth, SSO)
+ 2. Layout pattern (vertical form, horizontal, modal, tabs)
+ 3. Language used in the interface
+ 4. Security features visible (CAPTCHA, 2FA indicators)
+ 5. Form complexity level
+ 6. Visible text elements that indicate field purposes
+
+ Return analysis as JSON with this exact structure:
+ {
+     "page_type": "login|register|multi_step|sso|other",
+     "layout_pattern": "vertical|horizontal|modal|tabs|embedded",
+     "language": "en|zh|es|fr|de|other",
+     "security_features": ["captcha", "recaptcha", "2fa_indicator", "security_questions"],
+     "complexity_score": 0.1-1.0,
+     "visible_text_elements": ["Login", "Password", "Sign In"],
+     "form_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
+     "confidence": 0.1-1.0,
+     "analysis_notes": "brief description of what you observe"
+ }
+
+ Be precise and only include elements you can clearly see.'''
+
+     async def execute_fallback(self, layer: LayerConfig, context: Dict[str, Any], error: str) -> Optional[Any]:
+         """Execute fallback logic for failed layers"""
+
+         if layer.layer_type == LayerType.INTELLIGENCE:
+             # Return basic page intelligence
+             return {
+                 "page_type": "login",
+                 "layout_pattern": "vertical",
+                 "language": "en",
+                 "security_features": [],
+                 "complexity_score": 0.5,
+                 "visible_text_elements": ["Login", "Password"],
+                 "form_area_estimate": {"x": 200, "y": 200, "width": 600, "height": 400},
+                 "confidence": 0.3,
+                 "analysis_notes": f"Fallback analysis due to error: {error}"
+             }
+
+         elif layer.layer_type == LayerType.DETECTION:
+             # Create fallback elements based on a typical form layout; guard against
+             # the page_intelligence layer being absent in the two-layer pipeline
+             intelligence_result = context["results"].get("page_intelligence")
+             intelligence = intelligence_result.data if intelligence_result else {}
+             return self._create_fallback_elements(context["input"]["image_path"], intelligence)
+
+         return None
+
+     def generate_final_output(self, results: Dict[str, LayerResult]) -> Dict[str, Any]:
+         """Generate final UI analysis output for simplified two-layer architecture"""
+
+         # Extract data from the two layers (LayerResult exposes .data; guard
+         # against a layer having produced no result)
+         detection = results.get("ui_detection")
+         ui_elements = (detection.data if detection else None) or []
+         planning = results.get("action_planning")
+         action_plan = (planning.data if planning else None) or {}
+
+         # Separate interactive and non-interactive elements
+         interactive_elements = [e for e in ui_elements if e.get('interactivity', False)]
+         text_elements = [e for e in ui_elements if not e.get('interactivity', False)]
+
+         return {
+             "ui_elements": {
+                 "total_elements": len(ui_elements),
+                 "interactive_elements": interactive_elements,
+                 "text_elements": text_elements,
+                 "summary": {
+                     "interactive_count": len(interactive_elements),
+                     "text_count": len(text_elements)
+                 }
+             },
+             "action_plan": action_plan,
+             "automation_ready": {
+                 "ready": action_plan.get("success_probability", 0) > 0.7,
+                 "confidence": action_plan.get("success_probability", 0),
+                 "steps_count": len(action_plan.get("action_plan", [])),
+                 "page_type": action_plan.get("page_analysis", {}).get("page_type", "unknown")
+             },
+             "execution_summary": {
+                 "can_automate": len(action_plan.get("action_plan", [])) > 0,
+                 "recommended_action": action_plan.get("action_plan", [{}])[0] if action_plan.get("action_plan") else None
+             }
+         }
+
+     async def visualize_results(self, image_path: str, analysis_result: Dict[str, Any],
+                                 output_path: str = "ui_analysis_result.png") -> str:
+         """Generate visualization of the analysis results"""
+
+         # Load original image
+         img = Image.open(image_path)
+         img_copy = img.copy()
+         draw = ImageDraw.Draw(img_copy)
+
+         try:
+             font_large = ImageFont.truetype("arial.ttf", 24)
+             font_small = ImageFont.truetype("arial.ttf", 16)
+         except OSError:
+             font_large = None
+             font_small = None
+
+         colors = {
+             'username_field': 'red',
+             'password_field': 'blue',
+             'login_button': 'green',
+             'register_button': 'orange',
+             'submit_button': 'purple',
+             'checkbox': 'yellow',
+             'link': 'cyan',
+             'other': 'gray'
+         }
+
+         final_output = analysis_result.get("final_output", {})
+         # Fall back to the interactive elements from the two-layer output when
+         # no classified elements are present
+         classified_elements = (final_output.get("classified_elements")
+                                or final_output.get("ui_elements", {}).get("interactive_elements", []))
+
+         # Draw each classified element
+         for element in classified_elements:
+             classification = element.get('classification', 'other')
+             confidence = element.get('classification_confidence', 0)
+             center = element.get('precise_center', element.get('center', [0, 0]))
+             bbox = element.get('pixel_bbox', element.get('bbox', [0, 0, 100, 100]))
+             priority = element.get('interaction_priority', 5)
+
+             color = colors.get(classification, 'gray')
+             center_x, center_y = center
+             x1, y1, x2, y2 = bbox
+
+             # Draw bounding box
+             draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
+
+             # Draw center crosshair
+             crosshair_size = 35
+             draw.line([center_x - crosshair_size, center_y, center_x + crosshair_size, center_y],
+                       fill=color, width=8)
+             draw.line([center_x, center_y - crosshair_size, center_x, center_y + crosshair_size],
+                       fill=color, width=8)
+
+             # Draw center circle
+             circle_radius = 20
+             draw.ellipse([center_x - circle_radius, center_y - circle_radius,
+                           center_x + circle_radius, center_y + circle_radius],
+                          outline=color, width=8)
+
+             # Draw labels
+             label = f"{classification.replace('_', ' ').title()}"
+             detail = f"Conf: {confidence:.2f} | Pri: {priority}"
+
+             if font_large:
+                 draw.text((x1, y1 - 50), label, fill=color, font=font_large)
+                 draw.text((x1, y1 - 25), detail, fill=color, font=font_small)
+             else:
+                 draw.text((x1, y1 - 40), label, fill=color)
+                 draw.text((x1, y1 - 20), detail, fill=color)
+
+         # Add header information (fall back to the two-layer automation summary)
+         intelligence = final_output.get("page_intelligence", {})
+         summary = final_output.get("analysis_summary") or final_output.get("automation_ready", {})
+
+         header_text = f"UI Analysis: {summary.get('page_type', 'unknown')} | Elements: {summary.get('interactive_elements', 0)} | Confidence: {summary.get('overall_confidence', summary.get('confidence', 0)):.2f}"
+
+         if font_large:
+             draw.text((10, 10), "Stacked UI Analysis", fill='black', font=font_large)
+             draw.text((10, 40), header_text, fill='black', font=font_small)
+         else:
+             draw.text((10, 10), "Stacked UI Analysis", fill='black')
+             draw.text((10, 30), header_text, fill='black')
+
+         # Save visualization
+         img_copy.save(output_path)
+         return output_path
+
+     # Helper methods
+     def _parse_intelligence_fallback(self, text: str) -> Dict[str, Any]:
+         """Fallback parser for intelligence data"""
+         return {
+             "page_type": "login",
+             "layout_pattern": "vertical",
+             "language": "en",
+             "security_features": [],
+             "complexity_score": 0.6,
+             "visible_text_elements": ["Username", "Password", "Login"],
+             "form_area_estimate": {"x": 200, "y": 200, "width": 600, "height": 400},
+             "confidence": 0.4,
+             "analysis_notes": "Parsed from text analysis"
+         }
+
+     def _parse_classification_result(self, text: str) -> Dict[str, Any]:
+         """Parse classification result from GPT response"""
+         try:
+             json_start = text.find('{')
+             json_end = text.rfind('}') + 1
+
+             if json_start >= 0 and json_end > json_start:
+                 json_text = text[json_start:json_end]
+                 return json.loads(json_text)
+         except Exception:
+             pass
+
+         # Fallback text parsing
+         classification = 'other'
+         confidence = 0.5
+
+         text_lower = text.lower()
+         if 'username' in text_lower or 'email' in text_lower:
+             classification = 'username_field'
+             confidence = 0.7
+         elif 'password' in text_lower:
+             classification = 'password_field'
+             confidence = 0.7
+         elif 'button' in text_lower and ('login' in text_lower or 'sign' in text_lower):
+             classification = 'login_button'
+             confidence = 0.7
+
+         return {
+             'classification': classification,
+             'confidence': confidence,
+             'reasoning': 'Parsed from text',
+             'visual_evidence': ['text_analysis'],
+             'interaction_priority': 5
+         }
+
+     def _build_action_planning_prompt(self, task_type: str, elements_summary: List[Dict], interactive_elements: List[Dict]) -> str:
+         """Build the action-planning prompt"""
+
+         # Task-specific guidance
+         if "search" in task_type:
+             instruction = """
+ Goal: complete a search operation
+ Identify:
+ 1. The search input box
+ 2. The search button (optional; pressing Enter usually works)
+ Action order: click the search box → type the query → click the search button or press Enter
+ """
+         elif "content" in task_type:
+             instruction = """
+ Goal: identify and extract the page content
+ Identify:
+ 1. The main content area
+ 2. Headings and body text
+ 3. Related links
+ 4. Navigation elements
+ """
+         elif "navigation" in task_type:
+             instruction = """
+ Goal: identify the page's navigation structure
+ Identify:
+ 1. The main navigation menu
+ 2. Submenu items
+ 3. Breadcrumb navigation
+ 4. Footer links
+ """
+         else:
+             instruction = """
+ Goal: complete a login operation
+ Identify:
+ 1. The username/email input box
+ 2. The password input box
+ 3. The login/submit button
+ Action order: click the username box → type the username → click the password box → type the password → click the login button
+ """
+
+         # Build the element list
+         elements_text = "Available UI elements:\n"
+         for i, elem in enumerate(elements_summary):
+             interactivity_mark = "🔴" if elem['interactivity'] else "⚪"
+             elements_text += f"{interactivity_mark} Element {elem['id']}: {elem['type']} - \"{elem['content'][:50]}\" - center {elem['center']}\n"
+
+         interactive_text = f"\nInteractive elements ({len(interactive_elements)} total):\n"
+         for elem in interactive_elements:
+             interactive_text += f"🔴 Element {elem['id']}: \"{elem['content'][:30]}\" - center {elem['center']}\n"
+
+         return f"""You are a UI automation expert. Based on the information below, produce a precise action plan for operating this web page.
+
+ {instruction}
+
+ {elements_text}
+ {interactive_text}
+
+ Analyze this page and provide an action plan:
+
+ 1. Determine the page type and current state
+ 2. Identify the key elements required to achieve the goal
+ 3. Provide precise action steps (including click coordinates)
+
+ Return JSON in this format:
+ {{
+     "page_analysis": {{
+         "page_type": "login_page|search_page|content_page|navigation_page",
+         "confidence": 0.1-1.0,
+         "key_elements_found": ["element_type1", "element_type2"]
+     }},
+     "action_plan": [
+         {{
+             "step": 1,
+             "action": "click|type|scroll",
+             "target_element_id": <element_id>,
+             "target_coordinates": [x, y],
+             "description": "description of the action",
+             "input_text": "text to type (for type actions)"
+         }}
+     ],
+     "success_probability": 0.1-1.0,
+     "notes": "additional notes"
+ }}
+
+ Plan only from the elements actually observed, and make sure the coordinates are accurate."""
+
+     def _parse_action_plan(self, text: str) -> Dict[str, Any]:
+         """Parse the action plan"""
+         try:
+             # Try to parse JSON from the response
+             json_start = text.find('{')
+             json_end = text.rfind('}') + 1
+
+             if json_start >= 0 and json_end > json_start:
+                 json_text = text[json_start:json_end]
+                 return json.loads(json_text)
+         except Exception:
+             pass
+
+         # On failure, return a basic plan
+         return {
+             "page_analysis": {
+                 "page_type": f"{self.task_type} page",
+                 "confidence": 0.5,
+                 "key_elements_found": []
+             },
+             "action_plan": [],
+             "success_probability": 0.3,
+             "notes": "Parsing failed; using fallback plan"
+         }
+
+     def _match_actions_to_elements(self, decision: Dict[str, Any], ui_elements: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """Match the decision against the actual UI elements"""
+
+         action_plan = decision.get("action_plan", [])
+
+         # Match each action step to an actual detected element
+         for step in action_plan:
+             target_id = step.get("target_element_id")
+
+             if target_id is not None:
+                 # Find the corresponding element
+                 target_element = None
+                 for element in ui_elements:
+                     if element.get('element_id') == target_id:
+                         target_element = element
+                         break
+
+                 if target_element:
+                     # Use the actual element's coordinates
+                     step["actual_coordinates"] = target_element.get("center")
+                     step["actual_bbox"] = target_element.get("pixel_bbox")
+                     step["element_content"] = target_element.get("content")
+                     step["element_type"] = target_element.get("type")
+                     step["element_size"] = target_element.get("size")
+
+         return {
+             "page_analysis": decision.get("page_analysis", {}),
+             "action_plan": action_plan,
+             "success_probability": decision.get("success_probability", 0.5),
+             "notes": decision.get("notes", ""),
+             "total_steps": len(action_plan),
+             "interactive_elements_available": len([e for e in ui_elements if e.get('interactivity')])
+         }
+
+     def _create_fallback_elements(self, image_path: str, intelligence: Dict[str, Any]) -> List[Dict[str, Any]]:
+         """Create fallback elements based on typical layouts"""
+         img = Image.open(image_path)
+         img_width, img_height = img.size
+
+         layout = intelligence.get('layout_pattern', 'vertical')
+
+         if layout == 'vertical':
+             form_center_x = img_width // 2
+             form_start_y = img_height // 3
+
+             return [
+                 {
+                     'id': 'fallback_username',
+                     'bbox': [form_center_x - 150, form_start_y, form_center_x + 150, form_start_y + 40],
+                     'center': [form_center_x, form_start_y + 20],
+                     'size': [300, 40],
+                     'confidence': 0.6,
+                     'type': 'input'
+                 },
+                 {
+                     'id': 'fallback_password',
+                     'bbox': [form_center_x - 150, form_start_y + 70, form_center_x + 150, form_start_y + 110],
+                     'center': [form_center_x, form_start_y + 90],
+                     'size': [300, 40],
+                     'confidence': 0.6,
+                     'type': 'input'
+                 },
+                 {
+                     'id': 'fallback_button',
+                     'bbox': [form_center_x - 75, form_start_y + 140, form_center_x + 75, form_start_y + 180],
+                     'center': [form_center_x, form_start_y + 160],
+                     'size': [150, 40],
+                     'confidence': 0.5,
+                     'type': 'button'
+                 }
+             ]
+         else:
+             return [
+                 {
+                     'id': 'fallback_form',
+                     'bbox': [img_width // 4, img_height // 3, 3 * img_width // 4, 2 * img_height // 3],
+                     'center': [img_width // 2, img_height // 2],
+                     'size': [img_width // 2, img_height // 3],
+                     'confidence': 0.4,
+                     'type': 'form'
+                 }
+             ]
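
For context, a hypothetical end-to-end invocation of the new service might look like the sketch below. The BaseStackedService entry point is not part of this hunk, so the invoke() name and the context shape ({"input": {"image_path": ...}}, inferred from execute_layer_logic above) are assumptions, as is the AIFactory constructor.

# Sketch (assumptions noted above): run the two-layer UI analysis on a screenshot
import asyncio

from isa_model.inference.ai_factory import AIFactory  # module is in this release; constructor assumed
from isa_model.inference.services.vision.ui_analysis_service import UIAnalysisService

async def main():
    service = UIAnalysisService(AIFactory(), config={"task_type": "search"})

    # execute_layer_logic reads context["input"]["image_path"], so the runner
    # presumably receives its input in this shape; invoke() itself is assumed.
    analysis = await service.invoke({"input": {"image_path": "screenshot.png"}})

    plan = analysis.get("final_output", {}).get("action_plan", {})
    for step in plan.get("action_plan", []):
        print(step["step"], step["action"], step.get("actual_coordinates"))

    # Draw detected boxes and planned click points onto the screenshot
    await service.visualize_results("screenshot.png", analysis)

asyncio.run(main())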