isa-model 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. isa_model/config/__init__.py +9 -0
  2. isa_model/config/config_manager.py +213 -0
  3. isa_model/core/model_manager.py +5 -0
  4. isa_model/core/model_registry.py +39 -6
  5. isa_model/core/storage/supabase_storage.py +344 -0
  6. isa_model/core/vision_models_init.py +116 -0
  7. isa_model/deployment/cloud/__init__.py +9 -0
  8. isa_model/deployment/cloud/modal/__init__.py +10 -0
  9. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +612 -0
  10. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +305 -0
  11. isa_model/inference/ai_factory.py +238 -14
  12. isa_model/inference/providers/modal_provider.py +109 -0
  13. isa_model/inference/providers/yyds_provider.py +108 -0
  14. isa_model/inference/services/__init__.py +2 -1
  15. isa_model/inference/services/base_service.py +0 -38
  16. isa_model/inference/services/llm/base_llm_service.py +32 -0
  17. isa_model/inference/services/llm/llm_adapter.py +73 -3
  18. isa_model/inference/services/llm/ollama_llm_service.py +104 -3
  19. isa_model/inference/services/llm/openai_llm_service.py +67 -15
  20. isa_model/inference/services/llm/yyds_llm_service.py +254 -0
  21. isa_model/inference/services/stacked/__init__.py +26 -0
  22. isa_model/inference/services/stacked/base_stacked_service.py +269 -0
  23. isa_model/inference/services/stacked/config.py +426 -0
  24. isa_model/inference/services/stacked/doc_analysis_service.py +640 -0
  25. isa_model/inference/services/stacked/flux_professional_service.py +579 -0
  26. isa_model/inference/services/stacked/ui_analysis_service.py +1319 -0
  27. isa_model/inference/services/vision/base_image_gen_service.py +0 -34
  28. isa_model/inference/services/vision/base_vision_service.py +46 -2
  29. isa_model/inference/services/vision/isA_vision_service.py +402 -0
  30. isa_model/inference/services/vision/openai_vision_service.py +151 -9
  31. isa_model/inference/services/vision/replicate_image_gen_service.py +166 -38
  32. isa_model/inference/services/vision/replicate_vision_service.py +693 -0
  33. isa_model/serving/__init__.py +19 -0
  34. isa_model/serving/api/__init__.py +10 -0
  35. isa_model/serving/api/fastapi_server.py +84 -0
  36. isa_model/serving/api/middleware/__init__.py +9 -0
  37. isa_model/serving/api/middleware/request_logger.py +88 -0
  38. isa_model/serving/api/routes/__init__.py +5 -0
  39. isa_model/serving/api/routes/health.py +82 -0
  40. isa_model/serving/api/routes/llm.py +19 -0
  41. isa_model/serving/api/routes/ui_analysis.py +223 -0
  42. isa_model/serving/api/routes/vision.py +19 -0
  43. isa_model/serving/api/schemas/__init__.py +17 -0
  44. isa_model/serving/api/schemas/common.py +33 -0
  45. isa_model/serving/api/schemas/ui_analysis.py +78 -0
  46. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/METADATA +1 -1
  47. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/RECORD +49 -17
  48. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/WHEEL +0 -0
  49. {isa_model-0.3.3.dist-info → isa_model-0.3.5.dist-info}/top_level.txt +0 -0
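The hunk below (+1,319 lines) is the new stacked UI analysis service, entry 26 in the list above (isa_model/inference/services/stacked/ui_analysis_service.py). A minimal construction sketch follows; it assumes AIFactory is the class exported by isa_model/inference/ai_factory.py and that it can be instantiated without arguments, neither of which is shown in this diff:

from isa_model.inference.ai_factory import AIFactory  # assumed class name; ai_factory.py is entry 11 above
from isa_model.inference.services.stacked.ui_analysis_service import UIAnalysisService

factory = AIFactory()  # hypothetical zero-argument construction

# task_type selects the layer setup: "login" (default), "search", "content", or "navigation"
service = UIAnalysisService(factory, config={"task_type": "search"})

# Optionally retune the detection layer (parameter names as used in the layer configs below)
service.configure_detection_model("omniparser", {"imgsz": 1024, "box_threshold": 0.03})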
@@ -0,0 +1,1319 @@
1
+ """
2
+ UI Analysis Stacked Service
3
+
4
+ A stacked service for analyzing UI screenshots using multiple AI models:
5
+ - Layer 1: Page Intelligence (GPT-4V)
6
+ - Layer 2: Element Detection (OmniParser/Florence-2/YOLO)
7
+ - Layer 3: Classification (GPT-4V)
8
+ """
9
+
10
+ from typing import Dict, Any, List, Optional, Union, BinaryIO
11
+ import json
12
+ from PIL import Image, ImageDraw, ImageFont
13
+
14
+ from .base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult
15
+
16
+ class UIAnalysisService(BaseStackedService):
17
+ """
18
+ Generic UI Analysis service using stacked AI models for comprehensive UI understanding.
19
+ Can handle different page types: login, search, content extraction, navigation, etc.
20
+ """
21
+
22
+ def __init__(self, ai_factory, config=None):
23
+ super().__init__(ai_factory, "ui_analysis")
24
+ self.ai_factory = ai_factory
25
+ self.config = config or {}
26
+ self.task_type = self.config.get("task_type", "login") # Default to login for backward compatibility
27
+ self._setup_layers_by_task()
28
+
29
+ def _setup_layers_by_task(self):
30
+ """Setup layer configuration based on task type"""
31
+
32
+ if self.task_type == "search":
33
+ self._setup_search_layers()
34
+ elif self.task_type == "content":
35
+ self._setup_content_layers()
36
+ elif self.task_type == "navigation":
37
+ self._setup_navigation_layers()
38
+ else:
39
+ # Default to login/form analysis
40
+ self._setup_default_layers()
41
+
42
+ def _setup_default_layers(self):
43
+ """Setup simplified two-layer architecture for UI analysis"""
44
+
45
+ # Layer 1: OmniParser element detection
46
+ self.add_layer(LayerConfig(
47
+ name="ui_detection",
48
+ layer_type=LayerType.DETECTION,
49
+ service_type="vision",
50
+ model_name="omniparser",
51
+ parameters={
52
+ "task": "ui_detection",
53
+ "imgsz": 640,
54
+ "box_threshold": 0.05,
55
+ "iou_threshold": 0.1
56
+ },
57
+ depends_on=[],
58
+ timeout=30.0,
59
+ fallback_enabled=True
60
+ ))
61
+
62
+ # Layer 2: GPT-4.1-nano intelligent action planning
63
+ self.add_layer(LayerConfig(
64
+ name="action_planning",
65
+ layer_type=LayerType.INTELLIGENCE,
66
+ service_type="vision",
67
+ model_name="default",
68
+ parameters={
69
+ "task": "action_planning",
70
+ "max_tokens": 500
71
+ },
72
+ depends_on=["ui_detection"],
73
+ timeout=15.0,
74
+ fallback_enabled=False
75
+ ))
76
+
77
+ def _setup_search_layers(self):
78
+ """Setup simplified two-layer architecture for search page analysis"""
79
+
80
+ self.add_layer(LayerConfig(
81
+ name="ui_detection",
82
+ layer_type=LayerType.DETECTION,
83
+ service_type="vision",
84
+ model_name="omniparser",
85
+ parameters={
86
+ "task": "ui_detection",
87
+ "imgsz": 640,
88
+ "box_threshold": 0.05,
89
+ "iou_threshold": 0.1
90
+ },
91
+ depends_on=[],
92
+ timeout=30.0,
93
+ fallback_enabled=True
94
+ ))
95
+
96
+ self.add_layer(LayerConfig(
97
+ name="action_planning",
98
+ layer_type=LayerType.INTELLIGENCE,
99
+ service_type="vision",
100
+ model_name="default",
101
+ parameters={
102
+ "task": "search_action_planning",
103
+ "max_tokens": 500
104
+ },
105
+ depends_on=["ui_detection"],
106
+ timeout=15.0,
107
+ fallback_enabled=False
108
+ ))
109
+
110
+ def _setup_content_layers(self):
111
+ """Setup simplified two-layer architecture for content extraction"""
112
+
113
+ self.add_layer(LayerConfig(
114
+ name="ui_detection",
115
+ layer_type=LayerType.DETECTION,
116
+ service_type="vision",
117
+ model_name="omniparser",
118
+ parameters={
119
+ "task": "ui_detection",
120
+ "imgsz": 640,
121
+ "box_threshold": 0.05,
122
+ "iou_threshold": 0.1
123
+ },
124
+ depends_on=[],
125
+ timeout=30.0,
126
+ fallback_enabled=True
127
+ ))
128
+
129
+ self.add_layer(LayerConfig(
130
+ name="action_planning",
131
+ layer_type=LayerType.INTELLIGENCE,
132
+ service_type="vision",
133
+ model_name="default",
134
+ parameters={
135
+ "task": "content_action_planning",
136
+ "max_tokens": 500
137
+ },
138
+ depends_on=["ui_detection"],
139
+ timeout=15.0,
140
+ fallback_enabled=False
141
+ ))
142
+
143
+ def _setup_navigation_layers(self):
144
+ """Setup simplified two-layer architecture for navigation analysis"""
145
+
146
+ self.add_layer(LayerConfig(
147
+ name="ui_detection",
148
+ layer_type=LayerType.DETECTION,
149
+ service_type="vision",
150
+ model_name="omniparser",
151
+ parameters={
152
+ "task": "ui_detection",
153
+ "imgsz": 640,
154
+ "box_threshold": 0.05,
155
+ "iou_threshold": 0.1
156
+ },
157
+ depends_on=[],
158
+ timeout=30.0,
159
+ fallback_enabled=True
160
+ ))
161
+
162
+ self.add_layer(LayerConfig(
163
+ name="action_planning",
164
+ layer_type=LayerType.INTELLIGENCE,
165
+ service_type="vision",
166
+ model_name="default",
167
+ parameters={
168
+ "task": "navigation_action_planning",
169
+ "max_tokens": 500
170
+ },
171
+ depends_on=["ui_detection"],
172
+ timeout=15.0,
173
+ fallback_enabled=False
174
+ ))
175
+
176
+ def configure_detection_model(self, model_name: str, parameters: Dict[str, Any]):
177
+ """Configure the detection model and parameters"""
178
+ for layer in self.layers:
179
+ if layer.name in ("ui_detection", "element_detection"):
180
+ layer.model_name = model_name
181
+ layer.parameters.update(parameters)
182
+ break
183
+
184
+ def configure_intelligence_model(self, model_name: str, parameters: Dict[str, Any]):
185
+ """Configure the page intelligence model"""
186
+ for layer in self.layers:
187
+ if layer.name in ("action_planning", "page_intelligence"):
188
+ layer.model_name = model_name
189
+ layer.parameters.update(parameters)
190
+ break
191
+
192
+ async def execute_layer_logic(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Any:
193
+ """Execute specific logic for each layer type using unified invoke method"""
194
+
195
+ task = layer.parameters.get("task")
196
+ image_path = context["input"]["image_path"]
197
+
198
+ if task == "ui_detection":
199
+ return await self._invoke_ui_detection(service, image_path, layer.parameters)
200
+
201
+ elif task in ["action_planning", "search_action_planning", "content_action_planning", "navigation_action_planning"]:
202
+ ui_elements = context["results"]["ui_detection"].data
203
+ return await self._invoke_action_planning(service, image_path, ui_elements, layer.parameters)
204
+
205
+ else:
206
+ raise ValueError(f"Unsupported task: {task}")
207
+
208
+ # ==================== SIMPLIFIED TWO-LAYER METHODS ====================
209
+
210
+ async def _invoke_ui_detection(self, service: Any, image_path: str, parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
211
+ """执行UI元素检测(第一层)"""
212
+
213
+ if hasattr(service, 'run_omniparser'):
214
+ result = await service.run_omniparser(
215
+ image=image_path,
216
+ **{k: v for k, v in parameters.items() if k != "task"}
217
+ )
218
+
219
+ # Return all elements (both text and interactive)
220
+ elements = result.get("parsed_elements", [])
221
+
222
+ # Attach additional useful metadata
223
+ for i, element in enumerate(elements):
224
+ element['element_id'] = i
225
+ element['element_index'] = i
226
+
227
+ # Compute center-point coordinates
228
+ bbox = element.get('bbox', [0, 0, 0, 0])
229
+ if len(bbox) == 4:
230
+ # Convert normalized coordinates to pixel coordinates
231
+ img = Image.open(image_path)
232
+ img_width, img_height = img.size
233
+
234
+ x1, y1, x2, y2 = bbox
235
+ pixel_x1 = int(x1 * img_width)
236
+ pixel_y1 = int(y1 * img_height)
237
+ pixel_x2 = int(x2 * img_width)
238
+ pixel_y2 = int(y2 * img_height)
239
+
240
+ element['pixel_bbox'] = [pixel_x1, pixel_y1, pixel_x2, pixel_y2]
241
+ element['center'] = [
242
+ (pixel_x1 + pixel_x2) // 2,
243
+ (pixel_y1 + pixel_y2) // 2
244
+ ]
245
+ element['size'] = [pixel_x2 - pixel_x1, pixel_y2 - pixel_y1]
246
+
247
+ return elements
248
+ else:
249
+ raise ValueError("OmniParser service not available")
250
+
251
+ async def _invoke_action_planning(self, service: Any, image_path: str, ui_elements: List[Dict[str, Any]], parameters: Dict[str, Any]) -> Dict[str, Any]:
252
+ """执行行动规划(第二层)"""
253
+
254
+ task_type = parameters.get("task", "action_planning")
255
+
256
+ # Build element summaries
257
+ elements_summary = []
258
+ interactive_elements = []
259
+
260
+ for element in ui_elements:
261
+ summary = {
262
+ "id": element.get('element_id'),
263
+ "type": element.get('type'),
264
+ "center": element.get('center'),
265
+ "content": element.get('content', ''),
266
+ "interactivity": element.get('interactivity', False),
267
+ "size": element.get('size')
268
+ }
269
+ elements_summary.append(summary)
270
+
271
+ if element.get('interactivity', False):
272
+ interactive_elements.append(summary)
273
+
274
+ # Build the action-planning prompt
275
+ prompt = self._build_action_planning_prompt(
276
+ task_type=task_type,
277
+ elements_summary=elements_summary,
278
+ interactive_elements=interactive_elements
279
+ )
280
+
281
+ # Call GPT to make the decision
282
+ result = await service.invoke(
283
+ image=image_path,
284
+ prompt=prompt,
285
+ task="analyze",
286
+ max_tokens=parameters.get("max_tokens", 500)
287
+ )
288
+
289
+ # Parse the decision result
290
+ decision = self._parse_action_plan(result.get('text', ''))
291
+
292
+ # Match the decision to the detected elements
293
+ action_plan = self._match_actions_to_elements(decision, ui_elements)
294
+
295
+ return action_plan
296
+
297
+ # ==================== UNIFIED INVOKE METHODS ====================
298
+
299
+ async def _invoke_page_intelligence(self, service: Any, image_path: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
300
+ """Invoke page intelligence analysis using unified interface"""
301
+
302
+ task = parameters.get("task", "page_intelligence")
303
+ prompt = self._get_intelligence_prompt(task)
304
+
305
+ # Use unified invoke method
306
+ result = await service.invoke(
307
+ image=image_path,
308
+ prompt=prompt,
309
+ task="analyze",
310
+ max_tokens=parameters.get("max_tokens", 500)
311
+ )
312
+
313
+ # Parse JSON response
314
+ response_text = result['text'].strip()
315
+ json_start = response_text.find('{')
316
+ json_end = response_text.rfind('}') + 1
317
+
318
+ if json_start >= 0 and json_end > json_start:
319
+ json_text = response_text[json_start:json_end]
320
+ try:
321
+ intelligence_data = json.loads(json_text)
322
+ except json.JSONDecodeError:
323
+ intelligence_data = self._parse_intelligence_fallback(response_text)
324
+ else:
325
+ intelligence_data = self._parse_intelligence_fallback(response_text)
326
+
327
+ return intelligence_data
328
+
329
+ def _get_intelligence_prompt(self, task: str) -> str:
330
+ """Get task-specific prompt for page intelligence"""
331
+
332
+ if task == "search_page_intelligence":
333
+ return '''Analyze this webpage screenshot to understand the search interface structure.
334
+
335
+ Identify:
336
+ 1. Page type (search_engine, search_results, query_page, homepage)
337
+ 2. Search elements (search box, search button, filters, suggestions)
338
+ 3. Layout pattern (header_search, center_search, sidebar_search)
339
+ 4. Language used in the interface
340
+ 5. Additional features (voice search, image search, advanced options)
341
+ 6. Visible text elements and labels
342
+
343
+ Return analysis as JSON with this exact structure:
344
+ {
345
+ "page_type": "search_engine|search_results|query_page|homepage|other",
346
+ "layout_pattern": "header_search|center_search|sidebar_search|embedded",
347
+ "language": "en|zh|es|fr|de|other",
348
+ "search_features": ["voice_search", "image_search", "advanced_options", "suggestions"],
349
+ "complexity_score": 0.1-1.0,
350
+ "visible_text_elements": ["Search", "Google Search", "I'm Feeling Lucky"],
351
+ "search_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
352
+ "confidence": 0.1-1.0,
353
+ "analysis_notes": "brief description of what you observe"
354
+ }
355
+
356
+ Be precise and only include elements you can clearly see.'''
357
+
358
+ elif task == "content_page_intelligence":
359
+ return '''Analyze this webpage screenshot to understand the content structure.
360
+
361
+ Identify:
362
+ 1. Page type (article, blog, news, documentation, product_page)
363
+ 2. Content layout (single_column, multi_column, grid, sidebar)
364
+ 3. Content elements (headings, paragraphs, images, videos, links)
365
+ 4. Language used in the interface
366
+ 5. Navigation elements (menu, breadcrumbs, pagination)
367
+ 6. Visible text content and structure
368
+
369
+ Return analysis as JSON with this exact structure:
370
+ {
371
+ "page_type": "article|blog|news|documentation|product_page|other",
372
+ "layout_pattern": "single_column|multi_column|grid|sidebar",
373
+ "language": "en|zh|es|fr|de|other",
374
+ "content_features": ["headings", "paragraphs", "images", "videos", "links", "navigation"],
375
+ "complexity_score": 0.1-1.0,
376
+ "visible_text_elements": ["Title", "Content", "Read More"],
377
+ "content_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
378
+ "confidence": 0.1-1.0,
379
+ "analysis_notes": "brief description of what you observe"
380
+ }
381
+
382
+ Be precise and only include elements you can clearly see.'''
383
+
384
+ elif task == "navigation_page_intelligence":
385
+ return '''Analyze this webpage screenshot to understand the navigation structure.
386
+
387
+ Identify:
388
+ 1. Page type (homepage, category_page, landing_page, dashboard)
389
+ 2. Navigation elements (menu, toolbar, sidebar, footer)
390
+ 3. Layout pattern (horizontal_nav, vertical_nav, dropdown_nav, mega_menu)
391
+ 4. Language used in the interface
392
+ 5. Interactive elements (buttons, links, icons, search)
393
+ 6. Visible navigation labels and structure
394
+
395
+ Return analysis as JSON with this exact structure:
396
+ {
397
+ "page_type": "homepage|category_page|landing_page|dashboard|other",
398
+ "layout_pattern": "horizontal_nav|vertical_nav|dropdown_nav|mega_menu",
399
+ "language": "en|zh|es|fr|de|other",
400
+ "navigation_features": ["main_menu", "sidebar", "footer", "breadcrumbs", "search"],
401
+ "complexity_score": 0.1-1.0,
402
+ "visible_text_elements": ["Home", "About", "Contact", "Products"],
403
+ "navigation_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
404
+ "confidence": 0.1-1.0,
405
+ "analysis_notes": "brief description of what you observe"
406
+ }
407
+
408
+ Be precise and only include elements you can clearly see.'''
409
+
410
+ else:
411
+ # Default login/form analysis prompt
412
+ return '''Analyze this webpage screenshot to understand the login interface structure.
413
+
414
+ Identify:
415
+ 1. Page type (login, register, multi-step auth, SSO)
416
+ 2. Layout pattern (vertical form, horizontal, modal, tabs)
417
+ 3. Language used in the interface
418
+ 4. Security features visible (CAPTCHA, 2FA indicators)
419
+ 5. Form complexity level
420
+ 6. Visible text elements that indicate field purposes
421
+
422
+ Return analysis as JSON with this exact structure:
423
+ {
424
+ "page_type": "login|register|multi_step|sso|other",
425
+ "layout_pattern": "vertical|horizontal|modal|tabs|embedded",
426
+ "language": "en|zh|es|fr|de|other",
427
+ "security_features": ["captcha", "recaptcha", "2fa_indicator", "security_questions"],
428
+ "complexity_score": 0.1-1.0,
429
+ "visible_text_elements": ["Login", "Password", "Sign In"],
430
+ "form_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
431
+ "confidence": 0.1-1.0,
432
+ "analysis_notes": "brief description of what you observe"
433
+ }
434
+
435
+ Be precise and only include elements you can clearly see.'''
436
+
437
+ async def _invoke_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
438
+ """Invoke element detection using unified interface"""
439
+
440
+ # Adapt parameters based on page intelligence
441
+ complexity = intelligence.get('complexity_score', 0.5)
442
+
443
+ # Check if this is omniparser service
444
+ if hasattr(service, 'run_omniparser'):
445
+ # Use replicate omniparser
446
+ params = {}
447
+ if complexity > 0.7:
448
+ params.update({
449
+ "box_threshold": 0.03,
450
+ "iou_threshold": 0.1,
451
+ "imgsz": 1024
452
+ })
453
+ else:
454
+ params.update({
455
+ "box_threshold": parameters.get("box_threshold", 0.05),
456
+ "iou_threshold": parameters.get("iou_threshold", 0.1),
457
+ "imgsz": parameters.get("imgsz", 640)
458
+ })
459
+
460
+ result = await service.run_omniparser(image=image_path, **params)
461
+ # Filter for interactive elements only
462
+ elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
463
+ return elements
464
+ else:
465
+ # Use fallback generic detection
466
+ return await self._fallback_element_detection(service, image_path, intelligence)
467
+
468
+ async def _invoke_element_classification(self, service: Any, image_path: str, elements: List[Dict[str, Any]], intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
469
+ """Invoke element classification using unified interface"""
470
+
471
+ classified_elements = []
472
+ img = Image.open(image_path)
473
+
474
+ for i, element in enumerate(elements):
475
+ # Crop element region with padding
476
+ bbox = element['bbox']
477
+ x1, y1, x2, y2 = bbox
478
+
479
+ padding_x = max(20, int((x2 - x1) * 0.2))
480
+ padding_y = max(20, int((y2 - y1) * 0.2))
481
+
482
+ crop_x1 = max(0, x1 - padding_x)
483
+ crop_y1 = max(0, y1 - padding_y)
484
+ crop_x2 = min(img.width, x2 + padding_x)
485
+ crop_y2 = min(img.height, y2 + padding_y)
486
+
487
+ cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
488
+ crop_path = f"temp_classify_{i}.png"
489
+ cropped_img.save(crop_path)
490
+
491
+ try:
492
+ context_info = f'''Page Context:
493
+ - Type: {intelligence.get('page_type', 'unknown')}
494
+ - Layout: {intelligence.get('layout_pattern', 'unknown')}
495
+ - Language: {intelligence.get('language', 'en')}
496
+ - Complexity: {intelligence.get('complexity_score', 0.5)}
497
+
498
+ Element Context:
499
+ - Position: {element.get('center', [0, 0])} (center)
500
+ - Size: {element.get('size', [0, 0])} (width x height)
501
+ - Element {i+1} of {len(elements)} total elements'''
502
+
503
+ prompt = f'''Classify this UI element from a login/authentication interface.
504
+
505
+ {context_info}
506
+
507
+ Classify as one of:
508
+ - username_field: for username, email, user ID inputs
509
+ - password_field: for password inputs
510
+ - confirm_password: for password confirmation fields
511
+ - login_button: for sign in/login buttons
512
+ - register_button: for sign up/register buttons
513
+ - submit_button: for general form submission
514
+ - checkbox: for remember me, terms agreement
515
+ - link: for forgot password, register links
516
+ - other: for unrelated elements
517
+
518
+ Response format:
519
+ {{
520
+ "classification": "username_field|password_field|login_button|other",
521
+ "confidence": 0.1-1.0,
522
+ "reasoning": "brief explanation of classification decision",
523
+ "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
524
+ "interaction_priority": 1-10
525
+ }}'''
526
+
527
+ # Use unified invoke method
528
+ result = await service.invoke(
529
+ image=crop_path,
530
+ prompt=prompt,
531
+ task="analyze",
532
+ max_tokens=parameters.get("max_tokens", 300)
533
+ )
534
+
535
+ classification_data = self._parse_classification_result(result['text'])
536
+
537
+ # Use original detection center coordinates
538
+ classified_element = {
539
+ **element,
540
+ 'classification': classification_data.get('classification', 'other'),
541
+ 'classification_confidence': classification_data.get('confidence', 0.5),
542
+ 'precise_center': element.get('center', [0, 0]),
543
+ 'reasoning': classification_data.get('reasoning', ''),
544
+ 'visual_evidence': classification_data.get('visual_evidence', []),
545
+ 'interaction_priority': classification_data.get('interaction_priority', 5),
546
+ 'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
547
+ }
548
+
549
+ classified_elements.append(classified_element)
550
+
551
+ except Exception as e:
552
+ # Keep element with basic classification
553
+ classified_element = {
554
+ **element,
555
+ 'classification': 'other',
556
+ 'classification_confidence': 0.3,
557
+ 'precise_center': element.get('center', [0, 0]),
558
+ 'reasoning': f'Classification failed: {str(e)}',
559
+ 'visual_evidence': [],
560
+ 'interaction_priority': 5,
561
+ 'error': str(e)
562
+ }
563
+ classified_elements.append(classified_element)
564
+
565
+ finally:
566
+ # Cleanup temp file
567
+ try:
568
+ import os
569
+ os.remove(crop_path)
570
+ except:
571
+ pass
572
+
573
+ return classified_elements
574
+
575
+ # ==================== LEGACY METHODS (for compatibility) ====================
576
+
577
+ async def _execute_page_intelligence(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Dict[str, Any]:
578
+ """Execute page intelligence analysis"""
579
+ image_path = context["input"]["image_path"]
580
+
581
+ prompt = '''Analyze this webpage screenshot to understand the login interface structure.
582
+
583
+ Identify:
584
+ 1. Page type (login, register, multi-step auth, SSO)
585
+ 2. Layout pattern (vertical form, horizontal, modal, tabs)
586
+ 3. Language used in the interface
587
+ 4. Security features visible (CAPTCHA, 2FA indicators)
588
+ 5. Form complexity level
589
+ 6. Visible text elements that indicate field purposes
590
+
591
+ Return analysis as JSON with this exact structure:
592
+ {
593
+ "page_type": "login|register|multi_step|sso|other",
594
+ "layout_pattern": "vertical|horizontal|modal|tabs|embedded",
595
+ "language": "en|zh|es|fr|de|other",
596
+ "security_features": ["captcha", "recaptcha", "2fa_indicator", "security_questions"],
597
+ "complexity_score": 0.1-1.0,
598
+ "visible_text_elements": ["Login", "Password", "Sign In"],
599
+ "form_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
600
+ "confidence": 0.1-1.0,
601
+ "analysis_notes": "brief description of what you observe"
602
+ }
603
+
604
+ Be precise and only include elements you can clearly see.'''
605
+
606
+ result = await service.invoke(
607
+ image=image_path,
608
+ prompt=prompt,
609
+ task="analyze",
610
+ max_tokens=layer.parameters.get("max_tokens", 500)
611
+ )
612
+
613
+ # Parse JSON response
614
+ response_text = result['text'].strip()
615
+ json_start = response_text.find('{')
616
+ json_end = response_text.rfind('}') + 1
617
+
618
+ if json_start >= 0 and json_end > json_start:
619
+ json_text = response_text[json_start:json_end]
620
+ try:
621
+ intelligence_data = json.loads(json_text)
622
+ except json.JSONDecodeError:
623
+ # Fallback parsing
624
+ intelligence_data = self._parse_intelligence_fallback(response_text)
625
+ else:
626
+ intelligence_data = self._parse_intelligence_fallback(response_text)
627
+
628
+ return intelligence_data
629
+
630
+ async def _execute_element_detection(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
631
+ """Execute element detection"""
632
+ image_path = context["input"]["image_path"]
633
+ intelligence = context["results"]["page_intelligence"].data
634
+
635
+ # Adapt parameters based on page intelligence
636
+ complexity = intelligence.get('complexity_score', 0.5)
637
+ params = layer.parameters.copy()
638
+
639
+ if complexity > 0.7:
640
+ params.update({
641
+ "box_threshold": 0.03,
642
+ "iou_threshold": 0.1,
643
+ "imgsz": 1024
644
+ })
645
+ elif complexity > 0.4:
646
+ params.update({
647
+ "box_threshold": 0.05,
648
+ "iou_threshold": 0.1,
649
+ "imgsz": 640
650
+ })
651
+ else:
652
+ params.update({
653
+ "box_threshold": 0.08,
654
+ "iou_threshold": 0.2,
655
+ "imgsz": 640
656
+ })
657
+
658
+ # Run detection based on model type
659
+ if layer.model_name == "omniparser":
660
+ if hasattr(service, 'run_omniparser'):
661
+ result = await service.run_omniparser(
662
+ image=image_path,
663
+ **params
664
+ )
665
+ # Filter for interactive elements only
666
+ elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
667
+ else:
668
+ # Fallback for services without omniparser support
669
+ elements = await self._fallback_element_detection(service, image_path, intelligence)
670
+
671
+ elif layer.model_name == "florence-2":
672
+ if hasattr(service, 'run_florence2'):
673
+ result = await service.run_florence2(
674
+ image=image_path,
675
+ task="<OPEN_VOCABULARY_DETECTION>",
676
+ text_input="login form elements, input fields, buttons"
677
+ )
678
+ elements = result.get("parsed_objects", [])
679
+ else:
680
+ elements = await self._fallback_element_detection(service, image_path, intelligence)
681
+
682
+ elif layer.model_name == "yolov8":
683
+ if hasattr(service, 'run_yolo'):
684
+ result = await service.run_yolo(
685
+ image=image_path,
686
+ confidence=params.get("box_threshold", 0.5)
687
+ )
688
+ elements = result.get("detected_objects", [])
689
+ else:
690
+ elements = await self._fallback_element_detection(service, image_path, intelligence)
691
+
692
+ else:
693
+ # Fallback to generic object detection
694
+ elements = await self._fallback_element_detection(service, image_path, intelligence)
695
+
696
+ return elements
697
+
698
+ async def _execute_element_classification(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
699
+ """Execute element classification"""
700
+ image_path = context["input"]["image_path"]
701
+ elements = context["results"]["element_detection"].data
702
+ intelligence = context["results"]["page_intelligence"].data
703
+
704
+ classified_elements = []
705
+ img = Image.open(image_path)
706
+
707
+ for i, element in enumerate(elements):
708
+ # Crop element region with padding
709
+ bbox = element['bbox']
710
+ x1, y1, x2, y2 = bbox
711
+
712
+ padding_x = max(20, int((x2 - x1) * 0.2))
713
+ padding_y = max(20, int((y2 - y1) * 0.2))
714
+
715
+ crop_x1 = max(0, x1 - padding_x)
716
+ crop_y1 = max(0, y1 - padding_y)
717
+ crop_x2 = min(img.width, x2 + padding_x)
718
+ crop_y2 = min(img.height, y2 + padding_y)
719
+
720
+ cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
721
+ crop_path = f"temp_classify_{i}.png"
722
+ cropped_img.save(crop_path)
723
+
724
+ try:
725
+ context_info = f'''Page Context:
726
+ - Type: {intelligence.get('page_type', 'unknown')}
727
+ - Layout: {intelligence.get('layout_pattern', 'unknown')}
728
+ - Language: {intelligence.get('language', 'en')}
729
+ - Complexity: {intelligence.get('complexity_score', 0.5)}
730
+
731
+ Element Context:
732
+ - Position: {element.get('center', [0, 0])} (center)
733
+ - Size: {element.get('size', [0, 0])} (width x height)
734
+ - Element {i+1} of {len(elements)} total elements'''
735
+
736
+ # Get task-specific classification prompt
737
+ prompt = self._get_classification_prompt(
738
+ parameters.get("task", "element_classification"),
739
+ context_info
740
+ )
741
+
742
+ result = await service.invoke(
743
+ image=crop_path,
744
+ prompt=prompt,
745
+ task="analyze",
746
+ max_tokens=layer.parameters.get("max_tokens", 300)
747
+ )
748
+
749
+ classification_data = self._parse_classification_result(result['text'])
750
+
751
+ # Use original detection center coordinates
752
+ classified_element = {
753
+ **element,
754
+ 'classification': classification_data.get('classification', 'other'),
755
+ 'classification_confidence': classification_data.get('confidence', 0.5),
756
+ 'precise_center': element.get('center', [0, 0]),
757
+ 'reasoning': classification_data.get('reasoning', ''),
758
+ 'visual_evidence': classification_data.get('visual_evidence', []),
759
+ 'interaction_priority': classification_data.get('interaction_priority', 5),
760
+ 'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
761
+ }
762
+
763
+ classified_elements.append(classified_element)
764
+
765
+ except Exception as e:
766
+ # Keep element with basic classification
767
+ classified_element = {
768
+ **element,
769
+ 'classification': 'other',
770
+ 'classification_confidence': 0.3,
771
+ 'precise_center': element.get('center', [0, 0]),
772
+ 'reasoning': f'Classification failed: {str(e)}',
773
+ 'visual_evidence': [],
774
+ 'interaction_priority': 5,
775
+ 'error': str(e)
776
+ }
777
+ classified_elements.append(classified_element)
778
+
779
+ finally:
780
+ # Cleanup temp file
781
+ try:
782
+ import os
783
+ os.remove(crop_path)
784
+ except:
785
+ pass
786
+
787
+ return classified_elements
788
+
789
+ async def execute_fallback(self, layer: LayerConfig, context: Dict[str, Any], error: str) -> Optional[Any]:
790
+ """Execute fallback logic for failed layers"""
791
+
792
+ if layer.layer_type == LayerType.INTELLIGENCE:
793
+ # Return basic page intelligence
794
+ return {
795
+ "page_type": "login",
796
+ "layout_pattern": "vertical",
797
+ "language": "en",
798
+ "security_features": [],
799
+ "complexity_score": 0.5,
800
+ "visible_text_elements": ["Login", "Password"],
801
+ "form_area_estimate": {"x": 200, "y": 200, "width": 600, "height": 400},
802
+ "confidence": 0.3,
803
+ "analysis_notes": f"Fallback analysis due to error: {error}"
804
+ }
805
+
806
+ elif layer.layer_type == LayerType.DETECTION:
807
+ # Create fallback elements based on typical form layout
808
+ intelligence = context["results"]["page_intelligence"].data
809
+ return self._create_fallback_elements(context["input"]["image_path"], intelligence)
810
+
811
+ return None
812
+
813
+ def generate_final_output(self, results: Dict[str, LayerResult]) -> Dict[str, Any]:
814
+ """Generate final UI analysis output for simplified two-layer architecture"""
815
+
816
+ # Extract data from the two layers
817
+ ui_elements = (results["ui_detection"].data or []) if "ui_detection" in results else []
818
+ action_plan = (results["action_planning"].data or {}) if "action_planning" in results else {}
819
+
820
+ # Separate interactive and non-interactive elements
821
+ interactive_elements = [e for e in ui_elements if e.get('interactivity', False)]
822
+ text_elements = [e for e in ui_elements if not e.get('interactivity', False)]
823
+
824
+ return {
825
+ "ui_elements": {
826
+ "total_elements": len(ui_elements),
827
+ "interactive_elements": interactive_elements,
828
+ "text_elements": text_elements,
829
+ "summary": {
830
+ "interactive_count": len(interactive_elements),
831
+ "text_count": len(text_elements)
832
+ }
833
+ },
834
+ "action_plan": action_plan,
835
+ "automation_ready": {
836
+ "ready": action_plan.get("success_probability", 0) > 0.7,
837
+ "confidence": action_plan.get("success_probability", 0),
838
+ "steps_count": len(action_plan.get("action_plan", [])),
839
+ "page_type": action_plan.get("page_analysis", {}).get("page_type", "unknown")
840
+ },
841
+ "execution_summary": {
842
+ "can_automate": len(action_plan.get("action_plan", [])) > 0,
843
+ "recommended_action": action_plan.get("action_plan", [{}])[0] if action_plan.get("action_plan") else None
844
+ }
845
+ }
846
+
847
+ async def visualize_results(self, image_path: str, analysis_result: Dict[str, Any],
848
+ output_path: str = "ui_analysis_result.png") -> str:
849
+ """Generate visualization of the analysis results"""
850
+
851
+ # Load original image
852
+ img = Image.open(image_path)
853
+ img_copy = img.copy()
854
+ draw = ImageDraw.Draw(img_copy)
855
+
856
+ try:
857
+ font_large = ImageFont.truetype("arial.ttf", 24)
858
+ font_small = ImageFont.truetype("arial.ttf", 16)
859
+ except:
860
+ font_large = None
861
+ font_small = None
862
+
863
+ colors = {
864
+ 'username_field': 'red',
865
+ 'password_field': 'blue',
866
+ 'login_button': 'green',
867
+ 'register_button': 'orange',
868
+ 'submit_button': 'purple',
869
+ 'checkbox': 'yellow',
870
+ 'link': 'cyan',
871
+ 'other': 'gray'
872
+ }
873
+
874
+ final_output = analysis_result.get("final_output", {})
875
+ classified_elements = final_output.get("classified_elements", [])
876
+
877
+ # Draw each classified element
878
+ for element in classified_elements:
879
+ classification = element.get('classification', 'other')
880
+ confidence = element.get('classification_confidence', 0)
881
+ center = element.get('precise_center', element.get('center', [0, 0]))
882
+ bbox = element.get('bbox', [0, 0, 100, 100])
883
+ priority = element.get('interaction_priority', 5)
884
+
885
+ color = colors.get(classification, 'gray')
886
+ center_x, center_y = center
887
+ x1, y1, x2, y2 = bbox
888
+
889
+ # Draw bounding box
890
+ draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
891
+
892
+ # Draw center crosshair
893
+ crosshair_size = 35
894
+ draw.line([center_x - crosshair_size, center_y, center_x + crosshair_size, center_y],
895
+ fill=color, width=8)
896
+ draw.line([center_x, center_y - crosshair_size, center_x, center_y + crosshair_size],
897
+ fill=color, width=8)
898
+
899
+ # Draw center circle
900
+ circle_radius = 20
901
+ draw.ellipse([center_x - circle_radius, center_y - circle_radius,
902
+ center_x + circle_radius, center_y + circle_radius],
903
+ outline=color, width=8)
904
+
905
+ # Draw labels
906
+ label = f"{classification.replace('_', ' ').title()}"
907
+ detail = f"Conf: {confidence:.2f} | Pri: {priority}"
908
+
909
+ if font_large:
910
+ draw.text((x1, y1 - 50), label, fill=color, font=font_large)
911
+ draw.text((x1, y1 - 25), detail, fill=color, font=font_small)
912
+ else:
913
+ draw.text((x1, y1 - 40), label, fill=color)
914
+ draw.text((x1, y1 - 20), detail, fill=color)
915
+
916
+ # Add header information
917
+ intelligence = final_output.get("page_intelligence", {})
918
+ summary = final_output.get("analysis_summary", {})
919
+
920
+ header_text = f"UI Analysis: {summary.get('page_type', 'unknown')} | Elements: {summary.get('interactive_elements', 0)} | Confidence: {summary.get('overall_confidence', 0):.2f}"
921
+
922
+ if font_large:
923
+ draw.text((10, 10), "Stacked UI Analysis", fill='black', font=font_large)
924
+ draw.text((10, 40), header_text, fill='black', font=font_small)
925
+ else:
926
+ draw.text((10, 10), "Stacked UI Analysis", fill='black')
927
+ draw.text((10, 30), header_text, fill='black')
928
+
929
+ # Save visualization
930
+ img_copy.save(output_path)
931
+ return output_path
932
+
933
+ # Helper methods
934
+ def _parse_intelligence_fallback(self, text: str) -> Dict[str, Any]:
935
+ """Fallback parser for intelligence data"""
936
+ return {
937
+ "page_type": "login",
938
+ "layout_pattern": "vertical",
939
+ "language": "en",
940
+ "security_features": [],
941
+ "complexity_score": 0.6,
942
+ "visible_text_elements": ["Username", "Password", "Login"],
943
+ "form_area_estimate": {"x": 200, "y": 200, "width": 600, "height": 400},
944
+ "confidence": 0.4,
945
+ "analysis_notes": "Parsed from text analysis"
946
+ }
947
+
948
+ def _parse_classification_result(self, text: str) -> Dict[str, Any]:
949
+ """Parse classification result from GPT response"""
950
+ try:
951
+ json_start = text.find('{')
952
+ json_end = text.rfind('}') + 1
953
+
954
+ if json_start >= 0 and json_end > json_start:
955
+ json_text = text[json_start:json_end]
956
+ return json.loads(json_text)
957
+ except:
958
+ pass
959
+
960
+ # Fallback text parsing
961
+ classification = 'other'
962
+ confidence = 0.5
963
+
964
+ text_lower = text.lower()
965
+ if 'username' in text_lower or 'email' in text_lower:
966
+ classification = 'username_field'
967
+ confidence = 0.7
968
+ elif 'password' in text_lower:
969
+ classification = 'password_field'
970
+ confidence = 0.7
971
+ elif 'button' in text_lower and ('login' in text_lower or 'sign' in text_lower):
972
+ classification = 'login_button'
973
+ confidence = 0.7
974
+
975
+ return {
976
+ 'classification': classification,
977
+ 'confidence': confidence,
978
+ 'reasoning': 'Parsed from text',
979
+ 'visual_evidence': ['text_analysis'],
980
+ 'interaction_priority': 5
981
+ }
982
+
983
+ def _get_classification_prompt(self, task: str, context_info: str) -> str:
984
+ """Get task-specific classification prompt"""
985
+
986
+ if task == "search_element_classification":
987
+ return f'''Classify this UI element from a search interface.
988
+
989
+ {context_info}
990
+
991
+ Classify as one of:
992
+ - search_field: for search input boxes, query fields
993
+ - search_button: for search/go buttons, submit search
994
+ - search_suggestion: for autocomplete suggestions
995
+ - filter: for search filters, sorting options
996
+ - voice_search: for voice search buttons
997
+ - image_search: for image search options
998
+ - advanced_search: for advanced search links/options
999
+ - nav_link: for navigation menu items
1000
+ - other: for unrelated elements
1001
+
1002
+ Response format:
1003
+ {{
1004
+ "classification": "search_field|search_button|filter|nav_link|other",
1005
+ "confidence": 0.1-1.0,
1006
+ "reasoning": "brief explanation of classification decision",
1007
+ "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
1008
+ "interaction_priority": 1-10
1009
+ }}'''
1010
+
1011
+ elif task == "content_element_classification":
1012
+ return f'''Classify this UI element from a content page.
1013
+
1014
+ {context_info}
1015
+
1016
+ Classify as one of:
1017
+ - article_title: for main article/page titles
1018
+ - article_body: for main content/article text
1019
+ - sidebar_content: for sidebar information
1020
+ - navigation_menu: for navigation elements
1021
+ - related_links: for related articles/links
1022
+ - comment: for comment sections
1023
+ - share_button: for social sharing
1024
+ - read_more: for read more links
1025
+ - image: for content images
1026
+ - video: for embedded videos
1027
+ - other: for unrelated elements
1028
+
1029
+ Response format:
1030
+ {{
1031
+ "classification": "article_title|article_body|sidebar_content|navigation_menu|other",
1032
+ "confidence": 0.1-1.0,
1033
+ "reasoning": "brief explanation of classification decision",
1034
+ "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
1035
+ "interaction_priority": 1-10
1036
+ }}'''
1037
+
1038
+ elif task == "navigation_element_classification":
1039
+ return f'''Classify this UI element from a navigation-focused page.
1040
+
1041
+ {context_info}
1042
+
1043
+ Classify as one of:
1044
+ - nav_link: for main navigation links
1045
+ - menu_item: for menu items and categories
1046
+ - breadcrumb: for breadcrumb navigation
1047
+ - dropdown_menu: for dropdown menu elements
1048
+ - footer_link: for footer navigation
1049
+ - logo: for site logos/branding
1050
+ - search_box: for site search functionality
1051
+ - user_menu: for user account menus
1052
+ - cta_button: for call-to-action buttons
1053
+ - other: for unrelated elements
1054
+
1055
+ Response format:
1056
+ {{
1057
+ "classification": "nav_link|menu_item|breadcrumb|dropdown_menu|other",
1058
+ "confidence": 0.1-1.0,
1059
+ "reasoning": "brief explanation of classification decision",
1060
+ "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
1061
+ "interaction_priority": 1-10
1062
+ }}'''
1063
+
1064
+ else:
1065
+ # Default login classification
1066
+ return f'''Classify this UI element from a login/authentication interface.
1067
+
1068
+ {context_info}
1069
+
1070
+ Classify as one of:
1071
+ - username_field: for username, email, user ID inputs
1072
+ - password_field: for password inputs
1073
+ - confirm_password: for password confirmation fields
1074
+ - login_button: for sign in/login buttons
1075
+ - register_button: for sign up/register buttons
1076
+ - submit_button: for general form submission
1077
+ - checkbox: for remember me, terms agreement
1078
+ - link: for forgot password, register links
1079
+ - other: for unrelated elements
1080
+
1081
+ Response format:
1082
+ {{
1083
+ "classification": "username_field|password_field|login_button|other",
1084
+ "confidence": 0.1-1.0,
1085
+ "reasoning": "brief explanation of classification decision",
1086
+ "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
1087
+ "interaction_priority": 1-10
1088
+ }}'''
1089
+
1090
+ def _build_action_planning_prompt(self, task_type: str, elements_summary: List[Dict], interactive_elements: List[Dict]) -> str:
1091
+ """构建行动规划提示词"""
1092
+
1093
+ # Task-specific guidance
1094
+ if "search" in task_type:
1095
+ instruction = """
1096
+ 目标:完成搜索操作
1097
+ 需要识别:
1098
+ 1. 搜索输入框
1099
+ 2. 搜索按钮(可选,通常可以按回车)
1100
+ 操作顺序:点击搜索框 → 输入搜索内容 → 点击搜索按钮或按回车
1101
+ """
1102
+ elif "content" in task_type:
1103
+ instruction = """
1104
+ 目标:识别和提取页面内容
1105
+ 需要识别:
1106
+ 1. 主要内容区域
1107
+ 2. 标题和正文
1108
+ 3. 相关链接
1109
+ 4. 导航元素
1110
+ """
1111
+ elif "navigation" in task_type:
1112
+ instruction = """
1113
+ 目标:识别页面导航结构
1114
+ 需要识别:
1115
+ 1. 主导航菜单
1116
+ 2. 子菜单项
1117
+ 3. 面包屑导航
1118
+ 4. 页脚链接
1119
+ """
1120
+ else:
1121
+ instruction = """
1122
+ 目标:完成登录操作
1123
+ 需要识别:
1124
+ 1. 用户名/邮箱输入框
1125
+ 2. 密码输入框
1126
+ 3. 登录/提交按钮
1127
+ 操作顺序:点击用户名框 → 输入用户名 → 点击密码框 → 输入密码 → 点击登录按钮
1128
+ """
1129
+
1130
+ # Build the element list
1131
+ elements_text = "可用的UI元素:\n"
1132
+ for i, elem in enumerate(elements_summary):
1133
+ interactivity_mark = "🔴" if elem['interactivity'] else "⚪"
1134
+ elements_text += f"{interactivity_mark} 元素{elem['id']}: {elem['type']} - \"{elem['content'][:50]}\" - 中心点{elem['center']}\n"
1135
+
1136
+ interactive_text = f"\n交互元素(共{len(interactive_elements)}个):\n"
1137
+ for elem in interactive_elements:
1138
+ interactive_text += f"🔴 元素{elem['id']}: \"{elem['content'][:30]}\" - 中心点{elem['center']}\n"
1139
+
1140
+ return f"""你是一个UI自动化专家。基于以下信息,为网页操作制定精确的行动计划。
1141
+
1142
+ {instruction}
1143
+
1144
+ {elements_text}
1145
+ {interactive_text}
1146
+
1147
+ 请分析这个页面并提供操作计划:
1148
+
1149
+ 1. 确定页面类型和当前状态
1150
+ 2. 识别完成目标所需的关键元素
1151
+ 3. 提供精确的操作步骤(包括点击坐标)
1152
+
1153
+ 返回JSON格式:
1154
+ {{
1155
+ "page_analysis": {{
1156
+ "page_type": "登录页面|搜索页面|内容页面|导航页面",
1157
+ "confidence": 0.1-1.0,
1158
+ "key_elements_found": ["element_type1", "element_type2"]
1159
+ }},
1160
+ "action_plan": [
1161
+ {{
1162
+ "step": 1,
1163
+ "action": "click|type|scroll",
1164
+ "target_element_id": 元素ID,
1165
+ "target_coordinates": [x, y],
1166
+ "description": "操作描述",
1167
+ "input_text": "要输入的文本(如果是type操作)"
1168
+ }}
1169
+ ],
1170
+ "success_probability": 0.1-1.0,
1171
+ "notes": "额外说明"
1172
+ }}
1173
+
1174
+ 只基于实际看到的元素制定计划,确保坐标准确。"""
1175
+
1176
+ def _parse_action_plan(self, text: str) -> Dict[str, Any]:
1177
+ """解析行动计划"""
1178
+ try:
1179
+ # Try to parse JSON
1180
+ json_start = text.find('{')
1181
+ json_end = text.rfind('}') + 1
1182
+
1183
+ if json_start >= 0 and json_end > json_start:
1184
+ json_text = text[json_start:json_end]
1185
+ return json.loads(json_text)
1186
+ except:
1187
+ pass
1188
+
1189
+ # On failure, return a basic fallback plan
1190
+ return {
1191
+ "page_analysis": {
1192
+ "page_type": f"{self.task_type}页面",
1193
+ "confidence": 0.5,
1194
+ "key_elements_found": []
1195
+ },
1196
+ "action_plan": [],
1197
+ "success_probability": 0.3,
1198
+ "notes": "解析失败,使用fallback计划"
1199
+ }
1200
+
1201
+ def _match_actions_to_elements(self, decision: Dict[str, Any], ui_elements: List[Dict[str, Any]]) -> Dict[str, Any]:
1202
+ """将决策与实际UI元素匹配"""
1203
+
1204
+ action_plan = decision.get("action_plan", [])
1205
+
1206
+ # Match each action step to an actual element
1207
+ for step in action_plan:
1208
+ target_id = step.get("target_element_id")
1209
+
1210
+ if target_id is not None:
1211
+ # Find the corresponding element
1212
+ target_element = None
1213
+ for element in ui_elements:
1214
+ if element.get('element_id') == target_id:
1215
+ target_element = element
1216
+ break
1217
+
1218
+ if target_element:
1219
+ # Use the actual element's coordinates
1220
+ step["actual_coordinates"] = target_element.get("center")
1221
+ step["actual_bbox"] = target_element.get("pixel_bbox")
1222
+ step["element_content"] = target_element.get("content")
1223
+ step["element_type"] = target_element.get("type")
1224
+ step["element_size"] = target_element.get("size")
1225
+
1226
+ return {
1227
+ "page_analysis": decision.get("page_analysis", {}),
1228
+ "action_plan": action_plan,
1229
+ "success_probability": decision.get("success_probability", 0.5),
1230
+ "notes": decision.get("notes", ""),
1231
+ "total_steps": len(action_plan),
1232
+ "interactive_elements_available": len([e for e in ui_elements if e.get('interactivity')])
1233
+ }
1234
+
1235
+ def _create_fallback_elements(self, image_path: str, intelligence: Dict[str, Any]) -> List[Dict[str, Any]]:
1236
+ """Create fallback elements based on typical layouts"""
1237
+ img = Image.open(image_path)
1238
+ img_width, img_height = img.size
1239
+
1240
+ layout = intelligence.get('layout_pattern', 'vertical')
1241
+
1242
+ if layout == 'vertical':
1243
+ form_center_x = img_width // 2
1244
+ form_start_y = img_height // 3
1245
+
1246
+ return [
1247
+ {
1248
+ 'id': 'fallback_username',
1249
+ 'bbox': [form_center_x - 150, form_start_y, form_center_x + 150, form_start_y + 40],
1250
+ 'center': [form_center_x, form_start_y + 20],
1251
+ 'size': [300, 40],
1252
+ 'confidence': 0.6,
1253
+ 'type': 'input'
1254
+ },
1255
+ {
1256
+ 'id': 'fallback_password',
1257
+ 'bbox': [form_center_x - 150, form_start_y + 70, form_center_x + 150, form_start_y + 110],
1258
+ 'center': [form_center_x, form_start_y + 90],
1259
+ 'size': [300, 40],
1260
+ 'confidence': 0.6,
1261
+ 'type': 'input'
1262
+ },
1263
+ {
1264
+ 'id': 'fallback_button',
1265
+ 'bbox': [form_center_x - 75, form_start_y + 140, form_center_x + 75, form_start_y + 180],
1266
+ 'center': [form_center_x, form_start_y + 160],
1267
+ 'size': [150, 40],
1268
+ 'confidence': 0.5,
1269
+ 'type': 'button'
1270
+ }
1271
+ ]
1272
+ else:
1273
+ return [
1274
+ {
1275
+ 'id': 'fallback_form',
1276
+ 'bbox': [img_width//4, img_height//3, 3*img_width//4, 2*img_height//3],
1277
+ 'center': [img_width//2, img_height//2],
1278
+ 'size': [img_width//2, img_height//3],
1279
+ 'confidence': 0.4,
1280
+ 'type': 'form'
1281
+ }
1282
+ ]
1283
+
1284
+ async def _fallback_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any]) -> List[Dict[str, Any]]:
1285
+ """Fallback element detection using generic methods"""
1286
+ try:
1287
+ # Try generic object detection
1288
+ result = await service.detect_objects(image_path, confidence_threshold=0.5)
1289
+ objects = result.get("objects", [])
1290
+
1291
+ # Convert to standard format
1292
+ elements = []
1293
+ for i, obj in enumerate(objects):
1294
+ coords = obj.get("coordinates", {})
1295
+ if all(k in coords for k in ['x', 'y', 'width', 'height']):
1296
+ # Convert percentage to pixels
1297
+ img = Image.open(image_path)
1298
+ img_width, img_height = img.size
1299
+
1300
+ x = int(coords['x'] * img_width / 100)
1301
+ y = int(coords['y'] * img_height / 100)
1302
+ w = int(coords['width'] * img_width / 100)
1303
+ h = int(coords['height'] * img_height / 100)
1304
+
1305
+ element = {
1306
+ 'id': f'fallback_{i}',
1307
+ 'bbox': [x, y, x + w, y + h],
1308
+ 'center': [x + w//2, y + h//2],
1309
+ 'size': [w, h],
1310
+ 'confidence': obj.get('confidence', 0.7),
1311
+ 'type': 'detected'
1312
+ }
1313
+ elements.append(element)
1314
+
1315
+ return elements
1316
+
1317
+ except Exception:
1318
+ # Ultimate fallback
1319
+ return self._create_fallback_elements(image_path, intelligence)
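The detection layer (_invoke_ui_detection above) converts OmniParser's normalized [x1, y1, x2, y2] boxes into pixel boxes, centers, and sizes. A standalone sketch of that same conversion, useful when consuming the service's ui_elements output; the function name and example bbox here are illustrative only:

from PIL import Image

def to_pixel_geometry(image_path, bbox):
    # bbox is a normalized [x1, y1, x2, y2] in the 0-1 range, as returned by OmniParser
    img_width, img_height = Image.open(image_path).size
    x1, y1, x2, y2 = bbox
    px1, py1 = int(x1 * img_width), int(y1 * img_height)
    px2, py2 = int(x2 * img_width), int(y2 * img_height)
    return {
        "pixel_bbox": [px1, py1, px2, py2],
        "center": [(px1 + px2) // 2, (py1 + py2) // 2],
        "size": [px2 - px1, py2 - py1],
    }

# e.g. to_pixel_geometry("screenshot.png", [0.1, 0.2, 0.4, 0.3]) on a 1000x800 screenshot
# -> {"pixel_bbox": [100, 160, 400, 240], "center": [250, 200], "size": [300, 80]}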