isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- isa_model/__init__.py +30 -1
- isa_model/client.py +770 -0
- isa_model/core/config/__init__.py +16 -0
- isa_model/core/config/config_manager.py +514 -0
- isa_model/core/config.py +426 -0
- isa_model/core/models/model_billing_tracker.py +476 -0
- isa_model/core/models/model_manager.py +399 -0
- isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
- isa_model/core/pricing_manager.py +426 -0
- isa_model/core/services/__init__.py +19 -0
- isa_model/core/services/intelligent_model_selector.py +547 -0
- isa_model/core/types.py +291 -0
- isa_model/deployment/__init__.py +2 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
- isa_model/deployment/cloud/modal/register_models.py +321 -0
- isa_model/deployment/runtime/deployed_service.py +338 -0
- isa_model/deployment/services/__init__.py +9 -0
- isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
- isa_model/deployment/services/model_service.py +332 -0
- isa_model/deployment/services/service_monitor.py +356 -0
- isa_model/deployment/services/service_registry.py +527 -0
- isa_model/eval/__init__.py +80 -44
- isa_model/eval/config/__init__.py +10 -0
- isa_model/eval/config/evaluation_config.py +108 -0
- isa_model/eval/evaluators/__init__.py +18 -0
- isa_model/eval/evaluators/base_evaluator.py +503 -0
- isa_model/eval/evaluators/llm_evaluator.py +472 -0
- isa_model/eval/factory.py +417 -709
- isa_model/eval/infrastructure/__init__.py +24 -0
- isa_model/eval/infrastructure/experiment_tracker.py +466 -0
- isa_model/eval/metrics.py +191 -21
- isa_model/inference/ai_factory.py +181 -605
- isa_model/inference/services/audio/base_stt_service.py +65 -1
- isa_model/inference/services/audio/base_tts_service.py +75 -1
- isa_model/inference/services/audio/openai_stt_service.py +189 -151
- isa_model/inference/services/audio/openai_tts_service.py +12 -10
- isa_model/inference/services/audio/replicate_tts_service.py +61 -56
- isa_model/inference/services/base_service.py +55 -17
- isa_model/inference/services/embedding/base_embed_service.py +65 -1
- isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
- isa_model/inference/services/embedding/openai_embed_service.py +8 -10
- isa_model/inference/services/helpers/stacked_config.py +148 -0
- isa_model/inference/services/img/__init__.py +18 -0
- isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
- isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
- isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
- isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
- isa_model/inference/services/llm/__init__.py +3 -3
- isa_model/inference/services/llm/base_llm_service.py +492 -40
- isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
- isa_model/inference/services/llm/ollama_llm_service.py +51 -17
- isa_model/inference/services/llm/openai_llm_service.py +70 -19
- isa_model/inference/services/llm/yyds_llm_service.py +24 -23
- isa_model/inference/services/vision/__init__.py +38 -4
- isa_model/inference/services/vision/base_vision_service.py +218 -117
- isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
- isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
- isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/vision/helpers/image_utils.py +272 -3
- isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
- isa_model/inference/services/vision/openai_vision_service.py +104 -307
- isa_model/inference/services/vision/replicate_vision_service.py +140 -325
- isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
- isa_model/scripts/register_models.py +370 -0
- isa_model/scripts/register_models_with_embeddings.py +510 -0
- isa_model/serving/api/fastapi_server.py +6 -1
- isa_model/serving/api/routes/unified.py +202 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
- isa_model/config/__init__.py +0 -9
- isa_model/config/config_manager.py +0 -213
- isa_model/core/model_manager.py +0 -213
- isa_model/core/model_registry.py +0 -375
- isa_model/core/vision_models_init.py +0 -116
- isa_model/inference/billing_tracker.py +0 -406
- isa_model/inference/services/llm/triton_llm_service.py +0 -481
- isa_model/inference/services/stacked/__init__.py +0 -26
- isa_model/inference/services/stacked/config.py +0 -426
- isa_model/inference/services/vision/ollama_vision_service.py +0 -194
- /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
- /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
- /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
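Most of the churn in 0.3.6 is a reorganization of the inference services: the old `stacked` package is dissolved, image generation moves into a new `img` package, shared helpers move under `helpers/` subpackages, and `ui_analysis_service.py` now lives under `vision/`. As a rough orientation, imports against the new layout would look like the sketch below (module paths are read off the rename entries above; the exported names come from the import and class lines visible in the per-file diff that follows):

    # Sketch only: paths follow the 0.3.6 file layout listed above; whether these
    # names are also re-exported at a higher level (e.g. via isa_model.client) is not shown here.
    from isa_model.inference.services.vision.ui_analysis_service import UIAnalysisService
    from isa_model.inference.services.vision.helpers.base_stacked_service import (
        BaseStackedService,
        LayerConfig,
        LayerType,
        LayerResult,
    )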
isa_model/inference/services/{stacked → vision}/ui_analysis_service.py

@@ -11,7 +11,7 @@ from typing import Dict, Any, List, Optional, Union, BinaryIO
 import json
 from PIL import Image, ImageDraw, ImageFont
 
-from .base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult
+from .helpers.base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult
 
 class UIAnalysisService(BaseStackedService):
     """
@@ -434,358 +434,6 @@ Return analysis as JSON with this exact structure:
 
 Be precise and only include elements you can clearly see.'''
 
-    async def _invoke_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Invoke element detection using unified interface"""
-
-        # Adapt parameters based on page intelligence
-        complexity = intelligence.get('complexity_score', 0.5)
-
-        # Check if this is omniparser service
-        if hasattr(service, 'run_omniparser'):
-            # Use replicate omniparser
-            params = {}
-            if complexity > 0.7:
-                params.update({
-                    "box_threshold": 0.03,
-                    "iou_threshold": 0.1,
-                    "imgsz": 1024
-                })
-            else:
-                params.update({
-                    "box_threshold": parameters.get("box_threshold", 0.05),
-                    "iou_threshold": parameters.get("iou_threshold", 0.1),
-                    "imgsz": parameters.get("imgsz", 640)
-                })
-
-            result = await service.run_omniparser(image=image_path, **params)
-            # Filter for interactive elements only
-            elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
-            return elements
-        else:
-            # Use fallback generic detection
-            return await self._fallback_element_detection(service, image_path, intelligence)
-
-    async def _invoke_element_classification(self, service: Any, image_path: str, elements: List[Dict[str, Any]], intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Invoke element classification using unified interface"""
-
-        classified_elements = []
-        img = Image.open(image_path)
-
-        for i, element in enumerate(elements):
-            # Crop element region with padding
-            bbox = element['bbox']
-            x1, y1, x2, y2 = bbox
-
-            padding_x = max(20, int((x2 - x1) * 0.2))
-            padding_y = max(20, int((y2 - y1) * 0.2))
-
-            crop_x1 = max(0, x1 - padding_x)
-            crop_y1 = max(0, y1 - padding_y)
-            crop_x2 = min(img.width, x2 + padding_x)
-            crop_y2 = min(img.height, y2 + padding_y)
-
-            cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-            crop_path = f"temp_classify_{i}.png"
-            cropped_img.save(crop_path)
-
-            try:
-                context_info = f'''Page Context:
-- Type: {intelligence.get('page_type', 'unknown')}
-- Layout: {intelligence.get('layout_pattern', 'unknown')}
-- Language: {intelligence.get('language', 'en')}
-- Complexity: {intelligence.get('complexity_score', 0.5)}
-
-Element Context:
-- Position: {element.get('center', [0, 0])} (center)
-- Size: {element.get('size', [0, 0])} (width x height)
-- Element {i+1} of {len(elements)} total elements'''
-
-                prompt = f'''Classify this UI element from a login/authentication interface.
-
-{context_info}
-
-Classify as one of:
-- username_field: for username, email, user ID inputs
-- password_field: for password inputs
-- confirm_password: for password confirmation fields
-- login_button: for sign in/login buttons
-- register_button: for sign up/register buttons
-- submit_button: for general form submission
-- checkbox: for remember me, terms agreement
-- link: for forgot password, register links
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "username_field|password_field|login_button|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-                # Use unified invoke method
-                result = await service.invoke(
-                    image=crop_path,
-                    prompt=prompt,
-                    task="analyze",
-                    max_tokens=parameters.get("max_tokens", 300)
-                )
-
-                classification_data = self._parse_classification_result(result['text'])
-
-                # Use original detection center coordinates
-                classified_element = {
-                    **element,
-                    'classification': classification_data.get('classification', 'other'),
-                    'classification_confidence': classification_data.get('confidence', 0.5),
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': classification_data.get('reasoning', ''),
-                    'visual_evidence': classification_data.get('visual_evidence', []),
-                    'interaction_priority': classification_data.get('interaction_priority', 5),
-                    'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
-                }
-
-                classified_elements.append(classified_element)
-
-            except Exception as e:
-                # Keep element with basic classification
-                classified_element = {
-                    **element,
-                    'classification': 'other',
-                    'classification_confidence': 0.3,
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': f'Classification failed: {str(e)}',
-                    'visual_evidence': [],
-                    'interaction_priority': 5,
-                    'error': str(e)
-                }
-                classified_elements.append(classified_element)
-
-            finally:
-                # Cleanup temp file
-                try:
-                    import os
-                    os.remove(crop_path)
-                except:
-                    pass
-
-        return classified_elements
-
-    # ==================== LEGACY METHODS (for compatibility) ====================
-
-    async def _execute_page_intelligence(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Dict[str, Any]:
-        """Execute page intelligence analysis"""
-        image_path = context["input"]["image_path"]
-
-        prompt = '''Analyze this webpage screenshot to understand the login interface structure.
-
-Identify:
-1. Page type (login, register, multi-step auth, SSO)
-2. Layout pattern (vertical form, horizontal, modal, tabs)
-3. Language used in the interface
-4. Security features visible (CAPTCHA, 2FA indicators)
-5. Form complexity level
-6. Visible text elements that indicate field purposes
-
-Return analysis as JSON with this exact structure:
-{
-    "page_type": "login|register|multi_step|sso|other",
-    "layout_pattern": "vertical|horizontal|modal|tabs|embedded",
-    "language": "en|zh|es|fr|de|other",
-    "security_features": ["captcha", "recaptcha", "2fa_indicator", "security_questions"],
-    "complexity_score": 0.1-1.0,
-    "visible_text_elements": ["Login", "Password", "Sign In"],
-    "form_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
-    "confidence": 0.1-1.0,
-    "analysis_notes": "brief description of what you observe"
-}
-
-Be precise and only include elements you can clearly see.'''
-
-        result = await service.invoke(
-            image=image_path,
-            prompt=prompt,
-            task="analyze",
-            max_tokens=layer.parameters.get("max_tokens", 500)
-        )
-
-        # Parse JSON response
-        response_text = result['text'].strip()
-        json_start = response_text.find('{')
-        json_end = response_text.rfind('}') + 1
-
-        if json_start >= 0 and json_end > json_start:
-            json_text = response_text[json_start:json_end]
-            try:
-                intelligence_data = json.loads(json_text)
-            except json.JSONDecodeError:
-                # Fallback parsing
-                intelligence_data = self._parse_intelligence_fallback(response_text)
-        else:
-            intelligence_data = self._parse_intelligence_fallback(response_text)
-
-        return intelligence_data
-
-    async def _execute_element_detection(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Execute element detection"""
-        image_path = context["input"]["image_path"]
-        intelligence = context["results"]["page_intelligence"].data
-
-        # Adapt parameters based on page intelligence
-        complexity = intelligence.get('complexity_score', 0.5)
-        params = layer.parameters.copy()
-
-        if complexity > 0.7:
-            params.update({
-                "box_threshold": 0.03,
-                "iou_threshold": 0.1,
-                "imgsz": 1024
-            })
-        elif complexity > 0.4:
-            params.update({
-                "box_threshold": 0.05,
-                "iou_threshold": 0.1,
-                "imgsz": 640
-            })
-        else:
-            params.update({
-                "box_threshold": 0.08,
-                "iou_threshold": 0.2,
-                "imgsz": 640
-            })
-
-        # Run detection based on model type
-        if layer.model_name == "omniparser":
-            if hasattr(service, 'run_omniparser'):
-                result = await service.run_omniparser(
-                    image=image_path,
-                    **params
-                )
-                # Filter for interactive elements only
-                elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
-            else:
-                # Fallback for services without omniparser support
-                elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        elif layer.model_name == "florence-2":
-            if hasattr(service, 'run_florence2'):
-                result = await service.run_florence2(
-                    image=image_path,
-                    task="<OPEN_VOCABULARY_DETECTION>",
-                    text_input="login form elements, input fields, buttons"
-                )
-                elements = result.get("parsed_objects", [])
-            else:
-                elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        elif layer.model_name == "yolov8":
-            if hasattr(service, 'run_yolo'):
-                result = await service.run_yolo(
-                    image=image_path,
-                    confidence=params.get("box_threshold", 0.5)
-                )
-                elements = result.get("detected_objects", [])
-            else:
-                elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        else:
-            # Fallback to generic object detection
-            elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        return elements
-
-    async def _execute_element_classification(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Execute element classification"""
-        image_path = context["input"]["image_path"]
-        elements = context["results"]["element_detection"].data
-        intelligence = context["results"]["page_intelligence"].data
-
-        classified_elements = []
-        img = Image.open(image_path)
-
-        for i, element in enumerate(elements):
-            # Crop element region with padding
-            bbox = element['bbox']
-            x1, y1, x2, y2 = bbox
-
-            padding_x = max(20, int((x2 - x1) * 0.2))
-            padding_y = max(20, int((y2 - y1) * 0.2))
-
-            crop_x1 = max(0, x1 - padding_x)
-            crop_y1 = max(0, y1 - padding_y)
-            crop_x2 = min(img.width, x2 + padding_x)
-            crop_y2 = min(img.height, y2 + padding_y)
-
-            cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-            crop_path = f"temp_classify_{i}.png"
-            cropped_img.save(crop_path)
-
-            try:
-                context_info = f'''Page Context:
-- Type: {intelligence.get('page_type', 'unknown')}
-- Layout: {intelligence.get('layout_pattern', 'unknown')}
-- Language: {intelligence.get('language', 'en')}
-- Complexity: {intelligence.get('complexity_score', 0.5)}
-
-Element Context:
-- Position: {element.get('center', [0, 0])} (center)
-- Size: {element.get('size', [0, 0])} (width x height)
-- Element {i+1} of {len(elements)} total elements'''
-
-                # Get task-specific classification prompt
-                prompt = self._get_classification_prompt(
-                    parameters.get("task", "element_classification"),
-                    context_info
-                )
-
-                result = await service.invoke(
-                    image=crop_path,
-                    prompt=prompt,
-                    task="analyze",
-                    max_tokens=layer.parameters.get("max_tokens", 300)
-                )
-
-                classification_data = self._parse_classification_result(result['text'])
-
-                # Use original detection center coordinates
-                classified_element = {
-                    **element,
-                    'classification': classification_data.get('classification', 'other'),
-                    'classification_confidence': classification_data.get('confidence', 0.5),
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': classification_data.get('reasoning', ''),
-                    'visual_evidence': classification_data.get('visual_evidence', []),
-                    'interaction_priority': classification_data.get('interaction_priority', 5),
-                    'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
-                }
-
-                classified_elements.append(classified_element)
-
-            except Exception as e:
-                # Keep element with basic classification
-                classified_element = {
-                    **element,
-                    'classification': 'other',
-                    'classification_confidence': 0.3,
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': f'Classification failed: {str(e)}',
-                    'visual_evidence': [],
-                    'interaction_priority': 5,
-                    'error': str(e)
-                }
-                classified_elements.append(classified_element)
-
-            finally:
-                # Cleanup temp file
-                try:
-                    import os
-                    os.remove(crop_path)
-                except:
-                    pass
-
-        return classified_elements
-
     async def execute_fallback(self, layer: LayerConfig, context: Dict[str, Any], error: str) -> Optional[Any]:
         """Execute fallback logic for failed layers"""
 
@@ -980,113 +628,6 @@ Element Context:
                 'interaction_priority': 5
             }
 
-    def _get_classification_prompt(self, task: str, context_info: str) -> str:
-        """Get task-specific classification prompt"""
-
-        if task == "search_element_classification":
-            return f'''Classify this UI element from a search interface.
-
-{context_info}
-
-Classify as one of:
-- search_field: for search input boxes, query fields
-- search_button: for search/go buttons, submit search
-- search_suggestion: for autocomplete suggestions
-- filter: for search filters, sorting options
-- voice_search: for voice search buttons
-- image_search: for image search options
-- advanced_search: for advanced search links/options
-- nav_link: for navigation menu items
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "search_field|search_button|filter|nav_link|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-        elif task == "content_element_classification":
-            return f'''Classify this UI element from a content page.
-
-{context_info}
-
-Classify as one of:
-- article_title: for main article/page titles
-- article_body: for main content/article text
-- sidebar_content: for sidebar information
-- navigation_menu: for navigation elements
-- related_links: for related articles/links
-- comment: for comment sections
-- share_button: for social sharing
-- read_more: for read more links
-- image: for content images
-- video: for embedded videos
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "article_title|article_body|sidebar_content|navigation_menu|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-        elif task == "navigation_element_classification":
-            return f'''Classify this UI element from a navigation-focused page.
-
-{context_info}
-
-Classify as one of:
-- nav_link: for main navigation links
-- menu_item: for menu items and categories
-- breadcrumb: for breadcrumb navigation
-- dropdown_menu: for dropdown menu elements
-- footer_link: for footer navigation
-- logo: for site logos/branding
-- search_box: for site search functionality
-- user_menu: for user account menus
-- cta_button: for call-to-action buttons
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "nav_link|menu_item|breadcrumb|dropdown_menu|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-        else:
-            # Default login classification
-            return f'''Classify this UI element from a login/authentication interface.
-
-{context_info}
-
-Classify as one of:
-- username_field: for username, email, user ID inputs
-- password_field: for password inputs
-- confirm_password: for password confirmation fields
-- login_button: for sign in/login buttons
-- register_button: for sign up/register buttons
-- submit_button: for general form submission
-- checkbox: for remember me, terms agreement
-- link: for forgot password, register links
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "username_field|password_field|login_button|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
     def _build_action_planning_prompt(self, task_type: str, elements_summary: List[Dict], interactive_elements: List[Dict]) -> str:
         """构建行动规划提示词"""
 
@@ -1279,41 +820,4 @@ Response format:
                 'confidence': 0.4,
                 'type': 'form'
             }
-        ]
-
-    async def _fallback_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Fallback element detection using generic methods"""
-        try:
-            # Try generic object detection
-            result = await service.detect_objects(image_path, confidence_threshold=0.5)
-            objects = result.get("objects", [])
-
-            # Convert to standard format
-            elements = []
-            for i, obj in enumerate(objects):
-                coords = obj.get("coordinates", {})
-                if all(k in coords for k in ['x', 'y', 'width', 'height']):
-                    # Convert percentage to pixels
-                    img = Image.open(image_path)
-                    img_width, img_height = img.size
-
-                    x = int(coords['x'] * img_width / 100)
-                    y = int(coords['y'] * img_height / 100)
-                    w = int(coords['width'] * img_width / 100)
-                    h = int(coords['height'] * img_height / 100)
-
-                    element = {
-                        'id': f'fallback_{i}',
-                        'bbox': [x, y, x + w, y + h],
-                        'center': [x + w//2, y + h//2],
-                        'size': [w, h],
-                        'confidence': obj.get('confidence', 0.7),
-                        'type': 'detected'
-                    }
-                    elements.append(element)
-
-            return elements
-
-        except Exception:
-            # Ultimate fallback
-            return self._create_fallback_elements(image_path, intelligence)
+        ]