isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (87)
  1. isa_model/__init__.py +30 -1
  2. isa_model/client.py +770 -0
  3. isa_model/core/config/__init__.py +16 -0
  4. isa_model/core/config/config_manager.py +514 -0
  5. isa_model/core/config.py +426 -0
  6. isa_model/core/models/model_billing_tracker.py +476 -0
  7. isa_model/core/models/model_manager.py +399 -0
  8. isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
  9. isa_model/core/pricing_manager.py +426 -0
  10. isa_model/core/services/__init__.py +19 -0
  11. isa_model/core/services/intelligent_model_selector.py +547 -0
  12. isa_model/core/types.py +291 -0
  13. isa_model/deployment/__init__.py +2 -0
  14. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
  15. isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
  16. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
  17. isa_model/deployment/cloud/modal/register_models.py +321 -0
  18. isa_model/deployment/runtime/deployed_service.py +338 -0
  19. isa_model/deployment/services/__init__.py +9 -0
  20. isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
  21. isa_model/deployment/services/model_service.py +332 -0
  22. isa_model/deployment/services/service_monitor.py +356 -0
  23. isa_model/deployment/services/service_registry.py +527 -0
  24. isa_model/eval/__init__.py +80 -44
  25. isa_model/eval/config/__init__.py +10 -0
  26. isa_model/eval/config/evaluation_config.py +108 -0
  27. isa_model/eval/evaluators/__init__.py +18 -0
  28. isa_model/eval/evaluators/base_evaluator.py +503 -0
  29. isa_model/eval/evaluators/llm_evaluator.py +472 -0
  30. isa_model/eval/factory.py +417 -709
  31. isa_model/eval/infrastructure/__init__.py +24 -0
  32. isa_model/eval/infrastructure/experiment_tracker.py +466 -0
  33. isa_model/eval/metrics.py +191 -21
  34. isa_model/inference/ai_factory.py +181 -605
  35. isa_model/inference/services/audio/base_stt_service.py +65 -1
  36. isa_model/inference/services/audio/base_tts_service.py +75 -1
  37. isa_model/inference/services/audio/openai_stt_service.py +189 -151
  38. isa_model/inference/services/audio/openai_tts_service.py +12 -10
  39. isa_model/inference/services/audio/replicate_tts_service.py +61 -56
  40. isa_model/inference/services/base_service.py +55 -17
  41. isa_model/inference/services/embedding/base_embed_service.py +65 -1
  42. isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
  43. isa_model/inference/services/embedding/openai_embed_service.py +8 -10
  44. isa_model/inference/services/helpers/stacked_config.py +148 -0
  45. isa_model/inference/services/img/__init__.py +18 -0
  46. isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
  47. isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
  48. isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
  49. isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
  50. isa_model/inference/services/llm/__init__.py +3 -3
  51. isa_model/inference/services/llm/base_llm_service.py +492 -40
  52. isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
  53. isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
  54. isa_model/inference/services/llm/ollama_llm_service.py +51 -17
  55. isa_model/inference/services/llm/openai_llm_service.py +70 -19
  56. isa_model/inference/services/llm/yyds_llm_service.py +24 -23
  57. isa_model/inference/services/vision/__init__.py +38 -4
  58. isa_model/inference/services/vision/base_vision_service.py +218 -117
  59. isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
  60. isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
  61. isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
  62. isa_model/inference/services/vision/helpers/image_utils.py +272 -3
  63. isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
  64. isa_model/inference/services/vision/openai_vision_service.py +104 -307
  65. isa_model/inference/services/vision/replicate_vision_service.py +140 -325
  66. isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
  67. isa_model/scripts/register_models.py +370 -0
  68. isa_model/scripts/register_models_with_embeddings.py +510 -0
  69. isa_model/serving/api/fastapi_server.py +6 -1
  70. isa_model/serving/api/routes/unified.py +202 -0
  71. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
  72. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
  73. isa_model/config/__init__.py +0 -9
  74. isa_model/config/config_manager.py +0 -213
  75. isa_model/core/model_manager.py +0 -213
  76. isa_model/core/model_registry.py +0 -375
  77. isa_model/core/vision_models_init.py +0 -116
  78. isa_model/inference/billing_tracker.py +0 -406
  79. isa_model/inference/services/llm/triton_llm_service.py +0 -481
  80. isa_model/inference/services/stacked/__init__.py +0 -26
  81. isa_model/inference/services/stacked/config.py +0 -426
  82. isa_model/inference/services/vision/ollama_vision_service.py +0 -194
  83. /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
  84. /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
  85. /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
  86. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
  87. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
@@ -11,7 +11,7 @@ from typing import Dict, Any, List, Optional, Union, BinaryIO
  import json
  from PIL import Image, ImageDraw, ImageFont

- from .base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult
+ from .helpers.base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult

  class UIAnalysisService(BaseStackedService):
      """
@@ -434,358 +434,6 @@ Return analysis as JSON with this exact structure:

  Be precise and only include elements you can clearly see.'''

-     async def _invoke_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
-         """Invoke element detection using unified interface"""
-
-         # Adapt parameters based on page intelligence
-         complexity = intelligence.get('complexity_score', 0.5)
-
-         # Check if this is omniparser service
-         if hasattr(service, 'run_omniparser'):
-             # Use replicate omniparser
-             params = {}
-             if complexity > 0.7:
-                 params.update({
-                     "box_threshold": 0.03,
-                     "iou_threshold": 0.1,
-                     "imgsz": 1024
-                 })
-             else:
-                 params.update({
-                     "box_threshold": parameters.get("box_threshold", 0.05),
-                     "iou_threshold": parameters.get("iou_threshold", 0.1),
-                     "imgsz": parameters.get("imgsz", 640)
-                 })
-
-             result = await service.run_omniparser(image=image_path, **params)
-             # Filter for interactive elements only
-             elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
-             return elements
-         else:
-             # Use fallback generic detection
-             return await self._fallback_element_detection(service, image_path, intelligence)
-
-     async def _invoke_element_classification(self, service: Any, image_path: str, elements: List[Dict[str, Any]], intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
-         """Invoke element classification using unified interface"""
-
-         classified_elements = []
-         img = Image.open(image_path)
-
-         for i, element in enumerate(elements):
-             # Crop element region with padding
-             bbox = element['bbox']
-             x1, y1, x2, y2 = bbox
-
-             padding_x = max(20, int((x2 - x1) * 0.2))
-             padding_y = max(20, int((y2 - y1) * 0.2))
-
-             crop_x1 = max(0, x1 - padding_x)
-             crop_y1 = max(0, y1 - padding_y)
-             crop_x2 = min(img.width, x2 + padding_x)
-             crop_y2 = min(img.height, y2 + padding_y)
-
-             cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-             crop_path = f"temp_classify_{i}.png"
-             cropped_img.save(crop_path)
-
-             try:
-                 context_info = f'''Page Context:
- - Type: {intelligence.get('page_type', 'unknown')}
- - Layout: {intelligence.get('layout_pattern', 'unknown')}
- - Language: {intelligence.get('language', 'en')}
- - Complexity: {intelligence.get('complexity_score', 0.5)}
-
- Element Context:
- - Position: {element.get('center', [0, 0])} (center)
- - Size: {element.get('size', [0, 0])} (width x height)
- - Element {i+1} of {len(elements)} total elements'''
-
-                 prompt = f'''Classify this UI element from a login/authentication interface.
-
- {context_info}
-
- Classify as one of:
- - username_field: for username, email, user ID inputs
- - password_field: for password inputs
- - confirm_password: for password confirmation fields
- - login_button: for sign in/login buttons
- - register_button: for sign up/register buttons
- - submit_button: for general form submission
- - checkbox: for remember me, terms agreement
- - link: for forgot password, register links
- - other: for unrelated elements
-
- Response format:
- {{
- "classification": "username_field|password_field|login_button|other",
- "confidence": 0.1-1.0,
- "reasoning": "brief explanation of classification decision",
- "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
- "interaction_priority": 1-10
- }}'''
-
-                 # Use unified invoke method
-                 result = await service.invoke(
-                     image=crop_path,
-                     prompt=prompt,
-                     task="analyze",
-                     max_tokens=parameters.get("max_tokens", 300)
-                 )
-
-                 classification_data = self._parse_classification_result(result['text'])
-
-                 # Use original detection center coordinates
-                 classified_element = {
-                     **element,
-                     'classification': classification_data.get('classification', 'other'),
-                     'classification_confidence': classification_data.get('confidence', 0.5),
-                     'precise_center': element.get('center', [0, 0]),
-                     'reasoning': classification_data.get('reasoning', ''),
-                     'visual_evidence': classification_data.get('visual_evidence', []),
-                     'interaction_priority': classification_data.get('interaction_priority', 5),
-                     'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
-                 }
-
-                 classified_elements.append(classified_element)
-
-             except Exception as e:
-                 # Keep element with basic classification
-                 classified_element = {
-                     **element,
-                     'classification': 'other',
-                     'classification_confidence': 0.3,
-                     'precise_center': element.get('center', [0, 0]),
-                     'reasoning': f'Classification failed: {str(e)}',
-                     'visual_evidence': [],
-                     'interaction_priority': 5,
-                     'error': str(e)
-                 }
-                 classified_elements.append(classified_element)
-
-             finally:
-                 # Cleanup temp file
-                 try:
-                     import os
-                     os.remove(crop_path)
-                 except:
-                     pass
-
-         return classified_elements
-
-     # ==================== LEGACY METHODS (for compatibility) ====================
-
-     async def _execute_page_intelligence(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Dict[str, Any]:
-         """Execute page intelligence analysis"""
-         image_path = context["input"]["image_path"]
-
-         prompt = '''Analyze this webpage screenshot to understand the login interface structure.
-
- Identify:
- 1. Page type (login, register, multi-step auth, SSO)
- 2. Layout pattern (vertical form, horizontal, modal, tabs)
- 3. Language used in the interface
- 4. Security features visible (CAPTCHA, 2FA indicators)
- 5. Form complexity level
- 6. Visible text elements that indicate field purposes
-
- Return analysis as JSON with this exact structure:
- {
- "page_type": "login|register|multi_step|sso|other",
- "layout_pattern": "vertical|horizontal|modal|tabs|embedded",
- "language": "en|zh|es|fr|de|other",
- "security_features": ["captcha", "recaptcha", "2fa_indicator", "security_questions"],
- "complexity_score": 0.1-1.0,
- "visible_text_elements": ["Login", "Password", "Sign In"],
- "form_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
- "confidence": 0.1-1.0,
- "analysis_notes": "brief description of what you observe"
- }
-
- Be precise and only include elements you can clearly see.'''
-
-         result = await service.invoke(
-             image=image_path,
-             prompt=prompt,
-             task="analyze",
-             max_tokens=layer.parameters.get("max_tokens", 500)
-         )
-
-         # Parse JSON response
-         response_text = result['text'].strip()
-         json_start = response_text.find('{')
-         json_end = response_text.rfind('}') + 1
-
-         if json_start >= 0 and json_end > json_start:
-             json_text = response_text[json_start:json_end]
-             try:
-                 intelligence_data = json.loads(json_text)
-             except json.JSONDecodeError:
-                 # Fallback parsing
-                 intelligence_data = self._parse_intelligence_fallback(response_text)
-         else:
-             intelligence_data = self._parse_intelligence_fallback(response_text)
-
-         return intelligence_data
-
-     async def _execute_element_detection(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
-         """Execute element detection"""
-         image_path = context["input"]["image_path"]
-         intelligence = context["results"]["page_intelligence"].data
-
-         # Adapt parameters based on page intelligence
-         complexity = intelligence.get('complexity_score', 0.5)
-         params = layer.parameters.copy()
-
-         if complexity > 0.7:
-             params.update({
-                 "box_threshold": 0.03,
-                 "iou_threshold": 0.1,
-                 "imgsz": 1024
-             })
-         elif complexity > 0.4:
-             params.update({
-                 "box_threshold": 0.05,
-                 "iou_threshold": 0.1,
-                 "imgsz": 640
-             })
-         else:
-             params.update({
-                 "box_threshold": 0.08,
-                 "iou_threshold": 0.2,
-                 "imgsz": 640
-             })
-
-         # Run detection based on model type
-         if layer.model_name == "omniparser":
-             if hasattr(service, 'run_omniparser'):
-                 result = await service.run_omniparser(
-                     image=image_path,
-                     **params
-                 )
-                 # Filter for interactive elements only
-                 elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
-             else:
-                 # Fallback for services without omniparser support
-                 elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-         elif layer.model_name == "florence-2":
-             if hasattr(service, 'run_florence2'):
-                 result = await service.run_florence2(
-                     image=image_path,
-                     task="<OPEN_VOCABULARY_DETECTION>",
-                     text_input="login form elements, input fields, buttons"
-                 )
-                 elements = result.get("parsed_objects", [])
-             else:
-                 elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-         elif layer.model_name == "yolov8":
-             if hasattr(service, 'run_yolo'):
-                 result = await service.run_yolo(
-                     image=image_path,
-                     confidence=params.get("box_threshold", 0.5)
-                 )
-                 elements = result.get("detected_objects", [])
-             else:
-                 elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-         else:
-             # Fallback to generic object detection
-             elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-         return elements
-
-     async def _execute_element_classification(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
-         """Execute element classification"""
-         image_path = context["input"]["image_path"]
-         elements = context["results"]["element_detection"].data
-         intelligence = context["results"]["page_intelligence"].data
-
-         classified_elements = []
-         img = Image.open(image_path)
-
-         for i, element in enumerate(elements):
-             # Crop element region with padding
-             bbox = element['bbox']
-             x1, y1, x2, y2 = bbox
-
-             padding_x = max(20, int((x2 - x1) * 0.2))
-             padding_y = max(20, int((y2 - y1) * 0.2))
-
-             crop_x1 = max(0, x1 - padding_x)
-             crop_y1 = max(0, y1 - padding_y)
-             crop_x2 = min(img.width, x2 + padding_x)
-             crop_y2 = min(img.height, y2 + padding_y)
-
-             cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-             crop_path = f"temp_classify_{i}.png"
-             cropped_img.save(crop_path)
-
-             try:
-                 context_info = f'''Page Context:
- - Type: {intelligence.get('page_type', 'unknown')}
- - Layout: {intelligence.get('layout_pattern', 'unknown')}
- - Language: {intelligence.get('language', 'en')}
- - Complexity: {intelligence.get('complexity_score', 0.5)}
-
- Element Context:
- - Position: {element.get('center', [0, 0])} (center)
- - Size: {element.get('size', [0, 0])} (width x height)
- - Element {i+1} of {len(elements)} total elements'''
-
-                 # Get task-specific classification prompt
-                 prompt = self._get_classification_prompt(
-                     parameters.get("task", "element_classification"),
-                     context_info
-                 )
-
-                 result = await service.invoke(
-                     image=crop_path,
-                     prompt=prompt,
-                     task="analyze",
-                     max_tokens=layer.parameters.get("max_tokens", 300)
-                 )
-
-                 classification_data = self._parse_classification_result(result['text'])
-
-                 # Use original detection center coordinates
-                 classified_element = {
-                     **element,
-                     'classification': classification_data.get('classification', 'other'),
-                     'classification_confidence': classification_data.get('confidence', 0.5),
-                     'precise_center': element.get('center', [0, 0]),
-                     'reasoning': classification_data.get('reasoning', ''),
-                     'visual_evidence': classification_data.get('visual_evidence', []),
-                     'interaction_priority': classification_data.get('interaction_priority', 5),
-                     'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
-                 }
-
-                 classified_elements.append(classified_element)
-
-             except Exception as e:
-                 # Keep element with basic classification
-                 classified_element = {
-                     **element,
-                     'classification': 'other',
-                     'classification_confidence': 0.3,
-                     'precise_center': element.get('center', [0, 0]),
-                     'reasoning': f'Classification failed: {str(e)}',
-                     'visual_evidence': [],
-                     'interaction_priority': 5,
-                     'error': str(e)
-                 }
-                 classified_elements.append(classified_element)
-
-             finally:
-                 # Cleanup temp file
-                 try:
-                     import os
-                     os.remove(crop_path)
-                 except:
-                     pass
-
-         return classified_elements
-
      async def execute_fallback(self, layer: LayerConfig, context: Dict[str, Any], error: str) -> Optional[Any]:
          """Execute fallback logic for failed layers"""

@@ -980,113 +628,6 @@ Element Context:
              'interaction_priority': 5
          }

-     def _get_classification_prompt(self, task: str, context_info: str) -> str:
-         """Get task-specific classification prompt"""
-
-         if task == "search_element_classification":
-             return f'''Classify this UI element from a search interface.
-
- {context_info}
-
- Classify as one of:
- - search_field: for search input boxes, query fields
- - search_button: for search/go buttons, submit search
- - search_suggestion: for autocomplete suggestions
- - filter: for search filters, sorting options
- - voice_search: for voice search buttons
- - image_search: for image search options
- - advanced_search: for advanced search links/options
- - nav_link: for navigation menu items
- - other: for unrelated elements
-
- Response format:
- {{
- "classification": "search_field|search_button|filter|nav_link|other",
- "confidence": 0.1-1.0,
- "reasoning": "brief explanation of classification decision",
- "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
- "interaction_priority": 1-10
- }}'''
-
-         elif task == "content_element_classification":
-             return f'''Classify this UI element from a content page.
-
- {context_info}
-
- Classify as one of:
- - article_title: for main article/page titles
- - article_body: for main content/article text
- - sidebar_content: for sidebar information
- - navigation_menu: for navigation elements
- - related_links: for related articles/links
- - comment: for comment sections
- - share_button: for social sharing
- - read_more: for read more links
- - image: for content images
- - video: for embedded videos
- - other: for unrelated elements
-
- Response format:
- {{
- "classification": "article_title|article_body|sidebar_content|navigation_menu|other",
- "confidence": 0.1-1.0,
- "reasoning": "brief explanation of classification decision",
- "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
- "interaction_priority": 1-10
- }}'''
-
-         elif task == "navigation_element_classification":
-             return f'''Classify this UI element from a navigation-focused page.
-
- {context_info}
-
- Classify as one of:
- - nav_link: for main navigation links
- - menu_item: for menu items and categories
- - breadcrumb: for breadcrumb navigation
- - dropdown_menu: for dropdown menu elements
- - footer_link: for footer navigation
- - logo: for site logos/branding
- - search_box: for site search functionality
- - user_menu: for user account menus
- - cta_button: for call-to-action buttons
- - other: for unrelated elements
-
- Response format:
- {{
- "classification": "nav_link|menu_item|breadcrumb|dropdown_menu|other",
- "confidence": 0.1-1.0,
- "reasoning": "brief explanation of classification decision",
- "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
- "interaction_priority": 1-10
- }}'''
-
-         else:
-             # Default login classification
-             return f'''Classify this UI element from a login/authentication interface.
-
- {context_info}
-
- Classify as one of:
- - username_field: for username, email, user ID inputs
- - password_field: for password inputs
- - confirm_password: for password confirmation fields
- - login_button: for sign in/login buttons
- - register_button: for sign up/register buttons
- - submit_button: for general form submission
- - checkbox: for remember me, terms agreement
- - link: for forgot password, register links
- - other: for unrelated elements
-
- Response format:
- {{
- "classification": "username_field|password_field|login_button|other",
- "confidence": 0.1-1.0,
- "reasoning": "brief explanation of classification decision",
- "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
- "interaction_priority": 1-10
- }}'''
-
      def _build_action_planning_prompt(self, task_type: str, elements_summary: List[Dict], interactive_elements: List[Dict]) -> str:
          """Build the action planning prompt"""

@@ -1279,41 +820,4 @@ Response format:
                  'confidence': 0.4,
                  'type': 'form'
              }
-         ]
-
-     async def _fallback_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any]) -> List[Dict[str, Any]]:
-         """Fallback element detection using generic methods"""
-         try:
-             # Try generic object detection
-             result = await service.detect_objects(image_path, confidence_threshold=0.5)
-             objects = result.get("objects", [])
-
-             # Convert to standard format
-             elements = []
-             for i, obj in enumerate(objects):
-                 coords = obj.get("coordinates", {})
-                 if all(k in coords for k in ['x', 'y', 'width', 'height']):
-                     # Convert percentage to pixels
-                     img = Image.open(image_path)
-                     img_width, img_height = img.size
-
-                     x = int(coords['x'] * img_width / 100)
-                     y = int(coords['y'] * img_height / 100)
-                     w = int(coords['width'] * img_width / 100)
-                     h = int(coords['height'] * img_height / 100)
-
-                     element = {
-                         'id': f'fallback_{i}',
-                         'bbox': [x, y, x + w, y + h],
-                         'center': [x + w//2, y + h//2],
-                         'size': [w, h],
-                         'confidence': obj.get('confidence', 0.7),
-                         'type': 'detected'
-                     }
-                     elements.append(element)
-
-             return elements
-
-         except Exception:
-             # Ultimate fallback
-             return self._create_fallback_elements(image_path, intelligence)
+         ]