isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- isa_model/__init__.py +30 -1
- isa_model/client.py +770 -0
- isa_model/core/config/__init__.py +16 -0
- isa_model/core/config/config_manager.py +514 -0
- isa_model/core/config.py +426 -0
- isa_model/core/models/model_billing_tracker.py +476 -0
- isa_model/core/models/model_manager.py +399 -0
- isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
- isa_model/core/pricing_manager.py +426 -0
- isa_model/core/services/__init__.py +19 -0
- isa_model/core/services/intelligent_model_selector.py +547 -0
- isa_model/core/types.py +291 -0
- isa_model/deployment/__init__.py +2 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
- isa_model/deployment/cloud/modal/register_models.py +321 -0
- isa_model/deployment/runtime/deployed_service.py +338 -0
- isa_model/deployment/services/__init__.py +9 -0
- isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
- isa_model/deployment/services/model_service.py +332 -0
- isa_model/deployment/services/service_monitor.py +356 -0
- isa_model/deployment/services/service_registry.py +527 -0
- isa_model/eval/__init__.py +80 -44
- isa_model/eval/config/__init__.py +10 -0
- isa_model/eval/config/evaluation_config.py +108 -0
- isa_model/eval/evaluators/__init__.py +18 -0
- isa_model/eval/evaluators/base_evaluator.py +503 -0
- isa_model/eval/evaluators/llm_evaluator.py +472 -0
- isa_model/eval/factory.py +417 -709
- isa_model/eval/infrastructure/__init__.py +24 -0
- isa_model/eval/infrastructure/experiment_tracker.py +466 -0
- isa_model/eval/metrics.py +191 -21
- isa_model/inference/ai_factory.py +181 -605
- isa_model/inference/services/audio/base_stt_service.py +65 -1
- isa_model/inference/services/audio/base_tts_service.py +75 -1
- isa_model/inference/services/audio/openai_stt_service.py +189 -151
- isa_model/inference/services/audio/openai_tts_service.py +12 -10
- isa_model/inference/services/audio/replicate_tts_service.py +61 -56
- isa_model/inference/services/base_service.py +55 -17
- isa_model/inference/services/embedding/base_embed_service.py +65 -1
- isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
- isa_model/inference/services/embedding/openai_embed_service.py +8 -10
- isa_model/inference/services/helpers/stacked_config.py +148 -0
- isa_model/inference/services/img/__init__.py +18 -0
- isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
- isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
- isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
- isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
- isa_model/inference/services/llm/__init__.py +3 -3
- isa_model/inference/services/llm/base_llm_service.py +492 -40
- isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
- isa_model/inference/services/llm/ollama_llm_service.py +51 -17
- isa_model/inference/services/llm/openai_llm_service.py +70 -19
- isa_model/inference/services/llm/yyds_llm_service.py +24 -23
- isa_model/inference/services/vision/__init__.py +38 -4
- isa_model/inference/services/vision/base_vision_service.py +218 -117
- isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
- isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
- isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/vision/helpers/image_utils.py +272 -3
- isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
- isa_model/inference/services/vision/openai_vision_service.py +104 -307
- isa_model/inference/services/vision/replicate_vision_service.py +140 -325
- isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
- isa_model/scripts/register_models.py +370 -0
- isa_model/scripts/register_models_with_embeddings.py +510 -0
- isa_model/serving/api/fastapi_server.py +6 -1
- isa_model/serving/api/routes/unified.py +202 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
- isa_model/config/__init__.py +0 -9
- isa_model/config/config_manager.py +0 -213
- isa_model/core/model_manager.py +0 -213
- isa_model/core/model_registry.py +0 -375
- isa_model/core/vision_models_init.py +0 -116
- isa_model/inference/billing_tracker.py +0 -406
- isa_model/inference/services/llm/triton_llm_service.py +0 -481
- isa_model/inference/services/stacked/__init__.py +0 -26
- isa_model/inference/services/stacked/config.py +0 -426
- isa_model/inference/services/vision/ollama_vision_service.py +0 -194
- /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
- /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
- /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
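Most of the churn in 0.3.6 is a reorganization of the inference services: the old `stacked` package is dissolved, image generation moves into a new `img` package, shared helpers move under `helpers/` subpackages, and `ui_analysis_service.py` now lives under `vision/`. As a rough orientation, imports against the new layout would look like the sketch below (module paths are read off the rename entries above; the exported names come from the import and class lines visible in the per-file diff that follows):

    # Sketch only: paths follow the 0.3.6 file layout listed above; whether these
    # names are also re-exported at a higher level (e.g. via isa_model.client) is not shown here.
    from isa_model.inference.services.vision.ui_analysis_service import UIAnalysisService
    from isa_model.inference.services.vision.helpers.base_stacked_service import (
        BaseStackedService,
        LayerConfig,
        LayerType,
        LayerResult,
    )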
isa_model/inference/services/{stacked → vision}/ui_analysis_service.py

@@ -11,7 +11,7 @@ from typing import Dict, Any, List, Optional, Union, BinaryIO
 import json
 from PIL import Image, ImageDraw, ImageFont
 
-from .base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult
+from .helpers.base_stacked_service import BaseStackedService, LayerConfig, LayerType, LayerResult
 
 class UIAnalysisService(BaseStackedService):
     """
@@ -434,358 +434,6 @@ Return analysis as JSON with this exact structure:
 
 Be precise and only include elements you can clearly see.'''
 
-    async def _invoke_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Invoke element detection using unified interface"""
-
-        # Adapt parameters based on page intelligence
-        complexity = intelligence.get('complexity_score', 0.5)
-
-        # Check if this is omniparser service
-        if hasattr(service, 'run_omniparser'):
-            # Use replicate omniparser
-            params = {}
-            if complexity > 0.7:
-                params.update({
-                    "box_threshold": 0.03,
-                    "iou_threshold": 0.1,
-                    "imgsz": 1024
-                })
-            else:
-                params.update({
-                    "box_threshold": parameters.get("box_threshold", 0.05),
-                    "iou_threshold": parameters.get("iou_threshold", 0.1),
-                    "imgsz": parameters.get("imgsz", 640)
-                })
-
-            result = await service.run_omniparser(image=image_path, **params)
-            # Filter for interactive elements only
-            elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
-            return elements
-        else:
-            # Use fallback generic detection
-            return await self._fallback_element_detection(service, image_path, intelligence)
-
-    async def _invoke_element_classification(self, service: Any, image_path: str, elements: List[Dict[str, Any]], intelligence: Dict[str, Any], parameters: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Invoke element classification using unified interface"""
-
-        classified_elements = []
-        img = Image.open(image_path)
-
-        for i, element in enumerate(elements):
-            # Crop element region with padding
-            bbox = element['bbox']
-            x1, y1, x2, y2 = bbox
-
-            padding_x = max(20, int((x2 - x1) * 0.2))
-            padding_y = max(20, int((y2 - y1) * 0.2))
-
-            crop_x1 = max(0, x1 - padding_x)
-            crop_y1 = max(0, y1 - padding_y)
-            crop_x2 = min(img.width, x2 + padding_x)
-            crop_y2 = min(img.height, y2 + padding_y)
-
-            cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-            crop_path = f"temp_classify_{i}.png"
-            cropped_img.save(crop_path)
-
-            try:
-                context_info = f'''Page Context:
-- Type: {intelligence.get('page_type', 'unknown')}
-- Layout: {intelligence.get('layout_pattern', 'unknown')}
-- Language: {intelligence.get('language', 'en')}
-- Complexity: {intelligence.get('complexity_score', 0.5)}
-
-Element Context:
-- Position: {element.get('center', [0, 0])} (center)
-- Size: {element.get('size', [0, 0])} (width x height)
-- Element {i+1} of {len(elements)} total elements'''
-
-                prompt = f'''Classify this UI element from a login/authentication interface.
-
-{context_info}
-
-Classify as one of:
-- username_field: for username, email, user ID inputs
-- password_field: for password inputs
-- confirm_password: for password confirmation fields
-- login_button: for sign in/login buttons
-- register_button: for sign up/register buttons
-- submit_button: for general form submission
-- checkbox: for remember me, terms agreement
-- link: for forgot password, register links
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "username_field|password_field|login_button|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-                # Use unified invoke method
-                result = await service.invoke(
-                    image=crop_path,
-                    prompt=prompt,
-                    task="analyze",
-                    max_tokens=parameters.get("max_tokens", 300)
-                )
-
-                classification_data = self._parse_classification_result(result['text'])
-
-                # Use original detection center coordinates
-                classified_element = {
-                    **element,
-                    'classification': classification_data.get('classification', 'other'),
-                    'classification_confidence': classification_data.get('confidence', 0.5),
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': classification_data.get('reasoning', ''),
-                    'visual_evidence': classification_data.get('visual_evidence', []),
-                    'interaction_priority': classification_data.get('interaction_priority', 5),
-                    'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
-                }
-
-                classified_elements.append(classified_element)
-
-            except Exception as e:
-                # Keep element with basic classification
-                classified_element = {
-                    **element,
-                    'classification': 'other',
-                    'classification_confidence': 0.3,
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': f'Classification failed: {str(e)}',
-                    'visual_evidence': [],
-                    'interaction_priority': 5,
-                    'error': str(e)
-                }
-                classified_elements.append(classified_element)
-
-            finally:
-                # Cleanup temp file
-                try:
-                    import os
-                    os.remove(crop_path)
-                except:
-                    pass
-
-        return classified_elements
-
-    # ==================== LEGACY METHODS (for compatibility) ====================
-
-    async def _execute_page_intelligence(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> Dict[str, Any]:
-        """Execute page intelligence analysis"""
-        image_path = context["input"]["image_path"]
-
-        prompt = '''Analyze this webpage screenshot to understand the login interface structure.
-
-Identify:
-1. Page type (login, register, multi-step auth, SSO)
-2. Layout pattern (vertical form, horizontal, modal, tabs)
-3. Language used in the interface
-4. Security features visible (CAPTCHA, 2FA indicators)
-5. Form complexity level
-6. Visible text elements that indicate field purposes
-
-Return analysis as JSON with this exact structure:
-{
-    "page_type": "login|register|multi_step|sso|other",
-    "layout_pattern": "vertical|horizontal|modal|tabs|embedded",
-    "language": "en|zh|es|fr|de|other",
-    "security_features": ["captcha", "recaptcha", "2fa_indicator", "security_questions"],
-    "complexity_score": 0.1-1.0,
-    "visible_text_elements": ["Login", "Password", "Sign In"],
-    "form_area_estimate": {"x": 0, "y": 0, "width": 0, "height": 0},
-    "confidence": 0.1-1.0,
-    "analysis_notes": "brief description of what you observe"
-}
-
-Be precise and only include elements you can clearly see.'''
-
-        result = await service.invoke(
-            image=image_path,
-            prompt=prompt,
-            task="analyze",
-            max_tokens=layer.parameters.get("max_tokens", 500)
-        )
-
-        # Parse JSON response
-        response_text = result['text'].strip()
-        json_start = response_text.find('{')
-        json_end = response_text.rfind('}') + 1
-
-        if json_start >= 0 and json_end > json_start:
-            json_text = response_text[json_start:json_end]
-            try:
-                intelligence_data = json.loads(json_text)
-            except json.JSONDecodeError:
-                # Fallback parsing
-                intelligence_data = self._parse_intelligence_fallback(response_text)
-        else:
-            intelligence_data = self._parse_intelligence_fallback(response_text)
-
-        return intelligence_data
-
-    async def _execute_element_detection(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Execute element detection"""
-        image_path = context["input"]["image_path"]
-        intelligence = context["results"]["page_intelligence"].data
-
-        # Adapt parameters based on page intelligence
-        complexity = intelligence.get('complexity_score', 0.5)
-        params = layer.parameters.copy()
-
-        if complexity > 0.7:
-            params.update({
-                "box_threshold": 0.03,
-                "iou_threshold": 0.1,
-                "imgsz": 1024
-            })
-        elif complexity > 0.4:
-            params.update({
-                "box_threshold": 0.05,
-                "iou_threshold": 0.1,
-                "imgsz": 640
-            })
-        else:
-            params.update({
-                "box_threshold": 0.08,
-                "iou_threshold": 0.2,
-                "imgsz": 640
-            })
-
-        # Run detection based on model type
-        if layer.model_name == "omniparser":
-            if hasattr(service, 'run_omniparser'):
-                result = await service.run_omniparser(
-                    image=image_path,
-                    **params
-                )
-                # Filter for interactive elements only
-                elements = [e for e in result.get("parsed_elements", []) if e.get("interactivity", False)]
-            else:
-                # Fallback for services without omniparser support
-                elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        elif layer.model_name == "florence-2":
-            if hasattr(service, 'run_florence2'):
-                result = await service.run_florence2(
-                    image=image_path,
-                    task="<OPEN_VOCABULARY_DETECTION>",
-                    text_input="login form elements, input fields, buttons"
-                )
-                elements = result.get("parsed_objects", [])
-            else:
-                elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        elif layer.model_name == "yolov8":
-            if hasattr(service, 'run_yolo'):
-                result = await service.run_yolo(
-                    image=image_path,
-                    confidence=params.get("box_threshold", 0.5)
-                )
-                elements = result.get("detected_objects", [])
-            else:
-                elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        else:
-            # Fallback to generic object detection
-            elements = await self._fallback_element_detection(service, image_path, intelligence)
-
-        return elements
-
-    async def _execute_element_classification(self, layer: LayerConfig, service: Any, context: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Execute element classification"""
-        image_path = context["input"]["image_path"]
-        elements = context["results"]["element_detection"].data
-        intelligence = context["results"]["page_intelligence"].data
-
-        classified_elements = []
-        img = Image.open(image_path)
-
-        for i, element in enumerate(elements):
-            # Crop element region with padding
-            bbox = element['bbox']
-            x1, y1, x2, y2 = bbox
-
-            padding_x = max(20, int((x2 - x1) * 0.2))
-            padding_y = max(20, int((y2 - y1) * 0.2))
-
-            crop_x1 = max(0, x1 - padding_x)
-            crop_y1 = max(0, y1 - padding_y)
-            crop_x2 = min(img.width, x2 + padding_x)
-            crop_y2 = min(img.height, y2 + padding_y)
-
-            cropped_img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-            crop_path = f"temp_classify_{i}.png"
-            cropped_img.save(crop_path)
-
-            try:
-                context_info = f'''Page Context:
-- Type: {intelligence.get('page_type', 'unknown')}
-- Layout: {intelligence.get('layout_pattern', 'unknown')}
-- Language: {intelligence.get('language', 'en')}
-- Complexity: {intelligence.get('complexity_score', 0.5)}
-
-Element Context:
-- Position: {element.get('center', [0, 0])} (center)
-- Size: {element.get('size', [0, 0])} (width x height)
-- Element {i+1} of {len(elements)} total elements'''
-
-                # Get task-specific classification prompt
-                prompt = self._get_classification_prompt(
-                    parameters.get("task", "element_classification"),
-                    context_info
-                )
-
-                result = await service.invoke(
-                    image=crop_path,
-                    prompt=prompt,
-                    task="analyze",
-                    max_tokens=layer.parameters.get("max_tokens", 300)
-                )
-
-                classification_data = self._parse_classification_result(result['text'])
-
-                # Use original detection center coordinates
-                classified_element = {
-                    **element,
-                    'classification': classification_data.get('classification', 'other'),
-                    'classification_confidence': classification_data.get('confidence', 0.5),
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': classification_data.get('reasoning', ''),
-                    'visual_evidence': classification_data.get('visual_evidence', []),
-                    'interaction_priority': classification_data.get('interaction_priority', 5),
-                    'crop_region': [crop_x1, crop_y1, crop_x2, crop_y2]
-                }
-
-                classified_elements.append(classified_element)
-
-            except Exception as e:
-                # Keep element with basic classification
-                classified_element = {
-                    **element,
-                    'classification': 'other',
-                    'classification_confidence': 0.3,
-                    'precise_center': element.get('center', [0, 0]),
-                    'reasoning': f'Classification failed: {str(e)}',
-                    'visual_evidence': [],
-                    'interaction_priority': 5,
-                    'error': str(e)
-                }
-                classified_elements.append(classified_element)
-
-            finally:
-                # Cleanup temp file
-                try:
-                    import os
-                    os.remove(crop_path)
-                except:
-                    pass
-
-        return classified_elements
-
     async def execute_fallback(self, layer: LayerConfig, context: Dict[str, Any], error: str) -> Optional[Any]:
         """Execute fallback logic for failed layers"""
 
@@ -980,113 +628,6 @@ Element Context:
                 'interaction_priority': 5
             }
 
-    def _get_classification_prompt(self, task: str, context_info: str) -> str:
-        """Get task-specific classification prompt"""
-
-        if task == "search_element_classification":
-            return f'''Classify this UI element from a search interface.
-
-{context_info}
-
-Classify as one of:
-- search_field: for search input boxes, query fields
-- search_button: for search/go buttons, submit search
-- search_suggestion: for autocomplete suggestions
-- filter: for search filters, sorting options
-- voice_search: for voice search buttons
-- image_search: for image search options
-- advanced_search: for advanced search links/options
-- nav_link: for navigation menu items
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "search_field|search_button|filter|nav_link|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-        elif task == "content_element_classification":
-            return f'''Classify this UI element from a content page.
-
-{context_info}
-
-Classify as one of:
-- article_title: for main article/page titles
-- article_body: for main content/article text
-- sidebar_content: for sidebar information
-- navigation_menu: for navigation elements
-- related_links: for related articles/links
-- comment: for comment sections
-- share_button: for social sharing
-- read_more: for read more links
-- image: for content images
-- video: for embedded videos
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "article_title|article_body|sidebar_content|navigation_menu|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-        elif task == "navigation_element_classification":
-            return f'''Classify this UI element from a navigation-focused page.
-
-{context_info}
-
-Classify as one of:
-- nav_link: for main navigation links
-- menu_item: for menu items and categories
-- breadcrumb: for breadcrumb navigation
-- dropdown_menu: for dropdown menu elements
-- footer_link: for footer navigation
-- logo: for site logos/branding
-- search_box: for site search functionality
-- user_menu: for user account menus
-- cta_button: for call-to-action buttons
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "nav_link|menu_item|breadcrumb|dropdown_menu|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
-        else:
-            # Default login classification
-            return f'''Classify this UI element from a login/authentication interface.
-
-{context_info}
-
-Classify as one of:
-- username_field: for username, email, user ID inputs
-- password_field: for password inputs
-- confirm_password: for password confirmation fields
-- login_button: for sign in/login buttons
-- register_button: for sign up/register buttons
-- submit_button: for general form submission
-- checkbox: for remember me, terms agreement
-- link: for forgot password, register links
-- other: for unrelated elements
-
-Response format:
-{{
-    "classification": "username_field|password_field|login_button|other",
-    "confidence": 0.1-1.0,
-    "reasoning": "brief explanation of classification decision",
-    "visual_evidence": ["text_visible", "styling_clues", "position_indicators"],
-    "interaction_priority": 1-10
-}}'''
-
     def _build_action_planning_prompt(self, task_type: str, elements_summary: List[Dict], interactive_elements: List[Dict]) -> str:
         """构建行动规划提示词"""
 
@@ -1279,41 +820,4 @@ Response format:
                 'confidence': 0.4,
                 'type': 'form'
             }
-        ]
-
-    async def _fallback_element_detection(self, service: Any, image_path: str, intelligence: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """Fallback element detection using generic methods"""
-        try:
-            # Try generic object detection
-            result = await service.detect_objects(image_path, confidence_threshold=0.5)
-            objects = result.get("objects", [])
-
-            # Convert to standard format
-            elements = []
-            for i, obj in enumerate(objects):
-                coords = obj.get("coordinates", {})
-                if all(k in coords for k in ['x', 'y', 'width', 'height']):
-                    # Convert percentage to pixels
-                    img = Image.open(image_path)
-                    img_width, img_height = img.size
-
-                    x = int(coords['x'] * img_width / 100)
-                    y = int(coords['y'] * img_height / 100)
-                    w = int(coords['width'] * img_width / 100)
-                    h = int(coords['height'] * img_height / 100)
-
-                    element = {
-                        'id': f'fallback_{i}',
-                        'bbox': [x, y, x + w, y + h],
-                        'center': [x + w//2, y + h//2],
-                        'size': [w, h],
-                        'confidence': obj.get('confidence', 0.7),
-                        'type': 'detected'
-                    }
-                    elements.append(element)
-
-            return elements
-
-        except Exception:
-            # Ultimate fallback
-            return self._create_fallback_elements(image_path, intelligence)
+        ]