amd-gaia 0.15.0__py3-none-any.whl → 0.15.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/METADATA +222 -223
  2. amd_gaia-0.15.2.dist-info/RECORD +182 -0
  3. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/WHEEL +1 -1
  4. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/entry_points.txt +1 -0
  5. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/licenses/LICENSE.md +20 -20
  6. gaia/__init__.py +29 -29
  7. gaia/agents/__init__.py +19 -19
  8. gaia/agents/base/__init__.py +9 -9
  9. gaia/agents/base/agent.py +2132 -2177
  10. gaia/agents/base/api_agent.py +119 -120
  11. gaia/agents/base/console.py +1967 -1841
  12. gaia/agents/base/errors.py +237 -237
  13. gaia/agents/base/mcp_agent.py +86 -86
  14. gaia/agents/base/tools.py +88 -83
  15. gaia/agents/blender/__init__.py +7 -0
  16. gaia/agents/blender/agent.py +553 -556
  17. gaia/agents/blender/agent_simple.py +133 -135
  18. gaia/agents/blender/app.py +211 -211
  19. gaia/agents/blender/app_simple.py +41 -41
  20. gaia/agents/blender/core/__init__.py +16 -16
  21. gaia/agents/blender/core/materials.py +506 -506
  22. gaia/agents/blender/core/objects.py +316 -316
  23. gaia/agents/blender/core/rendering.py +225 -225
  24. gaia/agents/blender/core/scene.py +220 -220
  25. gaia/agents/blender/core/view.py +146 -146
  26. gaia/agents/chat/__init__.py +9 -9
  27. gaia/agents/chat/agent.py +809 -835
  28. gaia/agents/chat/app.py +1065 -1058
  29. gaia/agents/chat/session.py +508 -508
  30. gaia/agents/chat/tools/__init__.py +15 -15
  31. gaia/agents/chat/tools/file_tools.py +96 -96
  32. gaia/agents/chat/tools/rag_tools.py +1744 -1729
  33. gaia/agents/chat/tools/shell_tools.py +437 -436
  34. gaia/agents/code/__init__.py +7 -7
  35. gaia/agents/code/agent.py +549 -549
  36. gaia/agents/code/cli.py +377 -0
  37. gaia/agents/code/models.py +135 -135
  38. gaia/agents/code/orchestration/__init__.py +24 -24
  39. gaia/agents/code/orchestration/checklist_executor.py +1763 -1763
  40. gaia/agents/code/orchestration/checklist_generator.py +713 -713
  41. gaia/agents/code/orchestration/factories/__init__.py +9 -9
  42. gaia/agents/code/orchestration/factories/base.py +63 -63
  43. gaia/agents/code/orchestration/factories/nextjs_factory.py +118 -118
  44. gaia/agents/code/orchestration/factories/python_factory.py +106 -106
  45. gaia/agents/code/orchestration/orchestrator.py +841 -841
  46. gaia/agents/code/orchestration/project_analyzer.py +391 -391
  47. gaia/agents/code/orchestration/steps/__init__.py +67 -67
  48. gaia/agents/code/orchestration/steps/base.py +188 -188
  49. gaia/agents/code/orchestration/steps/error_handler.py +314 -314
  50. gaia/agents/code/orchestration/steps/nextjs.py +828 -828
  51. gaia/agents/code/orchestration/steps/python.py +307 -307
  52. gaia/agents/code/orchestration/template_catalog.py +469 -469
  53. gaia/agents/code/orchestration/workflows/__init__.py +14 -14
  54. gaia/agents/code/orchestration/workflows/base.py +80 -80
  55. gaia/agents/code/orchestration/workflows/nextjs.py +186 -186
  56. gaia/agents/code/orchestration/workflows/python.py +94 -94
  57. gaia/agents/code/prompts/__init__.py +11 -11
  58. gaia/agents/code/prompts/base_prompt.py +77 -77
  59. gaia/agents/code/prompts/code_patterns.py +2034 -2036
  60. gaia/agents/code/prompts/nextjs_prompt.py +40 -40
  61. gaia/agents/code/prompts/python_prompt.py +109 -109
  62. gaia/agents/code/schema_inference.py +365 -365
  63. gaia/agents/code/system_prompt.py +41 -41
  64. gaia/agents/code/tools/__init__.py +42 -42
  65. gaia/agents/code/tools/cli_tools.py +1138 -1138
  66. gaia/agents/code/tools/code_formatting.py +319 -319
  67. gaia/agents/code/tools/code_tools.py +769 -769
  68. gaia/agents/code/tools/error_fixing.py +1347 -1347
  69. gaia/agents/code/tools/external_tools.py +180 -180
  70. gaia/agents/code/tools/file_io.py +845 -845
  71. gaia/agents/code/tools/prisma_tools.py +190 -190
  72. gaia/agents/code/tools/project_management.py +1016 -1016
  73. gaia/agents/code/tools/testing.py +321 -321
  74. gaia/agents/code/tools/typescript_tools.py +122 -122
  75. gaia/agents/code/tools/validation_parsing.py +461 -461
  76. gaia/agents/code/tools/validation_tools.py +806 -806
  77. gaia/agents/code/tools/web_dev_tools.py +1758 -1758
  78. gaia/agents/code/validators/__init__.py +16 -16
  79. gaia/agents/code/validators/antipattern_checker.py +241 -241
  80. gaia/agents/code/validators/ast_analyzer.py +197 -197
  81. gaia/agents/code/validators/requirements_validator.py +145 -145
  82. gaia/agents/code/validators/syntax_validator.py +171 -171
  83. gaia/agents/docker/__init__.py +7 -7
  84. gaia/agents/docker/agent.py +643 -642
  85. gaia/agents/emr/__init__.py +8 -8
  86. gaia/agents/emr/agent.py +1504 -1506
  87. gaia/agents/emr/cli.py +1322 -1322
  88. gaia/agents/emr/constants.py +475 -475
  89. gaia/agents/emr/dashboard/__init__.py +4 -4
  90. gaia/agents/emr/dashboard/server.py +1972 -1974
  91. gaia/agents/jira/__init__.py +11 -11
  92. gaia/agents/jira/agent.py +894 -894
  93. gaia/agents/jira/jql_templates.py +299 -299
  94. gaia/agents/routing/__init__.py +7 -7
  95. gaia/agents/routing/agent.py +567 -570
  96. gaia/agents/routing/system_prompt.py +75 -75
  97. gaia/agents/summarize/__init__.py +11 -0
  98. gaia/agents/summarize/agent.py +885 -0
  99. gaia/agents/summarize/prompts.py +129 -0
  100. gaia/api/__init__.py +23 -23
  101. gaia/api/agent_registry.py +238 -238
  102. gaia/api/app.py +305 -305
  103. gaia/api/openai_server.py +575 -575
  104. gaia/api/schemas.py +186 -186
  105. gaia/api/sse_handler.py +373 -373
  106. gaia/apps/__init__.py +4 -4
  107. gaia/apps/llm/__init__.py +6 -6
  108. gaia/apps/llm/app.py +184 -169
  109. gaia/apps/summarize/app.py +116 -633
  110. gaia/apps/summarize/html_viewer.py +133 -133
  111. gaia/apps/summarize/pdf_formatter.py +284 -284
  112. gaia/audio/__init__.py +2 -2
  113. gaia/audio/audio_client.py +439 -439
  114. gaia/audio/audio_recorder.py +269 -269
  115. gaia/audio/kokoro_tts.py +599 -599
  116. gaia/audio/whisper_asr.py +432 -432
  117. gaia/chat/__init__.py +16 -16
  118. gaia/chat/app.py +428 -430
  119. gaia/chat/prompts.py +522 -522
  120. gaia/chat/sdk.py +1228 -1225
  121. gaia/cli.py +5659 -5632
  122. gaia/database/__init__.py +10 -10
  123. gaia/database/agent.py +176 -176
  124. gaia/database/mixin.py +290 -290
  125. gaia/database/testing.py +64 -64
  126. gaia/eval/batch_experiment.py +2332 -2332
  127. gaia/eval/claude.py +542 -542
  128. gaia/eval/config.py +37 -37
  129. gaia/eval/email_generator.py +512 -512
  130. gaia/eval/eval.py +3179 -3179
  131. gaia/eval/groundtruth.py +1130 -1130
  132. gaia/eval/transcript_generator.py +582 -582
  133. gaia/eval/webapp/README.md +167 -167
  134. gaia/eval/webapp/package-lock.json +875 -875
  135. gaia/eval/webapp/package.json +20 -20
  136. gaia/eval/webapp/public/app.js +3402 -3402
  137. gaia/eval/webapp/public/index.html +87 -87
  138. gaia/eval/webapp/public/styles.css +3661 -3661
  139. gaia/eval/webapp/server.js +415 -415
  140. gaia/eval/webapp/test-setup.js +72 -72
  141. gaia/installer/__init__.py +23 -0
  142. gaia/installer/init_command.py +1275 -0
  143. gaia/installer/lemonade_installer.py +619 -0
  144. gaia/llm/__init__.py +10 -2
  145. gaia/llm/base_client.py +60 -0
  146. gaia/llm/exceptions.py +12 -0
  147. gaia/llm/factory.py +70 -0
  148. gaia/llm/lemonade_client.py +3421 -3221
  149. gaia/llm/lemonade_manager.py +294 -294
  150. gaia/llm/providers/__init__.py +9 -0
  151. gaia/llm/providers/claude.py +108 -0
  152. gaia/llm/providers/lemonade.py +118 -0
  153. gaia/llm/providers/openai_provider.py +79 -0
  154. gaia/llm/vlm_client.py +382 -382
  155. gaia/logger.py +189 -189
  156. gaia/mcp/agent_mcp_server.py +245 -245
  157. gaia/mcp/blender_mcp_client.py +138 -138
  158. gaia/mcp/blender_mcp_server.py +648 -648
  159. gaia/mcp/context7_cache.py +332 -332
  160. gaia/mcp/external_services.py +518 -518
  161. gaia/mcp/mcp_bridge.py +811 -550
  162. gaia/mcp/servers/__init__.py +6 -6
  163. gaia/mcp/servers/docker_mcp.py +83 -83
  164. gaia/perf_analysis.py +361 -0
  165. gaia/rag/__init__.py +10 -10
  166. gaia/rag/app.py +293 -293
  167. gaia/rag/demo.py +304 -304
  168. gaia/rag/pdf_utils.py +235 -235
  169. gaia/rag/sdk.py +2194 -2194
  170. gaia/security.py +183 -163
  171. gaia/talk/app.py +287 -289
  172. gaia/talk/sdk.py +538 -538
  173. gaia/testing/__init__.py +87 -87
  174. gaia/testing/assertions.py +330 -330
  175. gaia/testing/fixtures.py +333 -333
  176. gaia/testing/mocks.py +493 -493
  177. gaia/util.py +46 -46
  178. gaia/utils/__init__.py +33 -33
  179. gaia/utils/file_watcher.py +675 -675
  180. gaia/utils/parsing.py +223 -223
  181. gaia/version.py +100 -100
  182. amd_gaia-0.15.0.dist-info/RECORD +0 -168
  183. gaia/agents/code/app.py +0 -266
  184. gaia/llm/llm_client.py +0 -723
  185. {amd_gaia-0.15.0.dist-info → amd_gaia-0.15.2.dist-info}/top_level.txt +0 -0
gaia/llm/vlm_client.py CHANGED
@@ -1,382 +1,382 @@
1
- #!/usr/bin/env python3
2
- # Copyright(C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
3
- # SPDX-License-Identifier: MIT
4
-
5
- """
6
- Vision-Language Model (VLM) client for extracting text from images.
7
-
8
- Handles model loading/unloading and image-to-text extraction via Lemonade server.
9
- """
10
-
11
- import base64
12
- import logging
13
- import os
14
- from typing import Optional
15
-
16
- from dotenv import load_dotenv
17
-
18
- # Load environment variables from .env file
19
- load_dotenv()
20
-
21
- # Default Lemonade server URL (can be overridden via LEMONADE_BASE_URL env var)
22
- DEFAULT_LEMONADE_URL = "http://localhost:8000/api/v1"
23
-
24
- logger = logging.getLogger(__name__)
25
-
26
- # Magic bytes for common image formats
27
- IMAGE_SIGNATURES = {
28
- b"\x89PNG\r\n\x1a\n": "image/png",
29
- b"\xff\xd8\xff": "image/jpeg",
30
- b"GIF87a": "image/gif",
31
- b"GIF89a": "image/gif",
32
- b"RIFF": "image/webp", # WebP starts with RIFF...WEBP
33
- b"BM": "image/bmp",
34
- }
35
-
36
-
37
- def detect_image_mime_type(image_bytes: bytes) -> str:
38
- """
39
- Detect MIME type from image bytes using magic number signatures.
40
-
41
- Args:
42
- image_bytes: Raw image bytes
43
-
44
- Returns:
45
- MIME type string (e.g., "image/jpeg", "image/png")
46
- Defaults to "image/png" if format not detected.
47
- """
48
- for signature, mime_type in IMAGE_SIGNATURES.items():
49
- if image_bytes.startswith(signature):
50
- # Special case: WebP needs additional check for WEBP marker
51
- if signature == b"RIFF" and len(image_bytes) >= 12:
52
- if image_bytes[8:12] != b"WEBP":
53
- continue
54
- return mime_type
55
-
56
- # Default to PNG if format not detected
57
- logger.debug("Could not detect image format, defaulting to image/png")
58
- return "image/png"
59
-
60
-
61
- class VLMClient:
62
- """
63
- VLM client for extracting text from images using Lemonade server.
64
-
65
- Handles:
66
- - Model loading (default: Qwen3-VL-4B-Instruct-GGUF)
67
- - Image-to-markdown conversion
68
- - State tracking for VLM processing
69
- """
70
-
71
- def __init__(
72
- self,
73
- vlm_model: str = "Qwen3-VL-4B-Instruct-GGUF",
74
- base_url: Optional[str] = None,
75
- auto_load: bool = True,
76
- ):
77
- """
78
- Initialize VLM client.
79
-
80
- Args:
81
- vlm_model: Vision model to use for image extraction
82
- base_url: Lemonade server API URL (defaults to LEMONADE_BASE_URL env var)
83
- auto_load: Automatically load VLM model on first use
84
- """
85
- # Use provided base_url, fall back to env var, then default
86
- if base_url is None:
87
- base_url = os.getenv("LEMONADE_BASE_URL", DEFAULT_LEMONADE_URL)
88
- from urllib.parse import urlparse
89
-
90
- from gaia.llm.lemonade_client import LemonadeClient
91
-
92
- self.vlm_model = vlm_model
93
- self.base_url = base_url
94
-
95
- # Parse base_url to extract host and port for LemonadeClient
96
- parsed = urlparse(base_url)
97
- host = parsed.hostname or "localhost"
98
- port = parsed.port or 8000
99
-
100
- # Get base server URL (without /api/v1) for user-facing messages
101
- self.server_url = f"http://{host}:{port}"
102
-
103
- self.client = LemonadeClient(model=vlm_model, host=host, port=port)
104
- self.auto_load = auto_load
105
- self.vlm_loaded = False
106
-
107
- logger.debug(f"VLM Client initialized: {self.vlm_model} at {self.server_url}")
108
-
109
- def check_availability(self) -> bool:
110
- """
111
- Check if VLM model is available on Lemonade server.
112
-
113
- Returns:
114
- True if model is available, False otherwise
115
- """
116
- try:
117
- models_response = self.client.list_models()
118
- available_models = [
119
- m.get("id", "") for m in models_response.get("data", [])
120
- ]
121
-
122
- if self.vlm_model in available_models:
123
- logger.debug(f"VLM model available: {self.vlm_model}")
124
- return True
125
- else:
126
- logger.warning(f"❌ VLM model not found: {self.vlm_model}")
127
- logger.warning("")
128
- logger.warning("📥 To download this model:")
129
- logger.warning(f" 1. Open Lemonade Model Manager ({self.server_url})")
130
- logger.warning(f" 2. Search for: {self.vlm_model}")
131
- logger.warning(" 3. Click 'Download' to install the model")
132
- logger.warning("")
133
- logger.warning(
134
- f" Available models: {', '.join(available_models[:3])}..."
135
- )
136
- return False
137
-
138
- except Exception as e:
139
- logger.error(f"Failed to check VLM availability: {e}")
140
- logger.error(
141
- f" Make sure Lemonade server is running at {self.server_url}"
142
- )
143
- return False
144
-
145
- def _ensure_vlm_loaded(self) -> bool:
146
- """
147
- Ensure VLM model is loaded, load it if necessary.
148
-
149
- The model will be automatically downloaded if not available (handled by
150
- lemonade_client.chat_completions with auto_download=True).
151
-
152
- Returns:
153
- True if VLM is loaded, False if loading failed
154
- """
155
- if self.vlm_loaded:
156
- return True
157
-
158
- if not self.auto_load:
159
- logger.warning("VLM not loaded and auto_load=False")
160
- return False
161
-
162
- try:
163
- logger.debug(f"Loading VLM model: {self.vlm_model}")
164
- # Load model (auto-download handled by lemonade_client, may take hours)
165
- self.client.load_model(self.vlm_model, timeout=60, auto_download=True)
166
- self.vlm_loaded = True
167
- logger.debug(f"VLM model loaded: {self.vlm_model}")
168
- return True
169
-
170
- except Exception as e:
171
- logger.error(f"Failed to load VLM model: {e}")
172
- logger.error(
173
- f" Make sure Lemonade server is running at {self.server_url}"
174
- )
175
- return False
176
-
177
- def extract_from_image(
178
- self,
179
- image_bytes: bytes,
180
- image_num: int = 1,
181
- page_num: int = 1,
182
- prompt: Optional[str] = None,
183
- ) -> str:
184
- """
185
- Extract text from an image using VLM.
186
-
187
- Args:
188
- image_bytes: Image as PNG/JPEG bytes
189
- image_num: Image number on page (for logging)
190
- page_num: Page number (for logging)
191
- prompt: Custom extraction prompt (optional)
192
-
193
- Returns:
194
- Extracted text in markdown format
195
- """
196
- # Ensure VLM is loaded
197
- if not self._ensure_vlm_loaded():
198
- error_msg = "VLM model not available"
199
- logger.error(error_msg)
200
- return f"[VLM extraction failed: {error_msg}]"
201
-
202
- # Encode image as base64 and detect MIME type
203
- # Note: Image size optimization happens in pdf_utils.py during extraction
204
- image_b64 = base64.b64encode(image_bytes).decode("utf-8")
205
- mime_type = detect_image_mime_type(image_bytes)
206
-
207
- # Default prompt for text extraction
208
- if not prompt:
209
- prompt = """You are an OCR system. Extract ALL visible text from this image exactly as it appears.
210
-
211
- Instructions:
212
- 1. Extract EVERY word you see - don't skip or paraphrase
213
- 2. Preserve exact formatting (headings, bold, bullets, tables)
214
- 3. If it's a table, format as markdown table
215
- 4. If it's a chart, describe what you see: [CHART: ...]
216
- 5. Do NOT add placeholders like "[Insert ...]" - only extract actual text
217
- 6. Do NOT generate or invent content - only extract what you see
218
-
219
- Output format: Clean markdown with the ACTUAL text from the image."""
220
-
221
- # Format message with image (OpenAI vision format)
222
- messages = [
223
- {
224
- "role": "user",
225
- "content": [
226
- {"type": "text", "text": prompt},
227
- {
228
- "type": "image_url",
229
- "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
230
- },
231
- ],
232
- }
233
- ]
234
-
235
- try:
236
- import time
237
-
238
- start_time = time.time()
239
-
240
- logger.debug(
241
- f"VLM extracting from image {image_num} on page {page_num} ({mime_type})..."
242
- )
243
- logger.debug(
244
- f" Image: {mime_type}, {len(image_b64)} chars base64 ({len(image_bytes)} bytes raw)"
245
- )
246
-
247
- # Call VLM using chat completions endpoint
248
- response = self.client.chat_completions(
249
- model=self.vlm_model,
250
- messages=messages,
251
- temperature=0.1, # Low temp for accurate extraction
252
- max_completion_tokens=2048, # Allow detailed extraction
253
- timeout=300, # VLM needs more time for complex forms (5 min)
254
- )
255
-
256
- elapsed = time.time() - start_time
257
-
258
- # Extract text from response
259
- if (
260
- isinstance(response, dict)
261
- and "choices" in response
262
- and len(response["choices"]) > 0
263
- ):
264
- extracted_text = response["choices"][0]["message"]["content"]
265
- size_kb = len(image_bytes) / 1024
266
- logger.debug(
267
- f"Extracted {len(extracted_text)} chars from image {image_num} "
268
- f"in {elapsed:.2f}s ({size_kb:.0f}KB image)"
269
- )
270
- return extracted_text
271
- else:
272
- # Check for specific error types and provide helpful messages
273
- error_msg = self._parse_vlm_error(response)
274
- logger.error(error_msg)
275
- return f"[VLM extraction failed: {error_msg}]"
276
-
277
- except Exception as e:
278
- logger.error(
279
- f"VLM extraction failed for page {page_num}, image {image_num}: {e}"
280
- )
281
- import traceback
282
-
283
- logger.debug(traceback.format_exc())
284
- return f"[VLM extraction failed: {str(e)}]"
285
-
286
- def _parse_vlm_error(self, response: dict) -> str:
287
- """Parse VLM error response and return a helpful error message."""
288
- if not isinstance(response, dict):
289
- return f"Unexpected response type: {type(response)}"
290
-
291
- # Check for nested error structure from Lemonade
292
- error = response.get("error", {})
293
- if isinstance(error, dict):
294
- details = error.get("details", {})
295
- inner_response = (
296
- details.get("response", {}) if isinstance(details, dict) else {}
297
- )
298
- inner_error = (
299
- inner_response.get("error", {})
300
- if isinstance(inner_response, dict)
301
- else {}
302
- )
303
-
304
- # Context size error
305
- if inner_error.get("type") == "exceed_context_size_error":
306
- n_ctx = inner_error.get("n_ctx", "unknown")
307
- n_prompt = inner_error.get("n_prompt_tokens", "unknown")
308
- return (
309
- f"Context size too small! Image requires {n_prompt} tokens "
310
- f"but model context is only {n_ctx}. "
311
- f"To fix: Right-click Lemonade tray icon → Settings → "
312
- f"set Context Size to 32768, then restart the model."
313
- )
314
-
315
- # Other backend errors
316
- if error.get("type") == "backend_error":
317
- msg = inner_error.get(
318
- "message", error.get("message", "Unknown backend error")
319
- )
320
- return f"Backend error: {msg}"
321
-
322
- return f"Unexpected response format: {response}"
323
-
324
- def extract_from_page_images(self, images: list, page_num: int) -> list:
325
- """
326
- Extract text from multiple images on a page.
327
-
328
- Args:
329
- images: List of image dicts with 'image_bytes', 'width', 'height', etc.
330
- page_num: Page number
331
-
332
- Returns:
333
- List of dicts:
334
- [
335
- {
336
- "image_num": 1,
337
- "text": "extracted markdown",
338
- "dimensions": "800x600",
339
- "size_kb": 45.2
340
- },
341
- ...
342
- ]
343
- """
344
- results = []
345
-
346
- for img_idx, img_data in enumerate(images, 1):
347
- extracted_text = self.extract_from_image(
348
- image_bytes=img_data["image_bytes"],
349
- image_num=img_idx,
350
- page_num=page_num,
351
- )
352
-
353
- results.append(
354
- {
355
- "image_num": img_idx,
356
- "text": extracted_text,
357
- "dimensions": f"{img_data['width']}x{img_data['height']}",
358
- "size_kb": img_data["size_kb"],
359
- }
360
- )
361
-
362
- return results
363
-
364
- def cleanup(self):
365
- """
366
- Cleanup VLM resources.
367
-
368
- Call this after batch processing to mark VLM as unloaded.
369
- Note: Model remains loaded on server; this just updates local state.
370
- """
371
- if self.vlm_loaded:
372
- logger.info("🧹 VLM processing complete")
373
- self.vlm_loaded = False
374
-
375
- def __enter__(self):
376
- """Context manager entry - ensure VLM loaded."""
377
- self._ensure_vlm_loaded()
378
- return self
379
-
380
- def __exit__(self, exc_type, exc_val, exc_tb):
381
- """Context manager exit - cleanup VLM state."""
382
- self.cleanup()
1
+ #!/usr/bin/env python3
2
+ # Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ """
6
+ Vision-Language Model (VLM) client for extracting text from images.
7
+
8
+ Handles model loading/unloading and image-to-text extraction via Lemonade server.
9
+ """
10
+
11
+ import base64
12
+ import logging
13
+ import os
14
+ from typing import Optional
15
+
16
+ from dotenv import load_dotenv
17
+
18
+ # Load environment variables from .env file
19
+ load_dotenv()
20
+
21
+ # Default Lemonade server URL (can be overridden via LEMONADE_BASE_URL env var)
22
+ DEFAULT_LEMONADE_URL = "http://localhost:8000/api/v1"
23
+
24
logger = logging.getLogger(__name__)

# Magic bytes for common image formats
IMAGE_SIGNATURES = {
    b"\x89PNG\r\n\x1a\n": "image/png",
    b"\xff\xd8\xff": "image/jpeg",
    b"GIF87a": "image/gif",
    b"GIF89a": "image/gif",
    b"RIFF": "image/webp",  # WebP starts with RIFF...WEBP (confirmed below)
    b"BM": "image/bmp",
}


def detect_image_mime_type(image_bytes: bytes) -> str:
    """
    Detect MIME type from image bytes using magic number signatures.

    Args:
        image_bytes: Raw image bytes

    Returns:
        MIME type string (e.g., "image/jpeg", "image/png").
        Defaults to "image/png" if format not detected.
    """
    for signature, mime_type in IMAGE_SIGNATURES.items():
        if not image_bytes.startswith(signature):
            continue
        # Special case: "RIFF" is a generic container prefix (WebP, WAV, AVI all
        # use it). Only report WebP when the "WEBP" fourcc at bytes 8-12
        # confirms it. A payload too short to contain the marker can never
        # equal b"WEBP" here, so truncated RIFF data correctly falls through
        # to the default instead of being misreported as WebP.
        if signature == b"RIFF" and image_bytes[8:12] != b"WEBP":
            continue
        return mime_type

    # Default to PNG if format not detected
    logger.debug("Could not detect image format, defaulting to image/png")
    return "image/png"
59
+
60
+
61
class VLMClient:
    """
    VLM client for extracting text from images using Lemonade server.

    Handles:
    - Model loading (default: Qwen3-VL-4B-Instruct-GGUF)
    - Image-to-markdown conversion
    - State tracking for VLM processing
    """

    def __init__(
        self,
        vlm_model: str = "Qwen3-VL-4B-Instruct-GGUF",
        base_url: Optional[str] = None,
        auto_load: bool = True,
    ) -> None:
        """
        Initialize VLM client.

        Args:
            vlm_model: Vision model to use for image extraction
            base_url: Lemonade server API URL (defaults to LEMONADE_BASE_URL env var)
            auto_load: Automatically load VLM model on first use
        """
        # Use provided base_url, fall back to env var, then default
        if base_url is None:
            base_url = os.getenv("LEMONADE_BASE_URL", DEFAULT_LEMONADE_URL)
        # Imports are deferred to call time — presumably to avoid an import
        # cycle with gaia.llm.lemonade_client; TODO confirm.
        from urllib.parse import urlparse

        from gaia.llm.lemonade_client import LemonadeClient

        self.vlm_model = vlm_model
        self.base_url = base_url

        # Parse base_url to extract host and port for LemonadeClient
        parsed = urlparse(base_url)
        host = parsed.hostname or "localhost"
        port = parsed.port or 8000

        # Get base server URL (without /api/v1) for user-facing messages.
        # NOTE(review): scheme is hard-coded to "http" even if base_url was
        # given with https — confirm this is intended.
        self.server_url = f"http://{host}:{port}"

        self.client = LemonadeClient(model=vlm_model, host=host, port=port)
        self.auto_load = auto_load
        # Local bookkeeping only; does not query actual server-side state.
        self.vlm_loaded: bool = False

        logger.debug(f"VLM Client initialized: {self.vlm_model} at {self.server_url}")

    def check_availability(self) -> bool:
        """
        Check if VLM model is available on Lemonade server.

        Returns:
            True if model is available, False otherwise
        """
        try:
            models_response = self.client.list_models()
            # Response follows the OpenAI-style list shape: {"data": [{"id": ...}]}
            available_models = [
                m.get("id", "") for m in models_response.get("data", [])
            ]

            if self.vlm_model in available_models:
                logger.debug(f"VLM model available: {self.vlm_model}")
                return True
            else:
                # Emit step-by-step download instructions for the user.
                logger.warning(f"❌ VLM model not found: {self.vlm_model}")
                logger.warning("")
                logger.warning("📥 To download this model:")
                logger.warning(f" 1. Open Lemonade Model Manager ({self.server_url})")
                logger.warning(f" 2. Search for: {self.vlm_model}")
                logger.warning(" 3. Click 'Download' to install the model")
                logger.warning("")
                # Only the first three model ids are shown to keep the log short.
                logger.warning(
                    f" Available models: {', '.join(available_models[:3])}..."
                )
                return False

        except Exception as e:
            # Broad catch: any client/network failure is reported as "unavailable".
            logger.error(f"Failed to check VLM availability: {e}")
            logger.error(
                f" Make sure Lemonade server is running at {self.server_url}"
            )
            return False

    def _ensure_vlm_loaded(self) -> bool:
        """
        Ensure VLM model is loaded, load it if necessary.

        The model will be automatically downloaded if not available (handled by
        lemonade_client.chat_completions with auto_download=True).

        Returns:
            True if VLM is loaded, False if loading failed
        """
        if self.vlm_loaded:
            return True

        if not self.auto_load:
            logger.warning("VLM not loaded and auto_load=False")
            return False

        try:
            logger.debug(f"Loading VLM model: {self.vlm_model}")
            # Load model (auto-download handled by lemonade_client, may take hours)
            # NOTE(review): timeout=60 seems at odds with the "may take hours"
            # comment — confirm load_model's timeout excludes download time.
            self.client.load_model(self.vlm_model, timeout=60, auto_download=True)
            self.vlm_loaded = True
            logger.debug(f"VLM model loaded: {self.vlm_model}")
            return True

        except Exception as e:
            logger.error(f"Failed to load VLM model: {e}")
            logger.error(
                f" Make sure Lemonade server is running at {self.server_url}"
            )
            return False

    def extract_from_image(
        self,
        image_bytes: bytes,
        image_num: int = 1,
        page_num: int = 1,
        prompt: Optional[str] = None,
    ) -> str:
        """
        Extract text from an image using VLM.

        Args:
            image_bytes: Image as PNG/JPEG bytes
            image_num: Image number on page (for logging)
            page_num: Page number (for logging)
            prompt: Custom extraction prompt (optional)

        Returns:
            Extracted text in markdown format, or a
            "[VLM extraction failed: ...]" marker string on any failure
            (this method never raises).
        """
        # Ensure VLM is loaded
        if not self._ensure_vlm_loaded():
            error_msg = "VLM model not available"
            logger.error(error_msg)
            return f"[VLM extraction failed: {error_msg}]"

        # Encode image as base64 and detect MIME type
        # Note: Image size optimization happens in pdf_utils.py during extraction
        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
        mime_type = detect_image_mime_type(image_bytes)

        # Default prompt for text extraction
        if not prompt:
            prompt = """You are an OCR system. Extract ALL visible text from this image exactly as it appears.

Instructions:
1. Extract EVERY word you see - don't skip or paraphrase
2. Preserve exact formatting (headings, bold, bullets, tables)
3. If it's a table, format as markdown table
4. If it's a chart, describe what you see: [CHART: ...]
5. Do NOT add placeholders like "[Insert ...]" - only extract actual text
6. Do NOT generate or invent content - only extract what you see

Output format: Clean markdown with the ACTUAL text from the image."""

        # Format message with image (OpenAI vision format)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        # Image is inlined as a data: URI rather than a URL.
                        "image_url": {"url": f"data:{mime_type};base64,{image_b64}"},
                    },
                ],
            }
        ]

        try:
            import time

            start_time = time.time()

            logger.debug(
                f"VLM extracting from image {image_num} on page {page_num} ({mime_type})..."
            )
            logger.debug(
                f" Image: {mime_type}, {len(image_b64)} chars base64 ({len(image_bytes)} bytes raw)"
            )

            # Call VLM using chat completions endpoint
            response = self.client.chat_completions(
                model=self.vlm_model,
                messages=messages,
                temperature=0.1,  # Low temp for accurate extraction
                max_completion_tokens=2048,  # Allow detailed extraction
                timeout=300,  # VLM needs more time for complex forms (5 min)
            )

            elapsed = time.time() - start_time

            # Extract text from response
            if (
                isinstance(response, dict)
                and "choices" in response
                and len(response["choices"]) > 0
            ):
                extracted_text = response["choices"][0]["message"]["content"]
                size_kb = len(image_bytes) / 1024
                logger.debug(
                    f"Extracted {len(extracted_text)} chars from image {image_num} "
                    f"in {elapsed:.2f}s ({size_kb:.0f}KB image)"
                )
                return extracted_text
            else:
                # Check for specific error types and provide helpful messages
                error_msg = self._parse_vlm_error(response)
                logger.error(error_msg)
                return f"[VLM extraction failed: {error_msg}]"

        except Exception as e:
            logger.error(
                f"VLM extraction failed for page {page_num}, image {image_num}: {e}"
            )
            import traceback

            # Full traceback only at debug level to keep error logs readable.
            logger.debug(traceback.format_exc())
            return f"[VLM extraction failed: {str(e)}]"

    def _parse_vlm_error(self, response: dict) -> str:
        """Parse VLM error response and return a helpful error message.

        Walks the nested Lemonade error envelope
        (error -> details -> response -> error), tolerating any level being
        missing or not a dict.
        """
        if not isinstance(response, dict):
            return f"Unexpected response type: {type(response)}"

        # Check for nested error structure from Lemonade
        error = response.get("error", {})
        if isinstance(error, dict):
            details = error.get("details", {})
            inner_response = (
                details.get("response", {}) if isinstance(details, dict) else {}
            )
            inner_error = (
                inner_response.get("error", {})
                if isinstance(inner_response, dict)
                else {}
            )

            # Context size error
            if inner_error.get("type") == "exceed_context_size_error":
                n_ctx = inner_error.get("n_ctx", "unknown")
                n_prompt = inner_error.get("n_prompt_tokens", "unknown")
                return (
                    f"Context size too small! Image requires {n_prompt} tokens "
                    f"but model context is only {n_ctx}. "
                    f"To fix: Right-click Lemonade tray icon → Settings → "
                    f"set Context Size to 32768, then restart the model."
                )

            # Other backend errors
            if error.get("type") == "backend_error":
                # Prefer the inner (more specific) message over the outer one.
                msg = inner_error.get(
                    "message", error.get("message", "Unknown backend error")
                )
                return f"Backend error: {msg}"

        return f"Unexpected response format: {response}"

    def extract_from_page_images(self, images: list, page_num: int) -> list:
        """
        Extract text from multiple images on a page.

        Args:
            images: List of image dicts with 'image_bytes', 'width', 'height', etc.
            page_num: Page number

        Returns:
            List of dicts:
            [
                {
                    "image_num": 1,
                    "text": "extracted markdown",
                    "dimensions": "800x600",
                    "size_kb": 45.2
                },
                ...
            ]
        """
        results = []

        # Images are numbered starting at 1 for user-facing logs.
        for img_idx, img_data in enumerate(images, 1):
            extracted_text = self.extract_from_image(
                image_bytes=img_data["image_bytes"],
                image_num=img_idx,
                page_num=page_num,
            )

            results.append(
                {
                    "image_num": img_idx,
                    "text": extracted_text,
                    "dimensions": f"{img_data['width']}x{img_data['height']}",
                    "size_kb": img_data["size_kb"],
                }
            )

        return results

    def cleanup(self) -> None:
        """
        Cleanup VLM resources.

        Call this after batch processing to mark VLM as unloaded.
        Note: Model remains loaded on server; this just updates local state.
        """
        if self.vlm_loaded:
            logger.info("🧹 VLM processing complete")
            self.vlm_loaded = False

    def __enter__(self):
        """Context manager entry - ensure VLM loaded."""
        # Load failure is not raised here; extract_from_image re-checks and
        # returns a failure marker instead.
        self._ensure_vlm_loaded()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - cleanup VLM state."""
        self.cleanup()