abstractcore 2.5.0__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/apps/__main__.py +8 -1
  3. abstractcore/apps/deepsearch.py +644 -0
  4. abstractcore/apps/intent.py +614 -0
  5. abstractcore/architectures/detection.py +250 -4
  6. abstractcore/assets/architecture_formats.json +14 -1
  7. abstractcore/assets/model_capabilities.json +583 -44
  8. abstractcore/compression/__init__.py +29 -0
  9. abstractcore/compression/analytics.py +420 -0
  10. abstractcore/compression/cache.py +250 -0
  11. abstractcore/compression/config.py +279 -0
  12. abstractcore/compression/exceptions.py +30 -0
  13. abstractcore/compression/glyph_processor.py +381 -0
  14. abstractcore/compression/optimizer.py +388 -0
  15. abstractcore/compression/orchestrator.py +380 -0
  16. abstractcore/compression/pil_text_renderer.py +818 -0
  17. abstractcore/compression/quality.py +226 -0
  18. abstractcore/compression/text_formatter.py +666 -0
  19. abstractcore/compression/vision_compressor.py +371 -0
  20. abstractcore/config/main.py +66 -1
  21. abstractcore/config/manager.py +111 -5
  22. abstractcore/core/session.py +105 -5
  23. abstractcore/events/__init__.py +1 -1
  24. abstractcore/media/auto_handler.py +312 -18
  25. abstractcore/media/handlers/local_handler.py +14 -2
  26. abstractcore/media/handlers/openai_handler.py +62 -3
  27. abstractcore/media/processors/__init__.py +11 -1
  28. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  29. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  30. abstractcore/media/processors/image_processor.py +7 -1
  31. abstractcore/media/processors/text_processor.py +18 -3
  32. abstractcore/media/types.py +164 -7
  33. abstractcore/processing/__init__.py +5 -1
  34. abstractcore/processing/basic_deepsearch.py +2173 -0
  35. abstractcore/processing/basic_intent.py +690 -0
  36. abstractcore/providers/__init__.py +18 -0
  37. abstractcore/providers/anthropic_provider.py +29 -2
  38. abstractcore/providers/base.py +279 -6
  39. abstractcore/providers/huggingface_provider.py +658 -27
  40. abstractcore/providers/lmstudio_provider.py +52 -2
  41. abstractcore/providers/mlx_provider.py +103 -4
  42. abstractcore/providers/model_capabilities.py +352 -0
  43. abstractcore/providers/ollama_provider.py +44 -6
  44. abstractcore/providers/openai_provider.py +29 -2
  45. abstractcore/providers/registry.py +91 -19
  46. abstractcore/server/app.py +91 -81
  47. abstractcore/structured/handler.py +161 -1
  48. abstractcore/tools/common_tools.py +98 -3
  49. abstractcore/utils/__init__.py +4 -1
  50. abstractcore/utils/cli.py +114 -1
  51. abstractcore/utils/trace_export.py +287 -0
  52. abstractcore/utils/version.py +1 -1
  53. abstractcore/utils/vlm_token_calculator.py +655 -0
  54. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
  55. abstractcore-2.5.3.dist-info/RECORD +107 -0
  56. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
  57. abstractcore-2.5.0.dist-info/RECORD +0 -86
  58. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  59. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  60. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
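The largest single change in this release is in abstractcore/providers/huggingface_provider.py (entry 39 above): a new structured_output_method constructor option, a response_model parameter threaded through generation, vision-model loading, and a new default GGUF model. A minimal usage sketch follows, assuming the provider is constructed directly and exposes a public generate() wrapper around the internal dispatch shown in the hunks below; the import path and the generate() call are assumptions, not confirmed public API.

```python
# Hedged sketch only: parameter names come from the diff below; the import path
# and the public generate() method are assumptions about AbstractCore's API.
from pydantic import BaseModel
from abstractcore.providers.huggingface_provider import HuggingFaceProvider


class CityInfo(BaseModel):
    name: str
    country: str


# "auto": use Outlines for transformers models when installed, otherwise the
# prompted fallback; GGUF models always use llama-cpp-python's native
# response_format support (per the comments added in 2.5.3).
provider = HuggingFaceProvider(
    model="unsloth/Qwen3-4B-Instruct-2507-GGUF",  # new default model in 2.5.3
    structured_output_method="auto",
)

response = provider.generate(  # assumed public wrapper around _generate_gguf/_generate_transformers
    "Name one city in France.",
    response_model=CityInfo,
)
print(response.content)  # JSON matching the CityInfo schema
```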
@@ -8,6 +8,25 @@ import json
  from pathlib import Path
  from typing import List, Dict, Any, Optional, Union, Iterator, Type

+ # Import config manager to respect offline-first settings
+ from ..config.manager import get_config_manager
+
+ # Get config instance and set offline environment variables if needed
+ _config = get_config_manager()
+ if _config.is_offline_first():
+     os.environ["TRANSFORMERS_OFFLINE"] = "1"
+     os.environ["HF_DATASETS_OFFLINE"] = "1"
+     os.environ["HF_HUB_OFFLINE"] = "1"
+
+ # Enable MPS fallback for Apple Silicon to handle unsupported operations
+ # This prevents "MPS: Unsupported Border padding mode" errors in vision models
+ try:
+     import torch
+     if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+         os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+ except ImportError:
+     pass # torch not available, skip MPS setup
+
  try:
      from pydantic import BaseModel
      PYDANTIC_AVAILABLE = True
@@ -22,7 +41,7 @@ from ..events import EventType

  # Try to import transformers (standard HuggingFace support)
  try:
-     from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+     from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, pipeline
      import torch
      TRANSFORMERS_AVAILABLE = True
  except ImportError:
@@ -35,16 +54,40 @@ try:
  except ImportError:
      LLAMACPP_AVAILABLE = False

+ # Try to import Outlines (native structured output for transformers models)
+ try:
+     import outlines
+     OUTLINES_AVAILABLE = True
+ except ImportError:
+     OUTLINES_AVAILABLE = False
+
  # We no longer download models - cache-only approach
  # huggingface_hub not required for basic operation


+ def _get_local_model_path(model_name: str) -> Optional[str]:
+     """Get local cache path for a HuggingFace model if it exists."""
+     # Use centralized configuration for cache directory
+     config = _config
+     hf_cache_dir = Path(config.config.cache.huggingface_cache_dir).expanduser()
+
+     model_cache_name = f"models--{model_name.replace('/', '--')}"
+     model_cache_path = hf_cache_dir / "hub" / model_cache_name / "snapshots"
+
+     if model_cache_path.exists():
+         snapshot_dirs = [d for d in model_cache_path.iterdir() if d.is_dir()]
+         if snapshot_dirs:
+             return str(snapshot_dirs[0]) # Return first snapshot
+     return None
+
+
  class HuggingFaceProvider(BaseProvider):
      """HuggingFace provider with dual support for transformers and GGUF models"""

-     def __init__(self, model: str = "Qwen/Qwen3-4B",
+     def __init__(self, model: str = "unsloth/Qwen3-4B-Instruct-2507-GGUF",
                   device: Optional[str] = None,
                   n_gpu_layers: Optional[int] = None,
+                  structured_output_method: str = "auto",
                   **kwargs):

          # Handle legacy context_size parameter with deprecation warning
@@ -61,10 +104,18 @@ class HuggingFaceProvider(BaseProvider):
              kwargs["max_tokens"] = context_size

          super().__init__(model, **kwargs)
+         self.provider = "huggingface"

          # Handle timeout parameter for local models
          self._handle_timeout_parameter(kwargs)

+         # Structured output method: "auto", "native_outlines", "prompted"
+         # auto: Use Outlines if available (for transformers), otherwise prompted (default)
+         # native_outlines: Force Outlines (error if unavailable)
+         # prompted: Always use prompted fallback (fastest for transformers, still 100% success)
+         # Note: GGUF models always use llama-cpp-python native support regardless of this setting
+         self.structured_output_method = structured_output_method
+
          # Initialize tool handler
          self.tool_handler = UniversalToolHandler(model)

@@ -72,9 +123,19 @@ class HuggingFaceProvider(BaseProvider):
          self.n_gpu_layers = n_gpu_layers
          self.model_type = None # Will be "transformers" or "gguf"
          self.device = device
+
+         # Store transformers-specific parameters
+         self.transformers_kwargs = {
+             k: v for k, v in kwargs.items()
+             if k in ['trust_remote_code', 'torch_dtype', 'device_map', 'load_in_8bit', 'load_in_4bit', 'attn_implementation']
+         }
+
+         # Store device preference for custom models
+         self.preferred_device = kwargs.get('device_map', 'auto')

          # Model instances
          self.tokenizer = None
+         self.processor = None # For vision models
          self.model_instance = None
          self.pipeline = None
          self.llm = None # For GGUF models
@@ -111,6 +172,9 @@ class HuggingFaceProvider(BaseProvider):

          if hasattr(self, 'tokenizer') and self.tokenizer is not None:
              self.tokenizer = None
+
+         if hasattr(self, 'processor') and self.processor is not None:
+             self.processor = None

          if hasattr(self, 'model') and hasattr(self, 'model') and self.model is not None:
              # For transformers models, clear the model
@@ -153,6 +217,26 @@ class HuggingFaceProvider(BaseProvider):

          return False

+     def _is_vision_model(self, model: str) -> bool:
+         """Detect if the model is a vision model that requires special handling"""
+         model_lower = model.lower()
+
+         # Known vision models that require AutoModelForImageTextToText
+         vision_models = [
+             'glyph',        # zai-org/Glyph
+             'glm-4.1v',     # GLM-4.1V variants
+             'glm4v',        # GLM4V architecture
+             'qwen-vl',      # Qwen-VL models
+             'qwen2-vl',     # Qwen2-VL models
+             'qwen2.5-vl',   # Qwen2.5-VL models
+             'llava',        # LLaVA models
+             'instructblip', # InstructBLIP models
+             'blip2',        # BLIP2 models
+             'flamingo',     # Flamingo models
+         ]
+
+         return any(vision_keyword in model_lower for vision_keyword in vision_models)
+
      def _setup_device_transformers(self):
          """Setup device for transformers models"""
          if not TRANSFORMERS_AVAILABLE:
@@ -200,24 +284,65 @@ class HuggingFaceProvider(BaseProvider):
      def _load_transformers_model(self):
          """Load standard HuggingFace transformers model"""
          try:
-             self.tokenizer = AutoTokenizer.from_pretrained(self.model)
-             self.model_instance = AutoModelForCausalLM.from_pretrained(self.model)
+             # Check if this is a vision model that requires special handling
+             if self._is_vision_model(self.model):
+                 return self._load_vision_model()
+
+             # Load tokenizer with transformers-specific parameters
+             tokenizer_kwargs = {k: v for k, v in self.transformers_kwargs.items()
+                                 if k in ['trust_remote_code']}
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 tokenizer_kwargs['local_files_only'] = True
+             self.tokenizer = AutoTokenizer.from_pretrained(self.model, **tokenizer_kwargs)
+
+             # Load model with all transformers-specific parameters
+             # Try AutoModelForCausalLM first, fall back to AutoModel for custom models
+             model_kwargs = self.transformers_kwargs.copy()
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 model_kwargs['local_files_only'] = True
+
+             try:
+                 self.model_instance = AutoModelForCausalLM.from_pretrained(self.model, **model_kwargs)
+             except ValueError as e:
+                 if "Unrecognized configuration class" in str(e) or "glm4v" in str(e).lower():
+                     # Fall back to AutoModel for custom models like DeepSeek-OCR
+                     self.model_instance = AutoModel.from_pretrained(self.model, **model_kwargs)
+                 else:
+                     raise

-             # Move to device
-             if self.device in ["cuda", "mps"]:
+             # Move to device (only if not using device_map)
+             if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
                  self.model_instance = self.model_instance.to(self.device)

-             # Create pipeline
+             # Create pipeline - handle custom models that don't support text-generation
              device_arg = 0 if self.device == "cuda" else -1
              if self.device == "mps":
                  device_arg = -1

-             self.pipeline = pipeline(
-                 "text-generation",
-                 model=self.model_instance,
-                 tokenizer=self.tokenizer,
-                 device=device_arg
-             )
+             try:
+                 # Don't pass device argument if using device_map (accelerate)
+                 if 'device_map' in self.transformers_kwargs:
+                     self.pipeline = pipeline(
+                         "text-generation",
+                         model=self.model_instance,
+                         tokenizer=self.tokenizer
+                     )
+                 else:
+                     self.pipeline = pipeline(
+                         "text-generation",
+                         model=self.model_instance,
+                         tokenizer=self.tokenizer,
+                         device=device_arg
+                     )
+             except ValueError as e:
+                 if "not supported for text-generation" in str(e) or "accelerate" in str(e):
+                     # For custom models like DeepSeek-OCR, skip pipeline creation
+                     # We'll handle generation directly through the model
+                     self.pipeline = None
+                 else:
+                     raise

          except Exception as e:
              error_str = str(e).lower()
@@ -229,6 +354,96 @@ class HuggingFaceProvider(BaseProvider):
              else:
                  raise RuntimeError(f"Failed to load HuggingFace model {self.model}: {str(e)}")

+     def _load_vision_model(self):
+         """Load vision model using AutoModelForImageTextToText and AutoProcessor"""
+         try:
+             # Suppress progress bars during model loading unless in debug mode
+             import os
+             from transformers.utils import logging as transformers_logging
+
+             if not self.debug:
+                 # Disable transformers progress bars
+                 os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+                 transformers_logging.set_verbosity_error()
+                 # Disable tqdm progress bars
+                 os.environ['DISABLE_TQDM'] = '1'
+
+             # Load processor for vision models (handles both text and images)
+             processor_kwargs = {k: v for k, v in self.transformers_kwargs.items()
+                                 if k in ['trust_remote_code']}
+             # Enable trust_remote_code for custom architectures like GLM4V
+             processor_kwargs['trust_remote_code'] = True
+             # Set use_fast=True to avoid the slow processor warning
+             processor_kwargs['use_fast'] = True
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 processor_kwargs['local_files_only'] = True
+
+             # Use local cache path if offline mode is enabled and model is cached
+             model_path = self.model
+             if _config.should_force_local_files_only():
+                 local_path = _get_local_model_path(self.model)
+                 if local_path:
+                     model_path = local_path
+                     processor_kwargs.pop('local_files_only', None) # Remove since we're using local path
+                     self.logger.debug(f"Loading processor from local cache: {local_path}")
+
+             self.processor = AutoProcessor.from_pretrained(model_path, **processor_kwargs)
+
+             # Load vision model using AutoModelForImageTextToText with trust_remote_code
+             vision_kwargs = self.transformers_kwargs.copy()
+             vision_kwargs['trust_remote_code'] = True
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 vision_kwargs['local_files_only'] = True
+
+             # Use local cache path if offline mode is enabled and model is cached
+             model_path = self.model
+             if _config.should_force_local_files_only():
+                 local_path = _get_local_model_path(self.model)
+                 if local_path:
+                     model_path = local_path
+                     vision_kwargs.pop('local_files_only', None) # Remove since we're using local path
+                     self.logger.debug(f"Loading model from local cache: {local_path}")
+
+             self.model_instance = AutoModelForImageTextToText.from_pretrained(model_path, **vision_kwargs)
+
+             # Restore logging levels if they were suppressed
+             if not self.debug:
+                 # Restore transformers logging
+                 transformers_logging.set_verbosity_warning()
+                 # Remove tqdm suppression
+                 if 'DISABLE_TQDM' in os.environ:
+                     del os.environ['DISABLE_TQDM']
+
+             # Move to device (only if not using device_map)
+             if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
+                 self.model_instance = self.model_instance.to(self.device)
+
+             # For vision models, we don't use the standard pipeline
+             self.pipeline = None
+
+             self.logger.info(f"Successfully loaded vision model {self.model} using AutoModelForImageTextToText")
+
+         except Exception as e:
+             error_str = str(e).lower()
+
+             # Check for transformers version issues
+             if 'glm4v' in error_str and 'does not recognize this architecture' in error_str:
+                 import transformers
+                 current_version = transformers.__version__
+                 raise RuntimeError(
+                     f"GLM4V architecture requires transformers>=4.57.1, but you have {current_version}. "
+                     f"Please upgrade: pip install transformers>=4.57.1"
+                 )
+             elif ('not found' in error_str or 'does not exist' in error_str or
+                   'not a valid model identifier' in error_str):
+                 available_models = self.list_available_models()
+                 error_message = format_model_error("HuggingFace", self.model, available_models)
+                 raise ModelNotFoundError(error_message)
+             else:
+                 raise RuntimeError(f"Failed to load HuggingFace vision model {self.model}: {str(e)}")
+
      def _find_gguf_in_cache(self, model_name: str) -> Optional[str]:
          """Find GGUF model in HuggingFace cache (cache-only, no downloading)"""

@@ -481,9 +696,9 @@ class HuggingFaceProvider(BaseProvider):
          """Generate response using appropriate backend"""

          if self.model_type == "gguf":
-             return self._generate_gguf(prompt, messages, system_prompt, tools, media, stream, **kwargs)
+             return self._generate_gguf(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
          else:
-             return self._generate_transformers(prompt, messages, system_prompt, tools, media, stream, **kwargs)
+             return self._generate_transformers(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)

      def _generate_transformers(self,
                                 prompt: str,
@@ -492,15 +707,83 @@ class HuggingFaceProvider(BaseProvider):
                                 tools: Optional[List[Dict[str, Any]]] = None,
                                 media: Optional[List['MediaContent']] = None,
                                 stream: bool = False,
+                                response_model: Optional[Type[BaseModel]] = None,
                                 **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
-         """Generate using transformers backend (original implementation)"""
+         """Generate using transformers backend with optional Outlines native structured output"""

          if not self.pipeline:
-             return GenerateResponse(
-                 content="Error: Transformers model not loaded",
-                 model=self.model,
-                 finish_reason="error"
-             )
+             # Handle vision models that use processor instead of pipeline
+             if self.processor and hasattr(self.model_instance, 'generate'):
+                 return self._generate_vision_model(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
+             # Handle custom models like DeepSeek-OCR that don't support standard pipelines
+             elif hasattr(self.model_instance, 'infer'):
+                 return self._generate_custom_model(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
+             else:
+                 return GenerateResponse(
+                     content="Error: Transformers model not loaded or doesn't support generation",
+                     model=self.model,
+                     finish_reason="error"
+                 )
+
+         # Native structured output via Outlines (if configured and available)
+         should_use_outlines = (
+             response_model and
+             PYDANTIC_AVAILABLE and
+             not stream and
+             self.structured_output_method != "prompted" # Skip if explicitly prompted
+         )
+
+         if should_use_outlines:
+             # Check if Outlines is required but unavailable
+             if self.structured_output_method == "native_outlines" and not OUTLINES_AVAILABLE:
+                 return GenerateResponse(
+                     content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install abstractcore[huggingface]",
+                     model=self.model,
+                     finish_reason="error"
+                 )
+
+             # Try Outlines if available (auto or native_outlines mode)
+             if OUTLINES_AVAILABLE:
+                 try:
+                     # Cache Outlines model wrapper to avoid re-initialization
+                     if not hasattr(self, '_outlines_model') or self._outlines_model is None:
+                         self.logger.debug("Creating Outlines model wrapper for native structured output")
+                         self._outlines_model = outlines.from_transformers(
+                             self.model_instance,
+                             self.tokenizer
+                         )
+
+                     # Build input text (same as normal generation)
+                     input_text = self._build_input_text_transformers(prompt, messages, system_prompt, tools)
+
+                     # Create constrained generator with JSON schema
+                     self.logger.debug(f"Using Outlines native structured output for {response_model.__name__}")
+                     generator = self._outlines_model(
+                         input_text,
+                         outlines.json_schema(response_model),
+                         max_tokens=kwargs.get("max_tokens", self.max_tokens or 512)
+                     )
+
+                     # Validate and return
+                     validated_obj = response_model.model_validate(generator)
+
+                     return GenerateResponse(
+                         content=validated_obj.model_dump_json(),
+                         model=self.model,
+                         finish_reason="stop",
+                         validated_object=validated_obj
+                     )
+                 except Exception as e:
+                     # If native_outlines was explicitly requested, don't fall back
+                     if self.structured_output_method == "native_outlines":
+                         return GenerateResponse(
+                             content=f"Error: Outlines native structured output failed: {str(e)}",
+                             model=self.model,
+                             finish_reason="error"
+                         )
+                     # Otherwise fall back to prompted approach
+                     self.logger.debug(f"Outlines generation failed, falling back to prompted: {e}")
+                     # Continue with normal generation below

          # Build input text with tool and media support
          # Handle media content first if present
@@ -561,6 +844,311 @@ class HuggingFaceProvider(BaseProvider):
                  finish_reason="error"
              )

+     def _generate_custom_model(self,
+                                prompt: str,
+                                messages: Optional[List[Dict[str, str]]] = None,
+                                system_prompt: Optional[str] = None,
+                                tools: Optional[List[Dict[str, Any]]] = None,
+                                media: Optional[List['MediaContent']] = None,
+                                stream: bool = False,
+                                response_model: Optional[Type[BaseModel]] = None,
+                                **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+         """Generate using custom model methods (e.g., DeepSeek-OCR's infer method)"""
+
+         import time
+         import tempfile
+         import os
+         start_time = time.time()
+
+         try:
+             # Handle media content for vision models like DeepSeek-OCR
+             if media and len(media) > 0:
+                 # Use the first image for OCR
+                 media_item = media[0]
+
+                 # DeepSeek-OCR expects image file path
+                 if hasattr(media_item, 'file_path') and media_item.file_path:
+                     image_file = str(media_item.file_path)
+                 else:
+                     # If no file path, save media content to temp file
+                     from PIL import Image
+
+                     if hasattr(media_item, 'content') and media_item.content:
+                         # Handle base64 content
+                         if media_item.content_format == 'BASE64':
+                             import base64
+                             image_data = base64.b64decode(media_item.content)
+                             temp_file = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
+                             temp_file.write(image_data)
+                             temp_file.close()
+                             image_file = temp_file.name
+                         else:
+                             return GenerateResponse(
+                                 content="Error: Unsupported media format for DeepSeek-OCR",
+                                 model=self.model,
+                                 finish_reason="error"
+                             )
+                     else:
+                         return GenerateResponse(
+                             content="Error: No valid image content found",
+                             model=self.model,
+                             finish_reason="error"
+                         )
+
+                 # Use DeepSeek-OCR's infer method
+                 try:
+                     # Create temporary output directory for DeepSeek-OCR
+                     temp_output_dir = tempfile.mkdtemp()
+
+                     # Patch DeepSeek-OCR for MPS/CPU compatibility if needed
+                     if self.device == "mps" or (self.device is None and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()):
+                         self._patch_deepseek_for_mps()
+
+                     result = self.model_instance.infer(
+                         self.tokenizer,
+                         prompt=prompt,
+                         image_file=image_file,
+                         output_path=temp_output_dir, # DeepSeek-OCR requires output path
+                         base_size=1024,
+                         image_size=640,
+                         crop_mode=True,
+                         save_results=False,
+                         test_compress=False
+                     )
+
+                     # Clean up temp output directory
+                     import shutil
+                     shutil.rmtree(temp_output_dir, ignore_errors=True)
+
+                     # Clean up temp file if created
+                     if 'temp_file' in locals() and os.path.exists(image_file):
+                         os.unlink(image_file)
+
+                     # Calculate generation time
+                     gen_time = (time.time() - start_time) * 1000
+
+                     return GenerateResponse(
+                         content=result if isinstance(result, str) else str(result),
+                         model=self.model,
+                         finish_reason="stop",
+                         input_tokens=len(prompt.split()), # Rough estimate
+                         output_tokens=len(str(result).split()) if result else 0,
+                         gen_time=gen_time
+                     )
+
+                 except Exception as e:
+                     return GenerateResponse(
+                         content=f"Error during DeepSeek-OCR inference: {str(e)}",
+                         model=self.model,
+                         finish_reason="error"
+                     )
+             else:
+                 return GenerateResponse(
+                     content="Error: DeepSeek-OCR requires image input",
+                     model=self.model,
+                     finish_reason="error"
+                 )
+
+         except Exception as e:
+             return GenerateResponse(
+                 content=f"Error in custom model generation: {str(e)}",
+                 model=self.model,
+                 finish_reason="error"
+             )
+
+     def _generate_vision_model(self,
+                                prompt: str,
+                                messages: Optional[List[Dict[str, str]]] = None,
+                                system_prompt: Optional[str] = None,
+                                tools: Optional[List[Dict[str, Any]]] = None,
+                                media: Optional[List['MediaContent']] = None,
+                                stream: bool = False,
+                                response_model: Optional[Type[BaseModel]] = None,
+                                **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+         """Generate using vision model (Glyph, GLM-4.1V, etc.)"""
+
+         import time
+         start_time = time.time()
+
+         # Import torch safely
+         try:
+             import torch
+         except ImportError:
+             return GenerateResponse(
+                 content="Error: PyTorch not available for vision model generation",
+                 model=self.model,
+                 finish_reason="error",
+                 gen_time=0.0
+             )
+
+         try:
+             # Build messages for vision model
+             chat_messages = []
+
+             if system_prompt:
+                 chat_messages.append({"role": "system", "content": system_prompt})
+
+             if messages:
+                 chat_messages.extend(messages)
+
+             # Build user message with media content
+             user_content = []
+
+             # Add text content
+             if prompt:
+                 user_content.append({"type": "text", "text": prompt})
+
+             # Add media content (images)
+             if media:
+                 for media_item in media:
+                     if hasattr(media_item, 'file_path') and media_item.file_path:
+                         # Use file path directly
+                         user_content.append({
+                             "type": "image",
+                             "url": str(media_item.file_path)
+                         })
+                     elif hasattr(media_item, 'content') and media_item.content:
+                         # Handle base64 content
+                         if media_item.content_format == 'BASE64':
+                             # Create data URL for base64 content
+                             mime_type = getattr(media_item, 'mime_type', 'image/png')
+                             data_url = f"data:{mime_type};base64,{media_item.content}"
+                             user_content.append({
+                                 "type": "image",
+                                 "url": data_url
+                             })
+
+             # Add user message
+             chat_messages.append({
+                 "role": "user",
+                 "content": user_content
+             })
+
+             # Process messages using the processor
+             inputs = self.processor.apply_chat_template(
+                 chat_messages,
+                 tokenize=True,
+                 add_generation_prompt=True,
+                 return_dict=True,
+                 return_tensors="pt"
+             ).to(self.model_instance.device)
+
+             # Generation parameters
+             generation_kwargs = {
+                 "max_new_tokens": kwargs.get("max_tokens", self.max_output_tokens or 512),
+                 "temperature": kwargs.get("temperature", self.temperature),
+                 "do_sample": True,
+                 "pad_token_id": self.processor.tokenizer.eos_token_id,
+             }
+
+             # Add seed if provided
+             seed_value = kwargs.get("seed", self.seed)
+             if seed_value is not None:
+                 torch.manual_seed(seed_value)
+                 if torch.cuda.is_available():
+                     torch.cuda.manual_seed_all(seed_value)
+
+             # Generate response
+             # For Apple Silicon, move inputs to CPU if MPS causes issues
+             if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                 try:
+                     generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+                 except RuntimeError as e:
+                     if "MPS: Unsupported Border padding mode" in str(e):
+                         self.logger.warning("MPS Border padding mode error detected, falling back to CPU")
+                         # Move model and inputs to CPU
+                         cpu_model = self.model_instance.to('cpu')
+                         cpu_inputs = {k: v.to('cpu') if hasattr(v, 'to') else v for k, v in inputs.items()}
+                         generated_ids = cpu_model.generate(**cpu_inputs, **generation_kwargs)
+                         # Move model back to original device
+                         self.model_instance.to(self.model_instance.device)
+                     else:
+                         raise e
+             else:
+                 generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+
+             # Decode response
+             output_text = self.processor.decode(
+                 generated_ids[0][inputs["input_ids"].shape[1]:],
+                 skip_special_tokens=True
+             )
+
+             # Calculate generation time
+             gen_time = (time.time() - start_time) * 1000
+
+             # Calculate token usage
+             input_tokens = inputs["input_ids"].shape[1]
+             output_tokens = len(generated_ids[0]) - input_tokens
+
+             return GenerateResponse(
+                 content=output_text.strip(),
+                 model=self.model,
+                 finish_reason="stop",
+                 usage={
+                     "input_tokens": input_tokens,
+                     "output_tokens": output_tokens,
+                     "total_tokens": input_tokens + output_tokens,
+                     "prompt_tokens": input_tokens,
+                     "completion_tokens": output_tokens
+                 },
+                 gen_time=gen_time
+             )
+
+         except Exception as e:
+             gen_time = (time.time() - start_time) * 1000 if 'start_time' in locals() else 0.0
+             return GenerateResponse(
+                 content=f"Error in vision model generation: {str(e)}",
+                 model=self.model,
+                 finish_reason="error",
+                 gen_time=gen_time
+             )
+
+     def _patch_deepseek_for_mps(self):
+         """Patch DeepSeek-OCR model to work with MPS instead of CUDA"""
+         import types
+
+         def patched_infer(self, tokenizer, prompt='', image_file='', output_path='', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
+             """Patched infer method that uses MPS instead of CUDA"""
+             import torch
+
+             # Determine the best available device
+             if torch.backends.mps.is_available():
+                 device = torch.device('mps')
+             elif torch.cuda.is_available():
+                 device = torch.device('cuda')
+             else:
+                 device = torch.device('cpu')
+
+             # Call the original infer method but patch tensor.cuda() calls
+             original_cuda = torch.Tensor.cuda
+
+             def patched_cuda(tensor, device=None, non_blocking=False, **kwargs):
+                 """Redirect .cuda() calls to the appropriate device"""
+                 if device == 'mps' or (device is None and torch.backends.mps.is_available()):
+                     return tensor.to('mps', non_blocking=non_blocking)
+                 elif torch.cuda.is_available():
+                     return original_cuda(tensor, device, non_blocking, **kwargs)
+                 else:
+                     return tensor.to('cpu', non_blocking=non_blocking)
+
+             # Temporarily patch the cuda method
+             torch.Tensor.cuda = patched_cuda
+
+             try:
+                 # Move model to the appropriate device first
+                 self.to(device)
+
+                 # Call original infer with device patching
+                 return self._original_infer(tokenizer, prompt, image_file, output_path, base_size, image_size, crop_mode, test_compress, save_results, eval_mode)
+             finally:
+                 # Restore original cuda method
+                 torch.Tensor.cuda = original_cuda
+
+         # Only patch if not already patched
+         if not hasattr(self.model_instance, '_original_infer'):
+             self.model_instance._original_infer = self.model_instance.infer
+             self.model_instance.infer = types.MethodType(patched_infer, self.model_instance)
+
      def _generate_gguf(self,
                         prompt: str,
                         messages: Optional[List[Dict[str, str]]] = None,
@@ -568,6 +1156,7 @@ class HuggingFaceProvider(BaseProvider):
                         tools: Optional[List[Dict[str, Any]]] = None,
                         media: Optional[List['MediaContent']] = None,
                         stream: bool = False,
+                        response_model: Optional[Type[BaseModel]] = None,
                         **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
          """Generate using GGUF backend with llama-cpp-python"""

@@ -663,6 +1252,19 @@ class HuggingFaceProvider(BaseProvider):
          if seed_value is not None:
              generation_kwargs["seed"] = seed_value

+         # Add native structured output support (llama-cpp-python format)
+         # llama-cpp-python supports native structured outputs using the response_format parameter
+         # This provides server-side guaranteed schema compliance
+         if response_model and PYDANTIC_AVAILABLE:
+             json_schema = response_model.model_json_schema()
+             generation_kwargs["response_format"] = {
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": response_model.__name__,
+                     "schema": json_schema
+                 }
+             }
+
          # Handle tools - both native and prompted support
          has_native_tools = False
          if tools:
@@ -858,10 +1460,13 @@ class HuggingFaceProvider(BaseProvider):
          try:
              # Set seed for deterministic generation if provided
              if seed is not None:
-                 import torch
-                 torch.manual_seed(seed)
-                 if torch.cuda.is_available():
-                     torch.cuda.manual_seed_all(seed)
+                 try:
+                     import torch
+                     torch.manual_seed(seed)
+                     if torch.cuda.is_available():
+                         torch.cuda.manual_seed_all(seed)
+                 except ImportError:
+                     pass # Skip seeding if torch not available

              # Track generation time
              start_time = time.time()
@@ -1147,8 +1752,20 @@ class HuggingFaceProvider(BaseProvider):

      @classmethod
      def list_available_models(cls, **kwargs) -> List[str]:
-         """List available HuggingFace models from local cache (excluding MLX models)."""
+         """
+         List available HuggingFace models from local cache (excluding MLX models).
+
+         Args:
+             **kwargs: Optional parameters including:
+                 - input_capabilities: List of ModelInputCapability enums to filter by input capability
+                 - output_capabilities: List of ModelOutputCapability enums to filter by output capability
+
+         Returns:
+             List of model names, optionally filtered by capabilities
+         """
          try:
+             from .model_capabilities import filter_models_by_capabilities
+
              hf_cache = Path.home() / ".cache" / "huggingface" / "hub"
              if not hf_cache.exists():
                  return []
@@ -1164,7 +1781,21 @@ class HuggingFaceProvider(BaseProvider):
                  if "mlx" not in model_name.lower():
                      models.append(model_name)

-             return sorted(models)
+             models = sorted(models)
+
+             # Apply new capability filtering if provided
+             input_capabilities = kwargs.get('input_capabilities')
+             output_capabilities = kwargs.get('output_capabilities')
+
+             if input_capabilities or output_capabilities:
+                 models = filter_models_by_capabilities(
+                     models,
+                     input_capabilities=input_capabilities,
+                     output_capabilities=output_capabilities
+                 )
+
+
+             return models

          except Exception:
              return []
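The final two hunks add capability-based filtering to list_available_models(). A hedged sketch of how it might be called follows, assuming the ModelInputCapability enum is importable from abstractcore.providers.model_capabilities and has an IMAGE member; only the keyword-argument names and the enum class names appear in the diff above.

```python
# Hedged sketch: kwarg names and enum class names come from the docstring above;
# the import location and the IMAGE member name are assumptions.
from abstractcore.providers.huggingface_provider import HuggingFaceProvider
from abstractcore.providers.model_capabilities import ModelInputCapability

# Unfiltered: every cached, non-MLX HuggingFace model, as in 2.5.0.
all_models = HuggingFaceProvider.list_available_models()

# Filtered: only models declared to accept image input.
vision_models = HuggingFaceProvider.list_available_models(
    input_capabilities=[ModelInputCapability.IMAGE],  # assumed member name
)

print(f"{len(vision_models)} of {len(all_models)} cached models accept images")
```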