abstractcore 2.5.0__py3-none-any.whl → 2.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/apps/__main__.py +8 -1
  3. abstractcore/apps/deepsearch.py +644 -0
  4. abstractcore/apps/intent.py +614 -0
  5. abstractcore/architectures/detection.py +250 -4
  6. abstractcore/assets/architecture_formats.json +14 -1
  7. abstractcore/assets/model_capabilities.json +583 -44
  8. abstractcore/compression/__init__.py +29 -0
  9. abstractcore/compression/analytics.py +420 -0
  10. abstractcore/compression/cache.py +250 -0
  11. abstractcore/compression/config.py +279 -0
  12. abstractcore/compression/exceptions.py +30 -0
  13. abstractcore/compression/glyph_processor.py +381 -0
  14. abstractcore/compression/optimizer.py +388 -0
  15. abstractcore/compression/orchestrator.py +380 -0
  16. abstractcore/compression/pil_text_renderer.py +818 -0
  17. abstractcore/compression/quality.py +226 -0
  18. abstractcore/compression/text_formatter.py +666 -0
  19. abstractcore/compression/vision_compressor.py +371 -0
  20. abstractcore/config/main.py +66 -1
  21. abstractcore/config/manager.py +111 -5
  22. abstractcore/core/session.py +105 -5
  23. abstractcore/events/__init__.py +1 -1
  24. abstractcore/media/auto_handler.py +312 -18
  25. abstractcore/media/handlers/local_handler.py +14 -2
  26. abstractcore/media/handlers/openai_handler.py +62 -3
  27. abstractcore/media/processors/__init__.py +11 -1
  28. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  29. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  30. abstractcore/media/processors/image_processor.py +7 -1
  31. abstractcore/media/processors/text_processor.py +18 -3
  32. abstractcore/media/types.py +164 -7
  33. abstractcore/processing/__init__.py +5 -1
  34. abstractcore/processing/basic_deepsearch.py +2173 -0
  35. abstractcore/processing/basic_intent.py +690 -0
  36. abstractcore/providers/__init__.py +18 -0
  37. abstractcore/providers/anthropic_provider.py +29 -2
  38. abstractcore/providers/base.py +279 -6
  39. abstractcore/providers/huggingface_provider.py +658 -27
  40. abstractcore/providers/lmstudio_provider.py +52 -2
  41. abstractcore/providers/mlx_provider.py +103 -4
  42. abstractcore/providers/model_capabilities.py +352 -0
  43. abstractcore/providers/ollama_provider.py +44 -6
  44. abstractcore/providers/openai_provider.py +29 -2
  45. abstractcore/providers/registry.py +91 -19
  46. abstractcore/server/app.py +91 -81
  47. abstractcore/structured/handler.py +161 -1
  48. abstractcore/tools/common_tools.py +98 -3
  49. abstractcore/utils/__init__.py +4 -1
  50. abstractcore/utils/cli.py +114 -1
  51. abstractcore/utils/trace_export.py +287 -0
  52. abstractcore/utils/version.py +1 -1
  53. abstractcore/utils/vlm_token_calculator.py +655 -0
  54. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/METADATA +140 -23
  55. abstractcore-2.5.3.dist-info/RECORD +107 -0
  56. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +4 -0
  57. abstractcore-2.5.0.dist-info/RECORD +0 -86
  58. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  59. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  60. {abstractcore-2.5.0.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
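The largest single change in this release is in abstractcore/providers/huggingface_provider.py (entry 39 above): a new structured_output_method constructor option, a response_model parameter threaded through generation, vision-model loading, and a new default GGUF model. A minimal usage sketch follows, assuming the provider is constructed directly and exposes a public generate() wrapper around the internal dispatch shown in the hunks below; the import path and the generate() call are assumptions, not confirmed public API.

```python
# Hedged sketch only: parameter names come from the diff below; the import path
# and the public generate() method are assumptions about AbstractCore's API.
from pydantic import BaseModel
from abstractcore.providers.huggingface_provider import HuggingFaceProvider


class CityInfo(BaseModel):
    name: str
    country: str


# "auto": use Outlines for transformers models when installed, otherwise the
# prompted fallback; GGUF models always use llama-cpp-python's native
# response_format support (per the comments added in 2.5.3).
provider = HuggingFaceProvider(
    model="unsloth/Qwen3-4B-Instruct-2507-GGUF",  # new default model in 2.5.3
    structured_output_method="auto",
)

response = provider.generate(  # assumed public wrapper around _generate_gguf/_generate_transformers
    "Name one city in France.",
    response_model=CityInfo,
)
print(response.content)  # JSON matching the CityInfo schema
```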
@@ -8,6 +8,25 @@ import json
  from pathlib import Path
  from typing import List, Dict, Any, Optional, Union, Iterator, Type

+ # Import config manager to respect offline-first settings
+ from ..config.manager import get_config_manager
+
+ # Get config instance and set offline environment variables if needed
+ _config = get_config_manager()
+ if _config.is_offline_first():
+     os.environ["TRANSFORMERS_OFFLINE"] = "1"
+     os.environ["HF_DATASETS_OFFLINE"] = "1"
+     os.environ["HF_HUB_OFFLINE"] = "1"
+
+ # Enable MPS fallback for Apple Silicon to handle unsupported operations
+ # This prevents "MPS: Unsupported Border padding mode" errors in vision models
+ try:
+     import torch
+     if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+         os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+ except ImportError:
+     pass # torch not available, skip MPS setup
+
  try:
      from pydantic import BaseModel
      PYDANTIC_AVAILABLE = True
@@ -22,7 +41,7 @@ from ..events import EventType

  # Try to import transformers (standard HuggingFace support)
  try:
-     from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+     from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, pipeline
      import torch
      TRANSFORMERS_AVAILABLE = True
  except ImportError:
@@ -35,16 +54,40 @@ try:
  except ImportError:
      LLAMACPP_AVAILABLE = False

+ # Try to import Outlines (native structured output for transformers models)
+ try:
+     import outlines
+     OUTLINES_AVAILABLE = True
+ except ImportError:
+     OUTLINES_AVAILABLE = False
+
  # We no longer download models - cache-only approach
  # huggingface_hub not required for basic operation


+ def _get_local_model_path(model_name: str) -> Optional[str]:
+     """Get local cache path for a HuggingFace model if it exists."""
+     # Use centralized configuration for cache directory
+     config = _config
+     hf_cache_dir = Path(config.config.cache.huggingface_cache_dir).expanduser()
+
+     model_cache_name = f"models--{model_name.replace('/', '--')}"
+     model_cache_path = hf_cache_dir / "hub" / model_cache_name / "snapshots"
+
+     if model_cache_path.exists():
+         snapshot_dirs = [d for d in model_cache_path.iterdir() if d.is_dir()]
+         if snapshot_dirs:
+             return str(snapshot_dirs[0]) # Return first snapshot
+     return None
+
+
  class HuggingFaceProvider(BaseProvider):
      """HuggingFace provider with dual support for transformers and GGUF models"""

-     def __init__(self, model: str = "Qwen/Qwen3-4B",
+     def __init__(self, model: str = "unsloth/Qwen3-4B-Instruct-2507-GGUF",
                   device: Optional[str] = None,
                   n_gpu_layers: Optional[int] = None,
+                  structured_output_method: str = "auto",
                   **kwargs):

          # Handle legacy context_size parameter with deprecation warning
@@ -61,10 +104,18 @@ class HuggingFaceProvider(BaseProvider):
              kwargs["max_tokens"] = context_size

          super().__init__(model, **kwargs)
+         self.provider = "huggingface"

          # Handle timeout parameter for local models
          self._handle_timeout_parameter(kwargs)

+         # Structured output method: "auto", "native_outlines", "prompted"
+         # auto: Use Outlines if available (for transformers), otherwise prompted (default)
+         # native_outlines: Force Outlines (error if unavailable)
+         # prompted: Always use prompted fallback (fastest for transformers, still 100% success)
+         # Note: GGUF models always use llama-cpp-python native support regardless of this setting
+         self.structured_output_method = structured_output_method
+
          # Initialize tool handler
          self.tool_handler = UniversalToolHandler(model)

@@ -72,9 +123,19 @@ class HuggingFaceProvider(BaseProvider):
          self.n_gpu_layers = n_gpu_layers
          self.model_type = None # Will be "transformers" or "gguf"
          self.device = device
+
+         # Store transformers-specific parameters
+         self.transformers_kwargs = {
+             k: v for k, v in kwargs.items()
+             if k in ['trust_remote_code', 'torch_dtype', 'device_map', 'load_in_8bit', 'load_in_4bit', 'attn_implementation']
+         }
+
+         # Store device preference for custom models
+         self.preferred_device = kwargs.get('device_map', 'auto')

          # Model instances
          self.tokenizer = None
+         self.processor = None # For vision models
          self.model_instance = None
          self.pipeline = None
          self.llm = None # For GGUF models
@@ -111,6 +172,9 @@ class HuggingFaceProvider(BaseProvider):

          if hasattr(self, 'tokenizer') and self.tokenizer is not None:
              self.tokenizer = None
+
+         if hasattr(self, 'processor') and self.processor is not None:
+             self.processor = None

          if hasattr(self, 'model') and hasattr(self, 'model') and self.model is not None:
              # For transformers models, clear the model
@@ -153,6 +217,26 @@ class HuggingFaceProvider(BaseProvider):

          return False

+     def _is_vision_model(self, model: str) -> bool:
+         """Detect if the model is a vision model that requires special handling"""
+         model_lower = model.lower()
+
+         # Known vision models that require AutoModelForImageTextToText
+         vision_models = [
+             'glyph',        # zai-org/Glyph
+             'glm-4.1v',     # GLM-4.1V variants
+             'glm4v',        # GLM4V architecture
+             'qwen-vl',      # Qwen-VL models
+             'qwen2-vl',     # Qwen2-VL models
+             'qwen2.5-vl',   # Qwen2.5-VL models
+             'llava',        # LLaVA models
+             'instructblip', # InstructBLIP models
+             'blip2',        # BLIP2 models
+             'flamingo',     # Flamingo models
+         ]
+
+         return any(vision_keyword in model_lower for vision_keyword in vision_models)
+
      def _setup_device_transformers(self):
          """Setup device for transformers models"""
          if not TRANSFORMERS_AVAILABLE:
@@ -200,24 +284,65 @@ class HuggingFaceProvider(BaseProvider):
      def _load_transformers_model(self):
          """Load standard HuggingFace transformers model"""
          try:
-             self.tokenizer = AutoTokenizer.from_pretrained(self.model)
-             self.model_instance = AutoModelForCausalLM.from_pretrained(self.model)
+             # Check if this is a vision model that requires special handling
+             if self._is_vision_model(self.model):
+                 return self._load_vision_model()
+
+             # Load tokenizer with transformers-specific parameters
+             tokenizer_kwargs = {k: v for k, v in self.transformers_kwargs.items()
+                                 if k in ['trust_remote_code']}
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 tokenizer_kwargs['local_files_only'] = True
+             self.tokenizer = AutoTokenizer.from_pretrained(self.model, **tokenizer_kwargs)
+
+             # Load model with all transformers-specific parameters
+             # Try AutoModelForCausalLM first, fall back to AutoModel for custom models
+             model_kwargs = self.transformers_kwargs.copy()
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 model_kwargs['local_files_only'] = True
+
+             try:
+                 self.model_instance = AutoModelForCausalLM.from_pretrained(self.model, **model_kwargs)
+             except ValueError as e:
+                 if "Unrecognized configuration class" in str(e) or "glm4v" in str(e).lower():
+                     # Fall back to AutoModel for custom models like DeepSeek-OCR
+                     self.model_instance = AutoModel.from_pretrained(self.model, **model_kwargs)
+                 else:
+                     raise

-             # Move to device
-             if self.device in ["cuda", "mps"]:
+             # Move to device (only if not using device_map)
+             if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
                  self.model_instance = self.model_instance.to(self.device)

-             # Create pipeline
+             # Create pipeline - handle custom models that don't support text-generation
              device_arg = 0 if self.device == "cuda" else -1
              if self.device == "mps":
                  device_arg = -1

-             self.pipeline = pipeline(
-                 "text-generation",
-                 model=self.model_instance,
-                 tokenizer=self.tokenizer,
-                 device=device_arg
-             )
+             try:
+                 # Don't pass device argument if using device_map (accelerate)
+                 if 'device_map' in self.transformers_kwargs:
+                     self.pipeline = pipeline(
+                         "text-generation",
+                         model=self.model_instance,
+                         tokenizer=self.tokenizer
+                     )
+                 else:
+                     self.pipeline = pipeline(
+                         "text-generation",
+                         model=self.model_instance,
+                         tokenizer=self.tokenizer,
+                         device=device_arg
+                     )
+             except ValueError as e:
+                 if "not supported for text-generation" in str(e) or "accelerate" in str(e):
+                     # For custom models like DeepSeek-OCR, skip pipeline creation
+                     # We'll handle generation directly through the model
+                     self.pipeline = None
+                 else:
+                     raise

          except Exception as e:
              error_str = str(e).lower()
@@ -229,6 +354,96 @@ class HuggingFaceProvider(BaseProvider):
              else:
                  raise RuntimeError(f"Failed to load HuggingFace model {self.model}: {str(e)}")

+     def _load_vision_model(self):
+         """Load vision model using AutoModelForImageTextToText and AutoProcessor"""
+         try:
+             # Suppress progress bars during model loading unless in debug mode
+             import os
+             from transformers.utils import logging as transformers_logging
+
+             if not self.debug:
+                 # Disable transformers progress bars
+                 os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+                 transformers_logging.set_verbosity_error()
+                 # Disable tqdm progress bars
+                 os.environ['DISABLE_TQDM'] = '1'
+
+             # Load processor for vision models (handles both text and images)
+             processor_kwargs = {k: v for k, v in self.transformers_kwargs.items()
+                                 if k in ['trust_remote_code']}
+             # Enable trust_remote_code for custom architectures like GLM4V
+             processor_kwargs['trust_remote_code'] = True
+             # Set use_fast=True to avoid the slow processor warning
+             processor_kwargs['use_fast'] = True
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 processor_kwargs['local_files_only'] = True
+
+             # Use local cache path if offline mode is enabled and model is cached
+             model_path = self.model
+             if _config.should_force_local_files_only():
+                 local_path = _get_local_model_path(self.model)
+                 if local_path:
+                     model_path = local_path
+                     processor_kwargs.pop('local_files_only', None) # Remove since we're using local path
+                     self.logger.debug(f"Loading processor from local cache: {local_path}")
+
+             self.processor = AutoProcessor.from_pretrained(model_path, **processor_kwargs)
+
+             # Load vision model using AutoModelForImageTextToText with trust_remote_code
+             vision_kwargs = self.transformers_kwargs.copy()
+             vision_kwargs['trust_remote_code'] = True
+             # Respect offline-first configuration
+             if _config.should_force_local_files_only():
+                 vision_kwargs['local_files_only'] = True
+
+             # Use local cache path if offline mode is enabled and model is cached
+             model_path = self.model
+             if _config.should_force_local_files_only():
+                 local_path = _get_local_model_path(self.model)
+                 if local_path:
+                     model_path = local_path
+                     vision_kwargs.pop('local_files_only', None) # Remove since we're using local path
+                     self.logger.debug(f"Loading model from local cache: {local_path}")
+
+             self.model_instance = AutoModelForImageTextToText.from_pretrained(model_path, **vision_kwargs)
+
+             # Restore logging levels if they were suppressed
+             if not self.debug:
+                 # Restore transformers logging
+                 transformers_logging.set_verbosity_warning()
+                 # Remove tqdm suppression
+                 if 'DISABLE_TQDM' in os.environ:
+                     del os.environ['DISABLE_TQDM']
+
+             # Move to device (only if not using device_map)
+             if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
+                 self.model_instance = self.model_instance.to(self.device)
+
+             # For vision models, we don't use the standard pipeline
+             self.pipeline = None
+
+             self.logger.info(f"Successfully loaded vision model {self.model} using AutoModelForImageTextToText")
+
+         except Exception as e:
+             error_str = str(e).lower()
+
+             # Check for transformers version issues
+             if 'glm4v' in error_str and 'does not recognize this architecture' in error_str:
+                 import transformers
+                 current_version = transformers.__version__
+                 raise RuntimeError(
+                     f"GLM4V architecture requires transformers>=4.57.1, but you have {current_version}. "
+                     f"Please upgrade: pip install transformers>=4.57.1"
+                 )
+             elif ('not found' in error_str or 'does not exist' in error_str or
+                   'not a valid model identifier' in error_str):
+                 available_models = self.list_available_models()
+                 error_message = format_model_error("HuggingFace", self.model, available_models)
+                 raise ModelNotFoundError(error_message)
+             else:
+                 raise RuntimeError(f"Failed to load HuggingFace vision model {self.model}: {str(e)}")
+
      def _find_gguf_in_cache(self, model_name: str) -> Optional[str]:
          """Find GGUF model in HuggingFace cache (cache-only, no downloading)"""

@@ -481,9 +696,9 @@ class HuggingFaceProvider(BaseProvider):
          """Generate response using appropriate backend"""

          if self.model_type == "gguf":
-             return self._generate_gguf(prompt, messages, system_prompt, tools, media, stream, **kwargs)
+             return self._generate_gguf(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
          else:
-             return self._generate_transformers(prompt, messages, system_prompt, tools, media, stream, **kwargs)
+             return self._generate_transformers(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)

      def _generate_transformers(self,
                                 prompt: str,
@@ -492,15 +707,83 @@ class HuggingFaceProvider(BaseProvider):
                                 tools: Optional[List[Dict[str, Any]]] = None,
                                 media: Optional[List['MediaContent']] = None,
                                 stream: bool = False,
+                                response_model: Optional[Type[BaseModel]] = None,
                                 **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
-         """Generate using transformers backend (original implementation)"""
+         """Generate using transformers backend with optional Outlines native structured output"""

          if not self.pipeline:
-             return GenerateResponse(
-                 content="Error: Transformers model not loaded",
-                 model=self.model,
-                 finish_reason="error"
-             )
+             # Handle vision models that use processor instead of pipeline
+             if self.processor and hasattr(self.model_instance, 'generate'):
+                 return self._generate_vision_model(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
+             # Handle custom models like DeepSeek-OCR that don't support standard pipelines
+             elif hasattr(self.model_instance, 'infer'):
+                 return self._generate_custom_model(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
+             else:
+                 return GenerateResponse(
+                     content="Error: Transformers model not loaded or doesn't support generation",
+                     model=self.model,
+                     finish_reason="error"
+                 )
+
+         # Native structured output via Outlines (if configured and available)
+         should_use_outlines = (
+             response_model and
+             PYDANTIC_AVAILABLE and
+             not stream and
+             self.structured_output_method != "prompted" # Skip if explicitly prompted
+         )
+
+         if should_use_outlines:
+             # Check if Outlines is required but unavailable
+             if self.structured_output_method == "native_outlines" and not OUTLINES_AVAILABLE:
+                 return GenerateResponse(
+                     content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install abstractcore[huggingface]",
+                     model=self.model,
+                     finish_reason="error"
+                 )
+
+             # Try Outlines if available (auto or native_outlines mode)
+             if OUTLINES_AVAILABLE:
+                 try:
+                     # Cache Outlines model wrapper to avoid re-initialization
+                     if not hasattr(self, '_outlines_model') or self._outlines_model is None:
+                         self.logger.debug("Creating Outlines model wrapper for native structured output")
+                         self._outlines_model = outlines.from_transformers(
+                             self.model_instance,
+                             self.tokenizer
+                         )
+
+                     # Build input text (same as normal generation)
+                     input_text = self._build_input_text_transformers(prompt, messages, system_prompt, tools)
+
+                     # Create constrained generator with JSON schema
+                     self.logger.debug(f"Using Outlines native structured output for {response_model.__name__}")
+                     generator = self._outlines_model(
+                         input_text,
+                         outlines.json_schema(response_model),
+                         max_tokens=kwargs.get("max_tokens", self.max_tokens or 512)
+                     )
+
+                     # Validate and return
+                     validated_obj = response_model.model_validate(generator)
+
+                     return GenerateResponse(
+                         content=validated_obj.model_dump_json(),
+                         model=self.model,
+                         finish_reason="stop",
+                         validated_object=validated_obj
+                     )
+                 except Exception as e:
+                     # If native_outlines was explicitly requested, don't fall back
+                     if self.structured_output_method == "native_outlines":
+                         return GenerateResponse(
+                             content=f"Error: Outlines native structured output failed: {str(e)}",
+                             model=self.model,
+                             finish_reason="error"
+                         )
+                     # Otherwise fall back to prompted approach
+                     self.logger.debug(f"Outlines generation failed, falling back to prompted: {e}")
+                     # Continue with normal generation below

          # Build input text with tool and media support
          # Handle media content first if present
@@ -561,6 +844,311 @@ class HuggingFaceProvider(BaseProvider):
                  finish_reason="error"
              )

+     def _generate_custom_model(self,
+                                prompt: str,
+                                messages: Optional[List[Dict[str, str]]] = None,
+                                system_prompt: Optional[str] = None,
+                                tools: Optional[List[Dict[str, Any]]] = None,
+                                media: Optional[List['MediaContent']] = None,
+                                stream: bool = False,
+                                response_model: Optional[Type[BaseModel]] = None,
+                                **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+         """Generate using custom model methods (e.g., DeepSeek-OCR's infer method)"""
+
+         import time
+         import tempfile
+         import os
+         start_time = time.time()
+
+         try:
+             # Handle media content for vision models like DeepSeek-OCR
+             if media and len(media) > 0:
+                 # Use the first image for OCR
+                 media_item = media[0]
+
+                 # DeepSeek-OCR expects image file path
+                 if hasattr(media_item, 'file_path') and media_item.file_path:
+                     image_file = str(media_item.file_path)
+                 else:
+                     # If no file path, save media content to temp file
+                     from PIL import Image
+
+                     if hasattr(media_item, 'content') and media_item.content:
+                         # Handle base64 content
+                         if media_item.content_format == 'BASE64':
+                             import base64
+                             image_data = base64.b64decode(media_item.content)
+                             temp_file = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
+                             temp_file.write(image_data)
+                             temp_file.close()
+                             image_file = temp_file.name
+                         else:
+                             return GenerateResponse(
+                                 content="Error: Unsupported media format for DeepSeek-OCR",
+                                 model=self.model,
+                                 finish_reason="error"
+                             )
+                     else:
+                         return GenerateResponse(
+                             content="Error: No valid image content found",
+                             model=self.model,
+                             finish_reason="error"
+                         )
+
+                 # Use DeepSeek-OCR's infer method
+                 try:
+                     # Create temporary output directory for DeepSeek-OCR
+                     temp_output_dir = tempfile.mkdtemp()
+
+                     # Patch DeepSeek-OCR for MPS/CPU compatibility if needed
+                     if self.device == "mps" or (self.device is None and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()):
+                         self._patch_deepseek_for_mps()
+
+                     result = self.model_instance.infer(
+                         self.tokenizer,
+                         prompt=prompt,
+                         image_file=image_file,
+                         output_path=temp_output_dir, # DeepSeek-OCR requires output path
+                         base_size=1024,
+                         image_size=640,
+                         crop_mode=True,
+                         save_results=False,
+                         test_compress=False
+                     )
+
+                     # Clean up temp output directory
+                     import shutil
+                     shutil.rmtree(temp_output_dir, ignore_errors=True)
+
+                     # Clean up temp file if created
+                     if 'temp_file' in locals() and os.path.exists(image_file):
+                         os.unlink(image_file)
+
+                     # Calculate generation time
+                     gen_time = (time.time() - start_time) * 1000
+
+                     return GenerateResponse(
+                         content=result if isinstance(result, str) else str(result),
+                         model=self.model,
+                         finish_reason="stop",
+                         input_tokens=len(prompt.split()), # Rough estimate
+                         output_tokens=len(str(result).split()) if result else 0,
+                         gen_time=gen_time
+                     )
+
+                 except Exception as e:
+                     return GenerateResponse(
+                         content=f"Error during DeepSeek-OCR inference: {str(e)}",
+                         model=self.model,
+                         finish_reason="error"
+                     )
+             else:
+                 return GenerateResponse(
+                     content="Error: DeepSeek-OCR requires image input",
+                     model=self.model,
+                     finish_reason="error"
+                 )
+
+         except Exception as e:
+             return GenerateResponse(
+                 content=f"Error in custom model generation: {str(e)}",
+                 model=self.model,
+                 finish_reason="error"
+             )
+
+     def _generate_vision_model(self,
+                                prompt: str,
+                                messages: Optional[List[Dict[str, str]]] = None,
+                                system_prompt: Optional[str] = None,
+                                tools: Optional[List[Dict[str, Any]]] = None,
+                                media: Optional[List['MediaContent']] = None,
+                                stream: bool = False,
+                                response_model: Optional[Type[BaseModel]] = None,
+                                **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+         """Generate using vision model (Glyph, GLM-4.1V, etc.)"""
+
+         import time
+         start_time = time.time()
+
+         # Import torch safely
+         try:
+             import torch
+         except ImportError:
+             return GenerateResponse(
+                 content="Error: PyTorch not available for vision model generation",
+                 model=self.model,
+                 finish_reason="error",
+                 gen_time=0.0
+             )
+
+         try:
+             # Build messages for vision model
+             chat_messages = []
+
+             if system_prompt:
+                 chat_messages.append({"role": "system", "content": system_prompt})
+
+             if messages:
+                 chat_messages.extend(messages)
+
+             # Build user message with media content
+             user_content = []
+
+             # Add text content
+             if prompt:
+                 user_content.append({"type": "text", "text": prompt})
+
+             # Add media content (images)
+             if media:
+                 for media_item in media:
+                     if hasattr(media_item, 'file_path') and media_item.file_path:
+                         # Use file path directly
+                         user_content.append({
+                             "type": "image",
+                             "url": str(media_item.file_path)
+                         })
+                     elif hasattr(media_item, 'content') and media_item.content:
+                         # Handle base64 content
+                         if media_item.content_format == 'BASE64':
+                             # Create data URL for base64 content
+                             mime_type = getattr(media_item, 'mime_type', 'image/png')
+                             data_url = f"data:{mime_type};base64,{media_item.content}"
+                             user_content.append({
+                                 "type": "image",
+                                 "url": data_url
+                             })
+
+             # Add user message
+             chat_messages.append({
+                 "role": "user",
+                 "content": user_content
+             })
+
+             # Process messages using the processor
+             inputs = self.processor.apply_chat_template(
+                 chat_messages,
+                 tokenize=True,
+                 add_generation_prompt=True,
+                 return_dict=True,
+                 return_tensors="pt"
+             ).to(self.model_instance.device)
+
+             # Generation parameters
+             generation_kwargs = {
+                 "max_new_tokens": kwargs.get("max_tokens", self.max_output_tokens or 512),
+                 "temperature": kwargs.get("temperature", self.temperature),
+                 "do_sample": True,
+                 "pad_token_id": self.processor.tokenizer.eos_token_id,
+             }
+
+             # Add seed if provided
+             seed_value = kwargs.get("seed", self.seed)
+             if seed_value is not None:
+                 torch.manual_seed(seed_value)
+                 if torch.cuda.is_available():
+                     torch.cuda.manual_seed_all(seed_value)
+
+             # Generate response
+             # For Apple Silicon, move inputs to CPU if MPS causes issues
+             if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                 try:
+                     generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+                 except RuntimeError as e:
+                     if "MPS: Unsupported Border padding mode" in str(e):
+                         self.logger.warning("MPS Border padding mode error detected, falling back to CPU")
+                         # Move model and inputs to CPU
+                         cpu_model = self.model_instance.to('cpu')
+                         cpu_inputs = {k: v.to('cpu') if hasattr(v, 'to') else v for k, v in inputs.items()}
+                         generated_ids = cpu_model.generate(**cpu_inputs, **generation_kwargs)
+                         # Move model back to original device
+                         self.model_instance.to(self.model_instance.device)
+                     else:
+                         raise e
+             else:
+                 generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+
+             # Decode response
+             output_text = self.processor.decode(
+                 generated_ids[0][inputs["input_ids"].shape[1]:],
+                 skip_special_tokens=True
+             )
+
+             # Calculate generation time
+             gen_time = (time.time() - start_time) * 1000
+
+             # Calculate token usage
+             input_tokens = inputs["input_ids"].shape[1]
+             output_tokens = len(generated_ids[0]) - input_tokens
+
+             return GenerateResponse(
+                 content=output_text.strip(),
+                 model=self.model,
+                 finish_reason="stop",
+                 usage={
+                     "input_tokens": input_tokens,
+                     "output_tokens": output_tokens,
+                     "total_tokens": input_tokens + output_tokens,
+                     "prompt_tokens": input_tokens,
+                     "completion_tokens": output_tokens
+                 },
+                 gen_time=gen_time
+             )
+
+         except Exception as e:
+             gen_time = (time.time() - start_time) * 1000 if 'start_time' in locals() else 0.0
+             return GenerateResponse(
+                 content=f"Error in vision model generation: {str(e)}",
+                 model=self.model,
+                 finish_reason="error",
+                 gen_time=gen_time
+             )
+
+     def _patch_deepseek_for_mps(self):
+         """Patch DeepSeek-OCR model to work with MPS instead of CUDA"""
+         import types
+
+         def patched_infer(self, tokenizer, prompt='', image_file='', output_path='', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
+             """Patched infer method that uses MPS instead of CUDA"""
+             import torch
+
+             # Determine the best available device
+             if torch.backends.mps.is_available():
+                 device = torch.device('mps')
+             elif torch.cuda.is_available():
+                 device = torch.device('cuda')
+             else:
+                 device = torch.device('cpu')
+
+             # Call the original infer method but patch tensor.cuda() calls
+             original_cuda = torch.Tensor.cuda
+
+             def patched_cuda(tensor, device=None, non_blocking=False, **kwargs):
+                 """Redirect .cuda() calls to the appropriate device"""
+                 if device == 'mps' or (device is None and torch.backends.mps.is_available()):
+                     return tensor.to('mps', non_blocking=non_blocking)
+                 elif torch.cuda.is_available():
+                     return original_cuda(tensor, device, non_blocking, **kwargs)
+                 else:
+                     return tensor.to('cpu', non_blocking=non_blocking)
+
+             # Temporarily patch the cuda method
+             torch.Tensor.cuda = patched_cuda
+
+             try:
+                 # Move model to the appropriate device first
+                 self.to(device)
+
+                 # Call original infer with device patching
+                 return self._original_infer(tokenizer, prompt, image_file, output_path, base_size, image_size, crop_mode, test_compress, save_results, eval_mode)
+             finally:
+                 # Restore original cuda method
+                 torch.Tensor.cuda = original_cuda
+
+         # Only patch if not already patched
+         if not hasattr(self.model_instance, '_original_infer'):
+             self.model_instance._original_infer = self.model_instance.infer
+             self.model_instance.infer = types.MethodType(patched_infer, self.model_instance)
+
      def _generate_gguf(self,
                         prompt: str,
                         messages: Optional[List[Dict[str, str]]] = None,
@@ -568,6 +1156,7 @@ class HuggingFaceProvider(BaseProvider):
                         tools: Optional[List[Dict[str, Any]]] = None,
                         media: Optional[List['MediaContent']] = None,
                         stream: bool = False,
+                        response_model: Optional[Type[BaseModel]] = None,
                         **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
          """Generate using GGUF backend with llama-cpp-python"""

@@ -663,6 +1252,19 @@ class HuggingFaceProvider(BaseProvider):
          if seed_value is not None:
              generation_kwargs["seed"] = seed_value

+         # Add native structured output support (llama-cpp-python format)
+         # llama-cpp-python supports native structured outputs using the response_format parameter
+         # This provides server-side guaranteed schema compliance
+         if response_model and PYDANTIC_AVAILABLE:
+             json_schema = response_model.model_json_schema()
+             generation_kwargs["response_format"] = {
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": response_model.__name__,
+                     "schema": json_schema
+                 }
+             }
+
          # Handle tools - both native and prompted support
          has_native_tools = False
          if tools:
@@ -858,10 +1460,13 @@ class HuggingFaceProvider(BaseProvider):
          try:
              # Set seed for deterministic generation if provided
              if seed is not None:
-                 import torch
-                 torch.manual_seed(seed)
-                 if torch.cuda.is_available():
-                     torch.cuda.manual_seed_all(seed)
+                 try:
+                     import torch
+                     torch.manual_seed(seed)
+                     if torch.cuda.is_available():
+                         torch.cuda.manual_seed_all(seed)
+                 except ImportError:
+                     pass # Skip seeding if torch not available

              # Track generation time
              start_time = time.time()
@@ -1147,8 +1752,20 @@ class HuggingFaceProvider(BaseProvider):

      @classmethod
      def list_available_models(cls, **kwargs) -> List[str]:
-         """List available HuggingFace models from local cache (excluding MLX models)."""
+         """
+         List available HuggingFace models from local cache (excluding MLX models).
+
+         Args:
+             **kwargs: Optional parameters including:
+                 - input_capabilities: List of ModelInputCapability enums to filter by input capability
+                 - output_capabilities: List of ModelOutputCapability enums to filter by output capability
+
+         Returns:
+             List of model names, optionally filtered by capabilities
+         """
          try:
+             from .model_capabilities import filter_models_by_capabilities
+
              hf_cache = Path.home() / ".cache" / "huggingface" / "hub"
              if not hf_cache.exists():
                  return []
@@ -1164,7 +1781,21 @@ class HuggingFaceProvider(BaseProvider):
                  if "mlx" not in model_name.lower():
                      models.append(model_name)

-             return sorted(models)
+             models = sorted(models)
+
+             # Apply new capability filtering if provided
+             input_capabilities = kwargs.get('input_capabilities')
+             output_capabilities = kwargs.get('output_capabilities')
+
+             if input_capabilities or output_capabilities:
+                 models = filter_models_by_capabilities(
+                     models,
+                     input_capabilities=input_capabilities,
+                     output_capabilities=output_capabilities
+                 )
+
+
+             return models

          except Exception:
              return []
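The final two hunks add capability-based filtering to list_available_models(). A hedged sketch of how it might be called follows, assuming the ModelInputCapability enum is importable from abstractcore.providers.model_capabilities and has an IMAGE member; only the keyword-argument names and the enum class names appear in the diff above.

```python
# Hedged sketch: kwarg names and enum class names come from the docstring above;
# the import location and the IMAGE member name are assumptions.
from abstractcore.providers.huggingface_provider import HuggingFaceProvider
from abstractcore.providers.model_capabilities import ModelInputCapability

# Unfiltered: every cached, non-MLX HuggingFace model, as in 2.5.0.
all_models = HuggingFaceProvider.list_available_models()

# Filtered: only models declared to accept image input.
vision_models = HuggingFaceProvider.list_available_models(
    input_capabilities=[ModelInputCapability.IMAGE],  # assumed member name
)

print(f"{len(vision_models)} of {len(all_models)} cached models accept images")
```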