abstractcore-2.5.2-py3-none-any.whl → abstractcore-2.5.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (50)
  1. abstractcore/__init__.py +12 -0
  2. abstractcore/architectures/detection.py +250 -4
  3. abstractcore/assets/architecture_formats.json +14 -1
  4. abstractcore/assets/model_capabilities.json +533 -10
  5. abstractcore/compression/__init__.py +29 -0
  6. abstractcore/compression/analytics.py +420 -0
  7. abstractcore/compression/cache.py +250 -0
  8. abstractcore/compression/config.py +279 -0
  9. abstractcore/compression/exceptions.py +30 -0
  10. abstractcore/compression/glyph_processor.py +381 -0
  11. abstractcore/compression/optimizer.py +388 -0
  12. abstractcore/compression/orchestrator.py +380 -0
  13. abstractcore/compression/pil_text_renderer.py +818 -0
  14. abstractcore/compression/quality.py +226 -0
  15. abstractcore/compression/text_formatter.py +666 -0
  16. abstractcore/compression/vision_compressor.py +371 -0
  17. abstractcore/config/main.py +64 -0
  18. abstractcore/config/manager.py +100 -5
  19. abstractcore/core/session.py +61 -6
  20. abstractcore/events/__init__.py +1 -1
  21. abstractcore/media/auto_handler.py +312 -18
  22. abstractcore/media/handlers/local_handler.py +14 -2
  23. abstractcore/media/handlers/openai_handler.py +62 -3
  24. abstractcore/media/processors/__init__.py +11 -1
  25. abstractcore/media/processors/direct_pdf_processor.py +210 -0
  26. abstractcore/media/processors/glyph_pdf_processor.py +227 -0
  27. abstractcore/media/processors/image_processor.py +7 -1
  28. abstractcore/media/processors/text_processor.py +18 -3
  29. abstractcore/media/types.py +164 -7
  30. abstractcore/providers/__init__.py +18 -0
  31. abstractcore/providers/anthropic_provider.py +28 -2
  32. abstractcore/providers/base.py +278 -6
  33. abstractcore/providers/huggingface_provider.py +563 -23
  34. abstractcore/providers/lmstudio_provider.py +38 -2
  35. abstractcore/providers/mlx_provider.py +27 -2
  36. abstractcore/providers/model_capabilities.py +352 -0
  37. abstractcore/providers/ollama_provider.py +38 -4
  38. abstractcore/providers/openai_provider.py +28 -2
  39. abstractcore/providers/registry.py +85 -13
  40. abstractcore/server/app.py +91 -81
  41. abstractcore/utils/__init__.py +4 -1
  42. abstractcore/utils/trace_export.py +287 -0
  43. abstractcore/utils/version.py +1 -1
  44. abstractcore/utils/vlm_token_calculator.py +655 -0
  45. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/METADATA +107 -6
  46. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/RECORD +50 -33
  47. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/WHEEL +0 -0
  48. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/entry_points.txt +0 -0
  49. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/licenses/LICENSE +0 -0
  50. {abstractcore-2.5.2.dist-info → abstractcore-2.5.3.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,25 @@ import json
 from pathlib import Path
 from typing import List, Dict, Any, Optional, Union, Iterator, Type
 
+# Import config manager to respect offline-first settings
+from ..config.manager import get_config_manager
+
+# Get config instance and set offline environment variables if needed
+_config = get_config_manager()
+if _config.is_offline_first():
+    os.environ["TRANSFORMERS_OFFLINE"] = "1"
+    os.environ["HF_DATASETS_OFFLINE"] = "1"
+    os.environ["HF_HUB_OFFLINE"] = "1"
+
+# Enable MPS fallback for Apple Silicon to handle unsupported operations
+# This prevents "MPS: Unsupported Border padding mode" errors in vision models
+try:
+    import torch
+    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+except ImportError:
+    pass # torch not available, skip MPS setup
+
 try:
     from pydantic import BaseModel
     PYDANTIC_AVAILABLE = True
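Note: the offline switches above only take effect if they are set before transformers or huggingface_hub is imported, which is why this block runs at module import time. A minimal standalone sketch of the same mechanism (the get_config_manager call is specific to AbstractCore; the environment variables themselves are standard Hugging Face settings):

    import os

    # Must be set before importing transformers/huggingface_hub:
    # with these flags, the libraries resolve models from the local cache only.
    os.environ["TRANSFORMERS_OFFLINE"] = "1"
    os.environ["HF_DATASETS_OFFLINE"] = "1"
    os.environ["HF_HUB_OFFLINE"] = "1"

    from transformers import AutoTokenizer

    # Succeeds only if "gpt2" is already in the local cache; otherwise it raises
    # instead of attempting a download.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")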
@@ -22,7 +41,7 @@ from ..events import EventType
 
 # Try to import transformers (standard HuggingFace support)
 try:
-    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+    from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, pipeline
     import torch
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
@@ -46,6 +65,22 @@ except ImportError:
     # huggingface_hub not required for basic operation
 
 
+def _get_local_model_path(model_name: str) -> Optional[str]:
+    """Get local cache path for a HuggingFace model if it exists."""
+    # Use centralized configuration for cache directory
+    config = _config
+    hf_cache_dir = Path(config.config.cache.huggingface_cache_dir).expanduser()
+
+    model_cache_name = f"models--{model_name.replace('/', '--')}"
+    model_cache_path = hf_cache_dir / "hub" / model_cache_name / "snapshots"
+
+    if model_cache_path.exists():
+        snapshot_dirs = [d for d in model_cache_path.iterdir() if d.is_dir()]
+        if snapshot_dirs:
+            return str(snapshot_dirs[0]) # Return first snapshot
+    return None
+
+
 class HuggingFaceProvider(BaseProvider):
     """HuggingFace provider with dual support for transformers and GGUF models"""
 
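Note: _get_local_model_path assumes the standard Hugging Face hub cache layout, where a repo id maps to a models--<org>--<name> directory containing one subdirectory per downloaded snapshot. A short illustration of the path it resolves, using zai-org/Glyph (a model referenced later in this diff) as the example:

    from pathlib import Path

    # "zai-org/Glyph" -> ~/.cache/huggingface/hub/models--zai-org--Glyph/snapshots/<revision>/
    snapshots = Path("~/.cache/huggingface/hub").expanduser() / "models--zai-org--Glyph" / "snapshots"
    snapshot_dirs = [d for d in snapshots.iterdir() if d.is_dir()] if snapshots.exists() else []
    print(snapshot_dirs[0] if snapshot_dirs else "not cached locally")

The helper returns the first snapshot directory it finds; if several revisions are cached, which one is returned depends on directory iteration order.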
@@ -88,9 +123,19 @@ class HuggingFaceProvider(BaseProvider):
         self.n_gpu_layers = n_gpu_layers
         self.model_type = None # Will be "transformers" or "gguf"
         self.device = device
+
+        # Store transformers-specific parameters
+        self.transformers_kwargs = {
+            k: v for k, v in kwargs.items()
+            if k in ['trust_remote_code', 'torch_dtype', 'device_map', 'load_in_8bit', 'load_in_4bit', 'attn_implementation']
+        }
+
+        # Store device preference for custom models
+        self.preferred_device = kwargs.get('device_map', 'auto')
 
         # Model instances
         self.tokenizer = None
+        self.processor = None # For vision models
         self.model_instance = None
         self.pipeline = None
         self.llm = None # For GGUF models
@@ -127,6 +172,9 @@ class HuggingFaceProvider(BaseProvider):
 
         if hasattr(self, 'tokenizer') and self.tokenizer is not None:
             self.tokenizer = None
+
+        if hasattr(self, 'processor') and self.processor is not None:
+            self.processor = None
 
         if hasattr(self, 'model') and hasattr(self, 'model') and self.model is not None:
             # For transformers models, clear the model
@@ -169,6 +217,26 @@ class HuggingFaceProvider(BaseProvider):
 
         return False
 
+    def _is_vision_model(self, model: str) -> bool:
+        """Detect if the model is a vision model that requires special handling"""
+        model_lower = model.lower()
+
+        # Known vision models that require AutoModelForImageTextToText
+        vision_models = [
+            'glyph', # zai-org/Glyph
+            'glm-4.1v', # GLM-4.1V variants
+            'glm4v', # GLM4V architecture
+            'qwen-vl', # Qwen-VL models
+            'qwen2-vl', # Qwen2-VL models
+            'qwen2.5-vl', # Qwen2.5-VL models
+            'llava', # LLaVA models
+            'instructblip', # InstructBLIP models
+            'blip2', # BLIP2 models
+            'flamingo', # Flamingo models
+        ]
+
+        return any(vision_keyword in model_lower for vision_keyword in vision_models)
+
     def _setup_device_transformers(self):
         """Setup device for transformers models"""
         if not TRANSFORMERS_AVAILABLE:
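Note: detection is a case-insensitive substring match on the repo id, so any model name containing one of the keywords is routed to the vision loader. The same check, restated standalone with the keyword list copied from the method above (repo ids are illustrative):

    vision_keywords = ['glyph', 'glm-4.1v', 'glm4v', 'qwen-vl', 'qwen2-vl',
                       'qwen2.5-vl', 'llava', 'instructblip', 'blip2', 'flamingo']

    def is_vision_model(model: str) -> bool:
        return any(keyword in model.lower() for keyword in vision_keywords)

    assert is_vision_model("zai-org/Glyph")
    assert is_vision_model("Qwen/Qwen2.5-VL-7B-Instruct")
    assert not is_vision_model("mistralai/Mistral-7B-Instruct-v0.3")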
@@ -216,24 +284,65 @@ class HuggingFaceProvider(BaseProvider):
     def _load_transformers_model(self):
         """Load standard HuggingFace transformers model"""
         try:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model)
-            self.model_instance = AutoModelForCausalLM.from_pretrained(self.model)
+            # Check if this is a vision model that requires special handling
+            if self._is_vision_model(self.model):
+                return self._load_vision_model()
+
+            # Load tokenizer with transformers-specific parameters
+            tokenizer_kwargs = {k: v for k, v in self.transformers_kwargs.items()
+                                if k in ['trust_remote_code']}
+            # Respect offline-first configuration
+            if _config.should_force_local_files_only():
+                tokenizer_kwargs['local_files_only'] = True
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model, **tokenizer_kwargs)
+
+            # Load model with all transformers-specific parameters
+            # Try AutoModelForCausalLM first, fall back to AutoModel for custom models
+            model_kwargs = self.transformers_kwargs.copy()
+            # Respect offline-first configuration
+            if _config.should_force_local_files_only():
+                model_kwargs['local_files_only'] = True
+
+            try:
+                self.model_instance = AutoModelForCausalLM.from_pretrained(self.model, **model_kwargs)
+            except ValueError as e:
+                if "Unrecognized configuration class" in str(e) or "glm4v" in str(e).lower():
+                    # Fall back to AutoModel for custom models like DeepSeek-OCR
+                    self.model_instance = AutoModel.from_pretrained(self.model, **model_kwargs)
+                else:
+                    raise
 
-            # Move to device
-            if self.device in ["cuda", "mps"]:
+            # Move to device (only if not using device_map)
+            if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
                 self.model_instance = self.model_instance.to(self.device)
 
-            # Create pipeline
+            # Create pipeline - handle custom models that don't support text-generation
             device_arg = 0 if self.device == "cuda" else -1
             if self.device == "mps":
                 device_arg = -1
 
-            self.pipeline = pipeline(
-                "text-generation",
-                model=self.model_instance,
-                tokenizer=self.tokenizer,
-                device=device_arg
-            )
+            try:
+                # Don't pass device argument if using device_map (accelerate)
+                if 'device_map' in self.transformers_kwargs:
+                    self.pipeline = pipeline(
+                        "text-generation",
+                        model=self.model_instance,
+                        tokenizer=self.tokenizer
+                    )
+                else:
+                    self.pipeline = pipeline(
+                        "text-generation",
+                        model=self.model_instance,
+                        tokenizer=self.tokenizer,
+                        device=device_arg
+                    )
+            except ValueError as e:
+                if "not supported for text-generation" in str(e) or "accelerate" in str(e):
+                    # For custom models like DeepSeek-OCR, skip pipeline creation
+                    # We'll handle generation directly through the model
+                    self.pipeline = None
+                else:
+                    raise
 
         except Exception as e:
             error_str = str(e).lower()
@@ -245,6 +354,96 @@ class HuggingFaceProvider(BaseProvider):
             else:
                 raise RuntimeError(f"Failed to load HuggingFace model {self.model}: {str(e)}")
 
+    def _load_vision_model(self):
+        """Load vision model using AutoModelForImageTextToText and AutoProcessor"""
+        try:
+            # Suppress progress bars during model loading unless in debug mode
+            import os
+            from transformers.utils import logging as transformers_logging
+
+            if not self.debug:
+                # Disable transformers progress bars
+                os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+                transformers_logging.set_verbosity_error()
+                # Disable tqdm progress bars
+                os.environ['DISABLE_TQDM'] = '1'
+
+            # Load processor for vision models (handles both text and images)
+            processor_kwargs = {k: v for k, v in self.transformers_kwargs.items()
+                                if k in ['trust_remote_code']}
+            # Enable trust_remote_code for custom architectures like GLM4V
+            processor_kwargs['trust_remote_code'] = True
+            # Set use_fast=True to avoid the slow processor warning
+            processor_kwargs['use_fast'] = True
+            # Respect offline-first configuration
+            if _config.should_force_local_files_only():
+                processor_kwargs['local_files_only'] = True
+
+            # Use local cache path if offline mode is enabled and model is cached
+            model_path = self.model
+            if _config.should_force_local_files_only():
+                local_path = _get_local_model_path(self.model)
+                if local_path:
+                    model_path = local_path
+                    processor_kwargs.pop('local_files_only', None) # Remove since we're using local path
+                    self.logger.debug(f"Loading processor from local cache: {local_path}")
+
+            self.processor = AutoProcessor.from_pretrained(model_path, **processor_kwargs)
+
+            # Load vision model using AutoModelForImageTextToText with trust_remote_code
+            vision_kwargs = self.transformers_kwargs.copy()
+            vision_kwargs['trust_remote_code'] = True
+            # Respect offline-first configuration
+            if _config.should_force_local_files_only():
+                vision_kwargs['local_files_only'] = True
+
+            # Use local cache path if offline mode is enabled and model is cached
+            model_path = self.model
+            if _config.should_force_local_files_only():
+                local_path = _get_local_model_path(self.model)
+                if local_path:
+                    model_path = local_path
+                    vision_kwargs.pop('local_files_only', None) # Remove since we're using local path
+                    self.logger.debug(f"Loading model from local cache: {local_path}")
+
+            self.model_instance = AutoModelForImageTextToText.from_pretrained(model_path, **vision_kwargs)
+
+            # Restore logging levels if they were suppressed
+            if not self.debug:
+                # Restore transformers logging
+                transformers_logging.set_verbosity_warning()
+                # Remove tqdm suppression
+                if 'DISABLE_TQDM' in os.environ:
+                    del os.environ['DISABLE_TQDM']
+
+            # Move to device (only if not using device_map)
+            if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
+                self.model_instance = self.model_instance.to(self.device)
+
+            # For vision models, we don't use the standard pipeline
+            self.pipeline = None
+
+            self.logger.info(f"Successfully loaded vision model {self.model} using AutoModelForImageTextToText")
+
+        except Exception as e:
+            error_str = str(e).lower()
+
+            # Check for transformers version issues
+            if 'glm4v' in error_str and 'does not recognize this architecture' in error_str:
+                import transformers
+                current_version = transformers.__version__
+                raise RuntimeError(
+                    f"GLM4V architecture requires transformers>=4.57.1, but you have {current_version}. "
+                    f"Please upgrade: pip install transformers>=4.57.1"
+                )
+            elif ('not found' in error_str or 'does not exist' in error_str or
+                  'not a valid model identifier' in error_str):
+                available_models = self.list_available_models()
+                error_message = format_model_error("HuggingFace", self.model, available_models)
+                raise ModelNotFoundError(error_message)
+            else:
+                raise RuntimeError(f"Failed to load HuggingFace vision model {self.model}: {str(e)}")
+
     def _find_gguf_in_cache(self, model_name: str) -> Optional[str]:
         """Find GGUF model in HuggingFace cache (cache-only, no downloading)"""
 
@@ -513,11 +712,18 @@ class HuggingFaceProvider(BaseProvider):
         """Generate using transformers backend with optional Outlines native structured output"""
 
         if not self.pipeline:
-            return GenerateResponse(
-                content="Error: Transformers model not loaded",
-                model=self.model,
-                finish_reason="error"
-            )
+            # Handle vision models that use processor instead of pipeline
+            if self.processor and hasattr(self.model_instance, 'generate'):
+                return self._generate_vision_model(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
+            # Handle custom models like DeepSeek-OCR that don't support standard pipelines
+            elif hasattr(self.model_instance, 'infer'):
+                return self._generate_custom_model(prompt, messages, system_prompt, tools, media, stream, response_model, **kwargs)
+            else:
+                return GenerateResponse(
+                    content="Error: Transformers model not loaded or doesn't support generation",
+                    model=self.model,
+                    finish_reason="error"
+                )
 
         # Native structured output via Outlines (if configured and available)
         should_use_outlines = (
@@ -638,6 +844,311 @@ class HuggingFaceProvider(BaseProvider):
                 finish_reason="error"
             )
 
+    def _generate_custom_model(self,
+                               prompt: str,
+                               messages: Optional[List[Dict[str, str]]] = None,
+                               system_prompt: Optional[str] = None,
+                               tools: Optional[List[Dict[str, Any]]] = None,
+                               media: Optional[List['MediaContent']] = None,
+                               stream: bool = False,
+                               response_model: Optional[Type[BaseModel]] = None,
+                               **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+        """Generate using custom model methods (e.g., DeepSeek-OCR's infer method)"""
+
+        import time
+        import tempfile
+        import os
+        start_time = time.time()
+
+        try:
+            # Handle media content for vision models like DeepSeek-OCR
+            if media and len(media) > 0:
+                # Use the first image for OCR
+                media_item = media[0]
+
+                # DeepSeek-OCR expects image file path
+                if hasattr(media_item, 'file_path') and media_item.file_path:
+                    image_file = str(media_item.file_path)
+                else:
+                    # If no file path, save media content to temp file
+                    from PIL import Image
+
+                    if hasattr(media_item, 'content') and media_item.content:
+                        # Handle base64 content
+                        if media_item.content_format == 'BASE64':
+                            import base64
+                            image_data = base64.b64decode(media_item.content)
+                            temp_file = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
+                            temp_file.write(image_data)
+                            temp_file.close()
+                            image_file = temp_file.name
+                        else:
+                            return GenerateResponse(
+                                content="Error: Unsupported media format for DeepSeek-OCR",
+                                model=self.model,
+                                finish_reason="error"
+                            )
+                    else:
+                        return GenerateResponse(
+                            content="Error: No valid image content found",
+                            model=self.model,
+                            finish_reason="error"
+                        )
+
+                # Use DeepSeek-OCR's infer method
+                try:
+                    # Create temporary output directory for DeepSeek-OCR
+                    temp_output_dir = tempfile.mkdtemp()
+
+                    # Patch DeepSeek-OCR for MPS/CPU compatibility if needed
+                    if self.device == "mps" or (self.device is None and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()):
+                        self._patch_deepseek_for_mps()
+
+                    result = self.model_instance.infer(
+                        self.tokenizer,
+                        prompt=prompt,
+                        image_file=image_file,
+                        output_path=temp_output_dir, # DeepSeek-OCR requires output path
+                        base_size=1024,
+                        image_size=640,
+                        crop_mode=True,
+                        save_results=False,
+                        test_compress=False
+                    )
+
+                    # Clean up temp output directory
+                    import shutil
+                    shutil.rmtree(temp_output_dir, ignore_errors=True)
+
+                    # Clean up temp file if created
+                    if 'temp_file' in locals() and os.path.exists(image_file):
+                        os.unlink(image_file)
+
+                    # Calculate generation time
+                    gen_time = (time.time() - start_time) * 1000
+
+                    return GenerateResponse(
+                        content=result if isinstance(result, str) else str(result),
+                        model=self.model,
+                        finish_reason="stop",
+                        input_tokens=len(prompt.split()), # Rough estimate
+                        output_tokens=len(str(result).split()) if result else 0,
+                        gen_time=gen_time
+                    )
+
+                except Exception as e:
+                    return GenerateResponse(
+                        content=f"Error during DeepSeek-OCR inference: {str(e)}",
+                        model=self.model,
+                        finish_reason="error"
+                    )
+            else:
+                return GenerateResponse(
+                    content="Error: DeepSeek-OCR requires image input",
+                    model=self.model,
+                    finish_reason="error"
+                )
+
+        except Exception as e:
+            return GenerateResponse(
+                content=f"Error in custom model generation: {str(e)}",
+                model=self.model,
+                finish_reason="error"
+            )
+
+    def _generate_vision_model(self,
+                               prompt: str,
+                               messages: Optional[List[Dict[str, str]]] = None,
+                               system_prompt: Optional[str] = None,
+                               tools: Optional[List[Dict[str, Any]]] = None,
+                               media: Optional[List['MediaContent']] = None,
+                               stream: bool = False,
+                               response_model: Optional[Type[BaseModel]] = None,
+                               **kwargs) -> Union[GenerateResponse, Iterator[GenerateResponse]]:
+        """Generate using vision model (Glyph, GLM-4.1V, etc.)"""
+
+        import time
+        start_time = time.time()
+
+        # Import torch safely
+        try:
+            import torch
+        except ImportError:
+            return GenerateResponse(
+                content="Error: PyTorch not available for vision model generation",
+                model=self.model,
+                finish_reason="error",
+                gen_time=0.0
+            )
+
+        try:
+            # Build messages for vision model
+            chat_messages = []
+
+            if system_prompt:
+                chat_messages.append({"role": "system", "content": system_prompt})
+
+            if messages:
+                chat_messages.extend(messages)
+
+            # Build user message with media content
+            user_content = []
+
+            # Add text content
+            if prompt:
+                user_content.append({"type": "text", "text": prompt})
+
+            # Add media content (images)
+            if media:
+                for media_item in media:
+                    if hasattr(media_item, 'file_path') and media_item.file_path:
+                        # Use file path directly
+                        user_content.append({
+                            "type": "image",
+                            "url": str(media_item.file_path)
+                        })
+                    elif hasattr(media_item, 'content') and media_item.content:
+                        # Handle base64 content
+                        if media_item.content_format == 'BASE64':
+                            # Create data URL for base64 content
+                            mime_type = getattr(media_item, 'mime_type', 'image/png')
+                            data_url = f"data:{mime_type};base64,{media_item.content}"
+                            user_content.append({
+                                "type": "image",
+                                "url": data_url
+                            })
+
+            # Add user message
+            chat_messages.append({
+                "role": "user",
+                "content": user_content
+            })
+
+            # Process messages using the processor
+            inputs = self.processor.apply_chat_template(
+                chat_messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                return_dict=True,
+                return_tensors="pt"
+            ).to(self.model_instance.device)
+
+            # Generation parameters
+            generation_kwargs = {
+                "max_new_tokens": kwargs.get("max_tokens", self.max_output_tokens or 512),
+                "temperature": kwargs.get("temperature", self.temperature),
+                "do_sample": True,
+                "pad_token_id": self.processor.tokenizer.eos_token_id,
+            }
+
+            # Add seed if provided
+            seed_value = kwargs.get("seed", self.seed)
+            if seed_value is not None:
+                torch.manual_seed(seed_value)
+                if torch.cuda.is_available():
+                    torch.cuda.manual_seed_all(seed_value)
+
+            # Generate response
+            # For Apple Silicon, move inputs to CPU if MPS causes issues
+            if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                try:
+                    generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+                except RuntimeError as e:
+                    if "MPS: Unsupported Border padding mode" in str(e):
+                        self.logger.warning("MPS Border padding mode error detected, falling back to CPU")
+                        # Move model and inputs to CPU
+                        cpu_model = self.model_instance.to('cpu')
+                        cpu_inputs = {k: v.to('cpu') if hasattr(v, 'to') else v for k, v in inputs.items()}
+                        generated_ids = cpu_model.generate(**cpu_inputs, **generation_kwargs)
+                        # Move model back to original device
+                        self.model_instance.to(self.model_instance.device)
+                    else:
+                        raise e
+            else:
+                generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+
+            # Decode response
+            output_text = self.processor.decode(
+                generated_ids[0][inputs["input_ids"].shape[1]:],
+                skip_special_tokens=True
+            )
+
+            # Calculate generation time
+            gen_time = (time.time() - start_time) * 1000
+
+            # Calculate token usage
+            input_tokens = inputs["input_ids"].shape[1]
+            output_tokens = len(generated_ids[0]) - input_tokens
+
+            return GenerateResponse(
+                content=output_text.strip(),
+                model=self.model,
+                finish_reason="stop",
+                usage={
+                    "input_tokens": input_tokens,
+                    "output_tokens": output_tokens,
+                    "total_tokens": input_tokens + output_tokens,
+                    "prompt_tokens": input_tokens,
+                    "completion_tokens": output_tokens
+                },
+                gen_time=gen_time
+            )
+
+        except Exception as e:
+            gen_time = (time.time() - start_time) * 1000 if 'start_time' in locals() else 0.0
+            return GenerateResponse(
+                content=f"Error in vision model generation: {str(e)}",
+                model=self.model,
+                finish_reason="error",
+                gen_time=gen_time
+            )
+
+    def _patch_deepseek_for_mps(self):
+        """Patch DeepSeek-OCR model to work with MPS instead of CUDA"""
+        import types
+
+        def patched_infer(self, tokenizer, prompt='', image_file='', output_path='', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
+            """Patched infer method that uses MPS instead of CUDA"""
+            import torch
+
+            # Determine the best available device
+            if torch.backends.mps.is_available():
+                device = torch.device('mps')
+            elif torch.cuda.is_available():
+                device = torch.device('cuda')
+            else:
+                device = torch.device('cpu')
+
+            # Call the original infer method but patch tensor.cuda() calls
+            original_cuda = torch.Tensor.cuda
+
+            def patched_cuda(tensor, device=None, non_blocking=False, **kwargs):
+                """Redirect .cuda() calls to the appropriate device"""
+                if device == 'mps' or (device is None and torch.backends.mps.is_available()):
+                    return tensor.to('mps', non_blocking=non_blocking)
+                elif torch.cuda.is_available():
+                    return original_cuda(tensor, device, non_blocking, **kwargs)
+                else:
+                    return tensor.to('cpu', non_blocking=non_blocking)
+
+            # Temporarily patch the cuda method
+            torch.Tensor.cuda = patched_cuda
+
+            try:
+                # Move model to the appropriate device first
+                self.to(device)
+
+                # Call original infer with device patching
+                return self._original_infer(tokenizer, prompt, image_file, output_path, base_size, image_size, crop_mode, test_compress, save_results, eval_mode)
+            finally:
+                # Restore original cuda method
+                torch.Tensor.cuda = original_cuda
+
+        # Only patch if not already patched
+        if not hasattr(self.model_instance, '_original_infer'):
+            self.model_instance._original_infer = self.model_instance.infer
+            self.model_instance.infer = types.MethodType(patched_infer, self.model_instance)
+
     def _generate_gguf(self,
                        prompt: str,
                        messages: Optional[List[Dict[str, str]]] = None,
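Note: for readers who want to see the vision path outside the provider, the core of _generate_vision_model reduces to plain transformers calls. A hedged sketch that mirrors the code above (the model id and image path are placeholders; running it downloads weights unless the model is already cached):

    from transformers import AutoProcessor, AutoModelForImageTextToText

    model_id = "zai-org/Glyph"  # example; any AutoModelForImageTextToText-compatible repo
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
    model = AutoModelForImageTextToText.from_pretrained(model_id, trust_remote_code=True, device_map="auto")

    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this page."},
            {"type": "image", "url": "page.png"},  # file path or data: URL, as in the diff
        ],
    }]
    inputs = processor.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True,
        return_dict=True, return_tensors="pt",
    ).to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=512)
    print(processor.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))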
@@ -949,10 +1460,13 @@ class HuggingFaceProvider(BaseProvider):
         try:
             # Set seed for deterministic generation if provided
             if seed is not None:
-                import torch
-                torch.manual_seed(seed)
-                if torch.cuda.is_available():
-                    torch.cuda.manual_seed_all(seed)
+                try:
+                    import torch
+                    torch.manual_seed(seed)
+                    if torch.cuda.is_available():
+                        torch.cuda.manual_seed_all(seed)
+                except ImportError:
+                    pass # Skip seeding if torch not available
 
             # Track generation time
             start_time = time.time()
@@ -1238,8 +1752,20 @@ class HuggingFaceProvider(BaseProvider):
 
     @classmethod
    def list_available_models(cls, **kwargs) -> List[str]:
-        """List available HuggingFace models from local cache (excluding MLX models)."""
+        """
+        List available HuggingFace models from local cache (excluding MLX models).
+
+        Args:
+            **kwargs: Optional parameters including:
+                - input_capabilities: List of ModelInputCapability enums to filter by input capability
+                - output_capabilities: List of ModelOutputCapability enums to filter by output capability
+
+        Returns:
+            List of model names, optionally filtered by capabilities
+        """
        try:
+            from .model_capabilities import filter_models_by_capabilities
+
            hf_cache = Path.home() / ".cache" / "huggingface" / "hub"
            if not hf_cache.exists():
                return []
@@ -1255,7 +1781,21 @@ class HuggingFaceProvider(BaseProvider):
                 if "mlx" not in model_name.lower():
                     models.append(model_name)
 
-            return sorted(models)
+            models = sorted(models)
+
+            # Apply new capability filtering if provided
+            input_capabilities = kwargs.get('input_capabilities')
+            output_capabilities = kwargs.get('output_capabilities')
+
+            if input_capabilities or output_capabilities:
+                models = filter_models_by_capabilities(
+                    models,
+                    input_capabilities=input_capabilities,
+                    output_capabilities=output_capabilities
+                )
+
+
+            return models
 
         except Exception:
             return []
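Note: a hedged sketch of how the new capability filtering might be invoked. The keyword names come from the docstring above; the enum import path and member name are assumptions based on the model_capabilities module added in this release:

    from abstractcore.providers.huggingface_provider import HuggingFaceProvider
    # Assumed location/members of the capability enums (not confirmed by this diff):
    from abstractcore.providers.model_capabilities import ModelInputCapability

    # All cached HuggingFace models (MLX repos excluded), unfiltered:
    all_models = HuggingFaceProvider.list_available_models()

    # Only models whose declared capabilities include image input:
    vision_models = HuggingFaceProvider.list_available_models(
        input_capabilities=[ModelInputCapability.IMAGE],  # member name is an assumption
    )
    print(f"{len(vision_models)}/{len(all_models)} cached models accept image input")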