abstractcore 2.9.1__py3-none-any.whl → 2.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/extractor.py +33 -100
  3. abstractcore/apps/intent.py +19 -0
  4. abstractcore/apps/judge.py +20 -1
  5. abstractcore/apps/summarizer.py +20 -1
  6. abstractcore/architectures/detection.py +34 -1
  7. abstractcore/architectures/response_postprocessing.py +313 -0
  8. abstractcore/assets/architecture_formats.json +38 -8
  9. abstractcore/assets/model_capabilities.json +781 -160
  10. abstractcore/compression/__init__.py +1 -2
  11. abstractcore/compression/glyph_processor.py +6 -4
  12. abstractcore/config/main.py +31 -19
  13. abstractcore/config/manager.py +389 -11
  14. abstractcore/config/vision_config.py +5 -5
  15. abstractcore/core/interface.py +151 -3
  16. abstractcore/core/session.py +16 -10
  17. abstractcore/download.py +1 -1
  18. abstractcore/embeddings/manager.py +20 -6
  19. abstractcore/endpoint/__init__.py +2 -0
  20. abstractcore/endpoint/app.py +458 -0
  21. abstractcore/mcp/client.py +3 -1
  22. abstractcore/media/__init__.py +52 -17
  23. abstractcore/media/auto_handler.py +42 -22
  24. abstractcore/media/base.py +44 -1
  25. abstractcore/media/capabilities.py +12 -33
  26. abstractcore/media/enrichment.py +105 -0
  27. abstractcore/media/handlers/anthropic_handler.py +19 -28
  28. abstractcore/media/handlers/local_handler.py +124 -70
  29. abstractcore/media/handlers/openai_handler.py +19 -31
  30. abstractcore/media/processors/__init__.py +4 -2
  31. abstractcore/media/processors/audio_processor.py +57 -0
  32. abstractcore/media/processors/office_processor.py +8 -3
  33. abstractcore/media/processors/pdf_processor.py +46 -3
  34. abstractcore/media/processors/text_processor.py +22 -24
  35. abstractcore/media/processors/video_processor.py +58 -0
  36. abstractcore/media/types.py +97 -4
  37. abstractcore/media/utils/image_scaler.py +20 -2
  38. abstractcore/media/utils/video_frames.py +219 -0
  39. abstractcore/media/vision_fallback.py +136 -22
  40. abstractcore/processing/__init__.py +32 -3
  41. abstractcore/processing/basic_deepsearch.py +15 -10
  42. abstractcore/processing/basic_intent.py +3 -2
  43. abstractcore/processing/basic_judge.py +3 -2
  44. abstractcore/processing/basic_summarizer.py +1 -1
  45. abstractcore/providers/__init__.py +3 -1
  46. abstractcore/providers/anthropic_provider.py +95 -8
  47. abstractcore/providers/base.py +1516 -81
  48. abstractcore/providers/huggingface_provider.py +546 -69
  49. abstractcore/providers/lmstudio_provider.py +35 -923
  50. abstractcore/providers/mlx_provider.py +382 -35
  51. abstractcore/providers/model_capabilities.py +5 -1
  52. abstractcore/providers/ollama_provider.py +99 -15
  53. abstractcore/providers/openai_compatible_provider.py +406 -180
  54. abstractcore/providers/openai_provider.py +188 -44
  55. abstractcore/providers/openrouter_provider.py +76 -0
  56. abstractcore/providers/registry.py +61 -5
  57. abstractcore/providers/streaming.py +138 -33
  58. abstractcore/providers/vllm_provider.py +92 -817
  59. abstractcore/server/app.py +461 -13
  60. abstractcore/server/audio_endpoints.py +139 -0
  61. abstractcore/server/vision_endpoints.py +1319 -0
  62. abstractcore/structured/handler.py +316 -41
  63. abstractcore/tools/common_tools.py +5501 -2012
  64. abstractcore/tools/comms_tools.py +1641 -0
  65. abstractcore/tools/core.py +37 -7
  66. abstractcore/tools/handler.py +4 -9
  67. abstractcore/tools/parser.py +49 -2
  68. abstractcore/tools/tag_rewriter.py +2 -1
  69. abstractcore/tools/telegram_tdlib.py +407 -0
  70. abstractcore/tools/telegram_tools.py +261 -0
  71. abstractcore/utils/cli.py +1085 -72
  72. abstractcore/utils/token_utils.py +2 -0
  73. abstractcore/utils/truncation.py +29 -0
  74. abstractcore/utils/version.py +3 -4
  75. abstractcore/utils/vlm_token_calculator.py +12 -2
  76. abstractcore-2.11.2.dist-info/METADATA +562 -0
  77. abstractcore-2.11.2.dist-info/RECORD +133 -0
  78. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
  79. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
  80. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  81. abstractcore-2.9.1.dist-info/RECORD +0 -119
  82. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
  83. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/providers/huggingface_provider.py
@@ -5,6 +5,7 @@ Supports both transformers models and GGUF models via llama-cpp-python.
 
 import os
 import json
+import threading
 from pathlib import Path
 from typing import List, Dict, Any, Optional, Union, Iterator, Type
 
@@ -39,6 +40,9 @@ from ..exceptions import ModelNotFoundError, format_model_error
 from ..tools import UniversalToolHandler, execute_tools
 from ..events import EventType
 
+
+_MPS_GENERATION_LOCK = threading.Lock()
+
 # Try to import transformers (standard HuggingFace support)
 try:
     from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, pipeline
@@ -84,6 +88,22 @@ def _get_local_model_path(model_name: str) -> Optional[str]:
 class HuggingFaceProvider(BaseProvider):
     """HuggingFace provider with dual support for transformers and GGUF models"""
 
+    @staticmethod
+    def _resolve_requested_device(device: Optional[str]) -> Optional[str]:
+        """Resolve the requested device from explicit arg or env override.
+
+        Supported env var: ABSTRACTCORE_HF_DEVICE=cpu|mps|cuda|auto
+        """
+        if isinstance(device, str) and device.strip():
+            return device.strip().lower()
+
+        env_device = os.environ.get("ABSTRACTCORE_HF_DEVICE")
+        if isinstance(env_device, str) and env_device.strip():
+            val = env_device.strip().lower()
+            if val in {"auto", "cpu", "mps", "cuda"}:
+                return val
+        return None
+
     def __init__(self, model: str = "unsloth/Qwen3-4B-Instruct-2507-GGUF",
                  device: Optional[str] = None,
                  n_gpu_layers: Optional[int] = None,
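The device override added here can be driven either by the constructor's device argument or by the ABSTRACTCORE_HF_DEVICE environment variable, with the explicit argument taking precedence. A minimal sketch, assuming the public import path matches this module (the values are only examples):

    import os

    from abstractcore.providers.huggingface_provider import HuggingFaceProvider

    # Force CPU for every HuggingFaceProvider in this process, e.g. when running
    # sandboxed where Metal/MPS devices are not visible.
    os.environ["ABSTRACTCORE_HF_DEVICE"] = "cpu"

    print(HuggingFaceProvider._resolve_requested_device(None))   # -> "cpu"  (taken from the env var)
    print(HuggingFaceProvider._resolve_requested_device("MPS"))  # -> "mps"  (explicit argument wins)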
@@ -122,7 +142,7 @@ class HuggingFaceProvider(BaseProvider):
         # Store provider-specific configuration
         self.n_gpu_layers = n_gpu_layers
         self.model_type = None  # Will be "transformers" or "gguf"
-        self.device = device
+        self.device = self._resolve_requested_device(device)
 
         # Store transformers-specific parameters
         self.transformers_kwargs = {
@@ -154,7 +174,7 @@ class HuggingFaceProvider(BaseProvider):
             self._setup_device_transformers()
             self._load_transformers_model()
 
-    def unload(self) -> None:
+    def unload_model(self, model_name: str) -> None:
         """
         Unload the model from memory.
 
@@ -187,14 +207,61 @@ class HuggingFaceProvider(BaseProvider):
             if hasattr(self, 'logger'):
                 self.logger.warning(f"Error during unload: {e}")
 
-    def __del__(self):
-        """Properly clean up resources to minimize garbage collection issues"""
+    def supports_prompt_cache(self) -> bool:
+        """GGUF backends can use llama.cpp prompt caching (prefix state cache)."""
+        return getattr(self, "model_type", None) == "gguf"
+
+    def prompt_cache_set(
+        self,
+        key: str,
+        *,
+        make_default: bool = True,
+        ttl_s: Optional[float] = None,
+        capacity_bytes: Optional[int] = None,
+        **kwargs,
+    ) -> bool:
+        """Create/reset a llama.cpp prompt cache for the given key (GGUF only)."""
+        _ = kwargs
+        normalized = self._normalize_prompt_cache_key(key)
+        if normalized is None:
+            return False
+        if not self.supports_prompt_cache():
+            return False
+        if not super().prompt_cache_set(normalized, make_default=make_default):
+            return False
+
         try:
-            self.unload()
+            from llama_cpp.llama_cache import LlamaRAMCache
+        except Exception:
+            return False
+
+        cap = int(capacity_bytes) if isinstance(capacity_bytes, int) and capacity_bytes > 0 else (512 << 20)
+        cache_obj = LlamaRAMCache(capacity_bytes=cap)
+
+        try:
+            self._prompt_cache_store.set(normalized, cache_obj, ttl_s=ttl_s, meta={"backend": "llama_cpp"})
+        except Exception:
+            return False
+
+        # Best-effort: activate this cache on the shared llama instance.
+        try:
+            if getattr(self, "llm", None) is not None and hasattr(self.llm, "set_cache"):
+                self.llm.set_cache(cache_obj)
         except Exception:
-            # Silently handle any cleanup errors - this is expected during shutdown
             pass
 
+        return True
+
+    def prompt_cache_clear(self, key: Optional[str] = None) -> bool:
+        """Clear llama.cpp prompt caches (GGUF only; best-effort)."""
+        cleared = super().prompt_cache_clear(key)
+        try:
+            if getattr(self, "llm", None) is not None and hasattr(self.llm, "set_cache"):
+                self.llm.set_cache(None)
+        except Exception:
+            pass
+        return cleared
+
     def _is_gguf_model(self, model: str) -> bool:
         """Detect if the model is a GGUF model"""
         # Check if it's a .gguf file path
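A hedged sketch of how these prompt-cache hooks could be exercised for a GGUF model. The model name is illustrative, and it assumes the provider's standard generate() entry point forwards extra keyword arguments, since the GGUF path later in this diff reads prompt_cache_key from kwargs:

    from abstractcore.providers.huggingface_provider import HuggingFaceProvider

    provider = HuggingFaceProvider(model="unsloth/Qwen3-4B-Instruct-2507-GGUF")

    if provider.supports_prompt_cache():
        # Pre-create a named llama.cpp RAM cache (default capacity is 512 MiB).
        provider.prompt_cache_set("support-session", capacity_bytes=256 << 20)

    # Requests that pass the same key reuse the cached prefix state.
    response = provider.generate("Summarize the ticket above.", prompt_cache_key="support-session")

    # Drop the cache when the session ends.
    provider.prompt_cache_clear("support-session")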
@@ -238,19 +305,75 @@ class HuggingFaceProvider(BaseProvider):
         return any(vision_keyword in model_lower for vision_keyword in vision_models)
 
     def _setup_device_transformers(self):
-        """Setup device for transformers models"""
+        """Setup device for transformers models (best-effort).
+
+        We validate explicit device requests even when Transformers isn't available,
+        since Torch availability (MPS/CUDA) may still matter for downstream behavior.
+        """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            self.device = "cpu"
+            return
+
+        requested = str(self.device or "").strip().lower() if isinstance(self.device, str) else ""
+        if requested and requested != "auto":
+            # Respect explicit user/env request, but fall back safely if unavailable.
+            if requested == "mps":
+                if hasattr(torch.backends, "mps") and torch.backends.mps.is_built() and not torch.backends.mps.is_available():
+                    self.logger.warning(
+                        "HuggingFaceProvider requested device=mps but MPS is not available. "
+                        "This usually means the process cannot see Metal devices (sandboxed execution). "
+                        "Falling back to CPU. To silence this, set ABSTRACTCORE_HF_DEVICE=cpu."
+                    )
+                    self.device = "cpu"
+                else:
+                    self.device = "mps"
+            elif requested == "cuda":
+                if torch.cuda.is_available():
+                    self.device = "cuda"
+                else:
+                    self.logger.warning(
+                        "HuggingFaceProvider requested device=cuda but CUDA is not available; falling back to CPU."
+                    )
+                    self.device = "cpu"
+            else:
+                self.device = "cpu"
+            return
+
         if not TRANSFORMERS_AVAILABLE:
+            # Without transformers, default to CPU for safety.
+            self.device = "cpu"
             return
 
-        if self.device:
-            self.device = self.device
-        elif torch.backends.mps.is_available():
+        # Auto device selection.
+        if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
             self.device = "mps"
         elif torch.cuda.is_available():
             self.device = "cuda"
         else:
             self.device = "cpu"
 
+        # Apple Silicon: MPS built but unavailable is usually a sandbox / Metal visibility issue.
+        try:
+            import platform
+
+            if (
+                self.device == "cpu"
+                and platform.system() == "Darwin"
+                and platform.machine() == "arm64"
+                and hasattr(torch.backends, "mps")
+                and torch.backends.mps.is_built()
+                and not torch.backends.mps.is_available()
+            ):
+                self.logger.warning(
+                    "PyTorch was built with MPS support, but MPS is not available. "
+                    "This often indicates the process cannot access Metal devices (sandboxed execution). "
+                    "Run outside the sandbox or force CPU via ABSTRACTCORE_HF_DEVICE=cpu."
+                )
+        except Exception:
+            pass
+
     def _setup_device_gguf(self):
         """Setup device for GGUF models"""
         # Auto-detect GPU layers if not specified
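The sandbox diagnosis above relies on the difference between torch.backends.mps.is_built() and torch.backends.mps.is_available(); a small standalone probe using only public PyTorch APIs shows the condition the warning targets:

    import platform

    import torch

    built = hasattr(torch.backends, "mps") and torch.backends.mps.is_built()
    available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()

    if platform.system() == "Darwin" and platform.machine() == "arm64" and built and not available:
        # Same situation the provider logs: MPS support is compiled into the wheel,
        # but this process cannot see a Metal device (typical of sandboxed execution).
        print("MPS built but unavailable; set ABSTRACTCORE_HF_DEVICE=cpu to force CPU.")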
@@ -396,6 +519,15 @@ class HuggingFaceProvider(BaseProvider):
         # Respect offline-first configuration
         if _config.should_force_local_files_only():
             vision_kwargs['local_files_only'] = True
+
+        # Safer defaults on GPU backends: float16 unless caller provided torch_dtype.
+        try:
+            if self.device in {"mps", "cuda"} and "torch_dtype" not in vision_kwargs:
+                import torch as _torch
+
+                vision_kwargs["torch_dtype"] = _torch.float16
+        except Exception:
+            pass
 
         # Use local cache path if offline mode is enabled and model is cached
         model_path = self.model
@@ -419,6 +551,11 @@ class HuggingFaceProvider(BaseProvider):
         # Move to device (only if not using device_map)
         if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
             self.model_instance = self.model_instance.to(self.device)
+
+        try:
+            self.model_instance.eval()
+        except Exception:
+            pass
 
         # For vision models, we don't use the standard pipeline
         self.pipeline = None
@@ -737,7 +874,7 @@ class HuggingFaceProvider(BaseProvider):
         # Check if Outlines is required but unavailable
         if self.structured_output_method == "native_outlines" and not OUTLINES_AVAILABLE:
             return GenerateResponse(
-                content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install abstractcore[huggingface]",
+                content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install \"abstractcore[huggingface]\"",
                 model=self.model,
                 finish_reason="error"
             )
@@ -787,6 +924,7 @@ class HuggingFaceProvider(BaseProvider):
 
         # Build input text with tool and media support
         # Handle media content first if present
+        media_enrichment = None
         if media:
             try:
                 from ..media.handlers import LocalMediaHandler
@@ -794,6 +932,7 @@ class HuggingFaceProvider(BaseProvider):
 
                 # Create multimodal message combining text and media
                 multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                media_enrichment = getattr(media_handler, "media_enrichment", None)
 
                 # For local providers, we get text-embedded content
                 if isinstance(multimodal_message, str):
@@ -812,7 +951,7 @@ class HuggingFaceProvider(BaseProvider):
                 else:
                     prompt = str(multimodal_message["content"])
             except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                self.logger.warning("Media processing not available. Install with: pip install \"abstractcore[media]\"")
             except Exception as e:
                 self.logger.warning(f"Failed to process media content: {e}")
 
@@ -821,15 +960,19 @@ class HuggingFaceProvider(BaseProvider):
         # Generation parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_new_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-        temperature = kwargs.get("temperature", self.temperature)
+        temperature = generation_kwargs.get("temperature", self.temperature)
         top_p = kwargs.get("top_p", 0.9)
-        seed_value = kwargs.get("seed", self.seed)
+        seed_value = generation_kwargs.get("seed")
 
         try:
             if stream:
                 return self._stream_generate_transformers_with_tools(input_text, max_new_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'), seed_value)
             else:
                 response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p, seed_value)
+                if media_enrichment:
+                    from ..media.enrichment import merge_enrichment_metadata
+
+                    response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
 
                 # Handle tool execution for prompted models
                 if tools and self.tool_handler.supports_prompted and response.content:
@@ -982,41 +1125,101 @@ class HuggingFaceProvider(BaseProvider):
             )
 
         try:
+            # Server/gateway sometimes call providers with prompt="" + messages=[...] + media=[...].
+            # For multimodal models, the user text and the media must live in the SAME user turn.
+            # Best-effort: if prompt is empty, lift the last user message text into the prompt and
+            # remove that message from the history to avoid duplication.
+            prompt_text = prompt
+            messages_for_context = list(messages) if isinstance(messages, list) else None
+            if (not isinstance(prompt_text, str) or not prompt_text.strip()) and media and messages_for_context:
+                for i in range(len(messages_for_context) - 1, -1, -1):
+                    msg = messages_for_context[i] or {}
+                    role = str(msg.get("role", "") or "").strip().lower()
+                    if role != "user":
+                        continue
+                    content = msg.get("content", "")
+                    lifted = None
+                    if isinstance(content, str) and content.strip():
+                        lifted = content.strip()
+                    elif isinstance(content, list):
+                        # OpenAI-style list content: [{"type":"text","text":"..."}, ...]
+                        for item in content:
+                            if not isinstance(item, dict):
+                                continue
+                            if str(item.get("type", "") or "").strip().lower() == "text":
+                                text_val = item.get("text")
+                                if isinstance(text_val, str) and text_val.strip():
+                                    lifted = text_val.strip()
+                                break
+                    if lifted:
+                        prompt_text = lifted
+                        del messages_for_context[i]
+                        break
+
             # Build messages for vision model
             chat_messages = []
 
             if system_prompt:
                 chat_messages.append({"role": "system", "content": system_prompt})
 
-            if messages:
-                chat_messages.extend(messages)
+            if messages_for_context:
+                chat_messages.extend(messages_for_context)
 
             # Build user message with media content
             user_content = []
 
             # Add text content
-            if prompt:
-                user_content.append({"type": "text", "text": prompt})
+            if isinstance(prompt_text, str) and prompt_text.strip():
+                user_content.append({"type": "text", "text": prompt_text.strip()})
 
-            # Add media content (images)
+            # Add media content (images, video)
+            has_video = False
+            try:
+                from ..media.types import MediaType, ContentFormat
+            except Exception:
+                MediaType = None  # type: ignore[assignment]
+                ContentFormat = None  # type: ignore[assignment]
+
             if media:
                 for media_item in media:
-                    if hasattr(media_item, 'file_path') and media_item.file_path:
-                        # Use file path directly
-                        user_content.append({
-                            "type": "image",
-                            "url": str(media_item.file_path)
-                        })
-                    elif hasattr(media_item, 'content') and media_item.content:
-                        # Handle base64 content
-                        if media_item.content_format == 'BASE64':
-                            # Create data URL for base64 content
-                            mime_type = getattr(media_item, 'mime_type', 'image/png')
-                            data_url = f"data:{mime_type};base64,{media_item.content}"
-                            user_content.append({
-                                "type": "image",
-                                "url": data_url
-                            })
+                    media_type = getattr(media_item, "media_type", None)
+
+                    # Text markers (e.g. provenance / policy annotations) should be preserved for the model.
+                    if MediaType is not None and media_type == MediaType.TEXT:
+                        txt = getattr(media_item, "content", None)
+                        if isinstance(txt, str) and txt.strip():
+                            user_content.append({"type": "text", "text": txt.strip()})
+                        continue
+
+                    # Video inputs
+                    if MediaType is not None and media_type == MediaType.VIDEO:
+                        has_video = True
+                        # The actual video content is provided to the processor via `videos=...`;
+                        # the chat template only needs a `<video>` placeholder token.
+                        user_content.append({"type": "video"})
+                        continue
+
+                    # Image inputs
+                    if MediaType is None or media_type == MediaType.IMAGE:
+                        if getattr(media_item, "file_path", None):
+                            user_content.append({"type": "image", "url": str(media_item.file_path)})
+                            continue
+
+                        content = getattr(media_item, "content", None)
+                        if not content:
+                            continue
+
+                        content_format = getattr(media_item, "content_format", None)
+                        is_base64 = False
+                        if ContentFormat is not None and content_format == ContentFormat.BASE64:
+                            is_base64 = True
+                        elif isinstance(content_format, str) and content_format.strip().lower() == "base64":
+                            is_base64 = True
+
+                        if is_base64:
+                            mime_type = getattr(media_item, "mime_type", "image/png")
+                            data_url = f"data:{mime_type};base64,{content}"
+                            user_content.append({"type": "image", "url": data_url})
 
             # Add user message
             chat_messages.append({
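For reference, the user turn assembled above takes roughly the following shape by the time it reaches apply_chat_template; all values below are placeholders, and a video attachment contributes only the bare {"type": "video"} entry because the frames are handed to the processor separately:

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this clip?"},
                {"type": "image", "url": "data:image/png;base64,iVBORw0KGgo..."},
                {"type": "video"},  # placeholder token only; frames go through videos=...
            ],
        },
    ]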
@@ -1024,48 +1227,285 @@ class HuggingFaceProvider(BaseProvider):
                 "content": user_content
             })
 
-            # Process messages using the processor
-            inputs = self.processor.apply_chat_template(
-                chat_messages,
-                tokenize=True,
-                add_generation_prompt=True,
-                return_dict=True,
-                return_tensors="pt"
-            ).to(self.model_instance.device)
+            # Process messages using the processor.
+            #
+            # Some multimodal processors (e.g. LlavaNextVideoProcessor) return a *string*
+            # from apply_chat_template; for those we must call the processor separately
+            # with explicit images/videos tensors and keep video frame counts bounded.
+            if has_video:
+                # Resolve max frames for video sampling (keep small to avoid huge context).
+                max_frames_raw = kwargs.get("video_max_frames", None)
+                if max_frames_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        cfg_video = getattr(get_config_manager().config, "video", None)
+                        max_frames_raw = getattr(cfg_video, "max_frames_native", None) if cfg_video is not None else None
+                        if max_frames_raw is None:
+                            max_frames_raw = getattr(cfg_video, "max_frames", None) if cfg_video is not None else None
+                    except Exception:
+                        max_frames_raw = 3
+                try:
+                    max_video_frames = max(1, int(max_frames_raw))
+                except Exception:
+                    max_video_frames = 3
+
+                sampling_strategy_raw = kwargs.get("video_sampling_strategy", None)
+                if sampling_strategy_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        sampling_strategy_raw = getattr(get_config_manager().config, "video", None).sampling_strategy  # type: ignore[union-attr]
+                    except Exception:
+                        sampling_strategy_raw = "uniform"
+                sampling_strategy = str(sampling_strategy_raw or "uniform").strip().lower()
+                if sampling_strategy not in {"uniform", "keyframes"}:
+                    sampling_strategy = "uniform"
+
+                max_frame_side_raw = kwargs.get("video_max_frame_side", None)
+                if max_frame_side_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        max_frame_side_raw = getattr(get_config_manager().config, "video", None).max_frame_side  # type: ignore[union-attr]
+                    except Exception:
+                        max_frame_side_raw = 1024
+                try:
+                    max_frame_side = int(max_frame_side_raw) if max_frame_side_raw is not None else None
+                except Exception:
+                    max_frame_side = 1024
+                if isinstance(max_frame_side, int) and max_frame_side <= 0:
+                    max_frame_side = None
+
+                # Build multimodal-typed messages for chat_template renderers that expect list content.
+                # NOTE: Many HF native-video VLMs are brittle in multi-turn mode if prior turns
+                # referenced media but we only retained text history (no `<video>` placeholders).
+                # This can cause follow-ups like "and this one?" to over-weight the previous
+                # text-only answer and ignore the newly attached video.
+                #
+                # To make follow-ups robust, collapse prior USER/ASSISTANT turns into a single
+                # text block inside the current user message, and keep exactly one `<video>`
+                # placeholder (the current attachment) in the chat template input.
+                history_lines = []
+                if messages_for_context:
+                    for msg in messages_for_context:
+                        role = str(msg.get("role", "user") or "").strip().lower()
+                        if role not in {"user", "assistant"}:
+                            continue
+                        content = msg.get("content", "")
+                        text = ""
+                        if isinstance(content, str):
+                            text = content
+                        elif isinstance(content, list):
+                            # OpenAI-style list content: [{"type":"text","text":"..."}, ...]
+                            for item in content:
+                                if not isinstance(item, dict):
+                                    continue
+                                if str(item.get("type", "") or "").strip().lower() != "text":
+                                    continue
+                                v = item.get("text")
+                                if isinstance(v, str) and v.strip():
+                                    text = v
+                                    break
+                        else:
+                            text = str(content)
+
+                        text = str(text or "").strip()
+                        if not text:
+                            continue
+                        prefix = "USER" if role == "user" else "ASSISTANT"
+                        history_lines.append(f"{prefix}: {text}")
+
+                if history_lines:
+                    history_block = "Prior chat context (text-only):\n" + "\n".join(history_lines) + "\n\n"
+                    # Cap to avoid pathological prompt growth; keep the most recent tail.
+                    if len(history_block) > 8_000:
+                        history_block = "Prior chat context (text-only; truncated):\n…\n" + history_block[-7_800:]
+                    user_content = [{"type": "text", "text": history_block}] + list(user_content)
+
+                mm_messages = []
+                if system_prompt:
+                    mm_messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+                mm_messages.append({"role": "user", "content": user_content})
+
+                prompt_text = self.processor.apply_chat_template(mm_messages, add_generation_prompt=True)
+
+                # Prepare explicit video inputs for the processor.
+                #
+                # Prefer ffmpeg-sampled frames (our own extraction) over relying on torchvision/torchcodec
+                # decoding inside Transformers, which can vary by platform/codec support (notably for .mov).
+                video_paths = []
+                image_inputs = []
+                for media_item in (media or []):
+                    if MediaType is not None and getattr(media_item, "media_type", None) == MediaType.VIDEO:
+                        video_path = getattr(media_item, "file_path", None) or getattr(media_item, "content", None)
+                        if not isinstance(video_path, str) or not video_path.strip():
+                            raise ValueError("Video MediaContent must provide file_path for HuggingFace video models.")
+                        video_paths.append(video_path)
+                    elif MediaType is not None and getattr(media_item, "media_type", None) == MediaType.IMAGE:
+                        fp = getattr(media_item, "file_path", None)
+                        if isinstance(fp, str) and fp.strip():
+                            try:
+                                from PIL import Image as PILImage
+                            except ImportError as e:
+                                raise RuntimeError(f"PIL is required for HuggingFace image inputs: {e}")
+                            image_inputs.append(PILImage.open(fp).convert("RGB"))
+
+                processor_call: Dict[str, Any] = {"text": prompt_text, "return_tensors": "pt"}
+                if image_inputs:
+                    processor_call["images"] = image_inputs if len(image_inputs) > 1 else image_inputs[0]
+                if video_paths:
+                    # Try ffmpeg frame sampling first.
+                    video_frame_inputs = []
+                    temp_dirs = []
+                    try:
+                        from pathlib import Path
+                        import tempfile
+
+                        from ..media.utils.video_frames import extract_video_frames
+                        from PIL import Image as PILImage
+
+                        for vp in video_paths:
+                            out_dir = Path(tempfile.mkdtemp(prefix="abstractcore_hf_video_frames_"))
+                            temp_dirs.append(out_dir)
+                            frames, _timestamps_s = extract_video_frames(
+                                Path(vp),
+                                max_frames=max_video_frames,
+                                frame_format="jpg",
+                                sampling_strategy=sampling_strategy,
+                                max_side=max_frame_side,
+                                output_dir=out_dir,
+                            )
+                            if not frames:
+                                raise RuntimeError("No frames extracted")
+                            video_frame_inputs.append([PILImage.open(p).convert("RGB") for p in frames])
+
+                        # Single video -> pass list[PIL]; multiple videos -> list[list[PIL]]
+                        processor_call["videos"] = (
+                            video_frame_inputs[0]
+                            if len(video_frame_inputs) == 1
+                            else video_frame_inputs
+                        )
+                    except Exception:
+                        # If anything goes wrong with ffmpeg sampling, fall back to transformers decode.
+                        processor_call["videos"] = video_paths if len(video_paths) > 1 else video_paths[0]
+                        processor_call["videos_kwargs"] = {"do_sample_frames": True, "num_frames": max_video_frames}
+                    finally:
+                        # Cleanup extracted frames directories (frames are already loaded into memory as PIL).
+                        for d in temp_dirs:
+                            try:
+                                import shutil
+
+                                shutil.rmtree(d, ignore_errors=True)
+                            except Exception:
+                                pass
+
+                inputs = self.processor(**processor_call)
+                if hasattr(inputs, "to"):
+                    inputs = inputs.to(self.model_instance.device)
+            else:
+                templated = self.processor.apply_chat_template(
+                    chat_messages,
+                    tokenize=True,
+                    add_generation_prompt=True,
+                    return_dict=True,
+                    return_tensors="pt",
+                )
+                if isinstance(templated, str):
+                    # Processor returned a prompt string; fall back to explicit processor call.
+                    image_inputs = []
+                    for media_item in (media or []):
+                        if MediaType is not None and getattr(media_item, "media_type", None) == MediaType.IMAGE:
+                            fp = getattr(media_item, "file_path", None)
+                            if isinstance(fp, str) and fp.strip():
+                                try:
+                                    from PIL import Image as PILImage
+                                except ImportError as e:
+                                    raise RuntimeError(f"PIL is required for HuggingFace image inputs: {e}")
+                                image_inputs.append(PILImage.open(fp).convert("RGB"))
+
+                    processor_call: Dict[str, Any] = {"text": templated, "return_tensors": "pt"}
+                    if image_inputs:
+                        processor_call["images"] = image_inputs if len(image_inputs) > 1 else image_inputs[0]
+                    inputs = self.processor(**processor_call)
+                    if hasattr(inputs, "to"):
+                        inputs = inputs.to(self.model_instance.device)
+                else:
+                    inputs = templated.to(self.model_instance.device)
 
-            # Generation parameters
+            temperature_value = kwargs.get("temperature", self.temperature)
+            # For HF multimodal video models, default to greedy decoding unless the caller explicitly
+            # provided a temperature. This avoids premature EOS producing unusably short answers.
+            if has_video and ("temperature" in kwargs) and kwargs.get("temperature") is None:
+                temperature_value = 0.0
+            if temperature_value is None:
+                temperature_value = self.temperature
+
+            max_new_tokens_raw = kwargs.get("max_output_tokens", None)
+            if max_new_tokens_raw is None:
+                max_new_tokens_raw = kwargs.get("max_tokens", None)
+            if max_new_tokens_raw is None:
+                max_new_tokens_raw = self.max_output_tokens or 512
+            try:
+                max_new_tokens_value = max(1, int(max_new_tokens_raw))
+            except Exception:
+                max_new_tokens_value = int(self.max_output_tokens or 512)
+
+            do_sample = True
+            try:
+                if temperature_value is None or float(temperature_value) <= 0:
+                    do_sample = False
+                    temperature_value = 0.0
+            except Exception:
+                do_sample = True
+
             generation_kwargs = {
-                "max_new_tokens": kwargs.get("max_tokens", self.max_output_tokens or 512),
-                "temperature": kwargs.get("temperature", self.temperature),
-                "do_sample": True,
+                "max_new_tokens": max_new_tokens_value,
+                "temperature": temperature_value,
+                "do_sample": do_sample,
                 "pad_token_id": self.processor.tokenizer.eos_token_id,
             }
 
             # Add seed if provided
-            seed_value = kwargs.get("seed", self.seed)
+            seed_value = self._normalize_seed(kwargs.get("seed", self.seed))
             if seed_value is not None:
                 torch.manual_seed(seed_value)
                 if torch.cuda.is_available():
                     torch.cuda.manual_seed_all(seed_value)
 
             # Generate response
-            # For Apple Silicon, move inputs to CPU if MPS causes issues
-            if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
-                try:
-                    generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
-                except RuntimeError as e:
-                    if "MPS: Unsupported Border padding mode" in str(e):
-                        self.logger.warning("MPS Border padding mode error detected, falling back to CPU")
-                        # Move model and inputs to CPU
-                        cpu_model = self.model_instance.to('cpu')
-                        cpu_inputs = {k: v.to('cpu') if hasattr(v, 'to') else v for k, v in inputs.items()}
-                        generated_ids = cpu_model.generate(**cpu_inputs, **generation_kwargs)
-                        # Move model back to original device
-                        self.model_instance.to(self.model_instance.device)
+            generated_ids = None
+            try:
+                with torch.inference_mode():
+                    use_mps_lock = str(getattr(self, "device", "") or "").strip().lower() == "mps"
+                    if use_mps_lock:
+                        with _MPS_GENERATION_LOCK:
+                            generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
                     else:
-                        raise e
-            else:
-                generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+                        generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+            except RuntimeError as e:
+                if str(getattr(self, "device", "") or "").strip().lower() == "mps":
+                    raise RuntimeError(
+                        "HuggingFaceProvider vision/video generation failed on MPS. "
+                        "If this persists, force CPU via ABSTRACTCORE_HF_DEVICE=cpu."
+                    ) from e
+                raise
+            finally:
+                # Best-effort: keep MPS memory pressure low between calls.
+                try:
+                    if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
+                        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                            if hasattr(torch.mps, "synchronize"):
+                                torch.mps.synchronize()
+                            torch.mps.empty_cache()
+                except Exception:
+                    pass
+                try:
+                    import gc
+
+                    gc.collect()
+                except Exception:
+                    pass
 
             # Decode response
             output_text = self.processor.decode(
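The ffmpeg-based sampling branch above delegates to the new abstractcore/media/utils/video_frames.py helper (file 38 in this diff). A hedged standalone sketch of the same call pattern, with example argument values mirroring the defaults used above:

    import tempfile
    from pathlib import Path

    from abstractcore.media.utils.video_frames import extract_video_frames

    out_dir = Path(tempfile.mkdtemp(prefix="abstractcore_hf_video_frames_"))
    frames, timestamps_s = extract_video_frames(
        Path("clip.mov"),                 # example input path
        max_frames=3,                     # mirrors the fallback cap used in the provider
        frame_format="jpg",
        sampling_strategy="uniform",      # or "keyframes"
        max_side=1024,
        output_dir=out_dir,
    )
    print(len(frames), timestamps_s)      # extracted frame paths plus their timestamps (seconds)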
@@ -1080,7 +1520,7 @@ class HuggingFaceProvider(BaseProvider):
             input_tokens = inputs["input_ids"].shape[1]
             output_tokens = len(generated_ids[0]) - input_tokens
 
-            return GenerateResponse(
+            response = GenerateResponse(
                 content=output_text.strip(),
                 model=self.model,
                 finish_reason="stop",
@@ -1093,15 +1533,25 @@ class HuggingFaceProvider(BaseProvider):
                 },
                 gen_time=gen_time
             )
+            if stream:
+                def _single_chunk_stream() -> Iterator[GenerateResponse]:
+                    yield response
+                return _single_chunk_stream()
+            return response
 
         except Exception as e:
             gen_time = (time.time() - start_time) * 1000 if 'start_time' in locals() else 0.0
-            return GenerateResponse(
+            error_resp = GenerateResponse(
                 content=f"Error in vision model generation: {str(e)}",
                 model=self.model,
                 finish_reason="error",
                 gen_time=gen_time
             )
+            if stream:
+                def _error_stream() -> Iterator[GenerateResponse]:
+                    yield error_resp
+                return _error_stream()
+            return error_resp
 
     def _patch_deepseek_for_mps(self):
         """Patch DeepSeek-OCR model to work with MPS instead of CUDA"""
@@ -1177,6 +1627,7 @@ class HuggingFaceProvider(BaseProvider):
             chat_messages.extend(messages)
 
         # Handle media content for the user message - use proper vision format for GGUF models
+        media_enrichment = None
         if media:
             try:
                 from ..architectures.detection import supports_vision
@@ -1222,10 +1673,11 @@ class HuggingFaceProvider(BaseProvider):
                 from ..media.handlers import LocalMediaHandler
                 media_handler = LocalMediaHandler("huggingface", self.model_capabilities, model_name=self.model)
                 multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                media_enrichment = getattr(media_handler, "media_enrichment", None)
                 user_message_content = multimodal_message if isinstance(multimodal_message, str) else prompt
 
             except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                self.logger.warning("Media processing not available. Install with: pip install \"abstractcore[media]\"")
                 user_message_content = prompt
             except Exception as e:
                 self.logger.warning(f"Failed to process media content: {e}")
@@ -1235,6 +1687,27 @@ class HuggingFaceProvider(BaseProvider):
 
         chat_messages.append({"role": "user", "content": user_message_content})
 
+        # Prompt caching (GGUF/llama.cpp): best-effort per-key cache selection.
+        prompt_cache_key = kwargs.get("prompt_cache_key")
+        if isinstance(prompt_cache_key, str) and prompt_cache_key.strip():
+            key = prompt_cache_key.strip()
+            cache_obj = self._prompt_cache_store.get(key)
+            if cache_obj is None:
+                self.prompt_cache_set(key, make_default=False)
+                cache_obj = self._prompt_cache_store.get(key)
+            try:
+                if cache_obj is not None and hasattr(self.llm, "set_cache"):
+                    self.llm.set_cache(cache_obj)
+            except Exception:
+                pass
+        else:
+            # Disable cache for this request when no key is provided.
+            try:
+                if hasattr(self.llm, "set_cache"):
+                    self.llm.set_cache(None)
+            except Exception:
+                pass
+
         # Prepare parameters using unified system
         unified_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_output_tokens = self._get_provider_max_tokens_param(unified_kwargs)
@@ -1242,13 +1715,13 @@ class HuggingFaceProvider(BaseProvider):
         generation_kwargs = {
             "messages": chat_messages,
             "max_tokens": max_output_tokens,  # This is max_output_tokens for llama-cpp
-            "temperature": kwargs.get("temperature", self.temperature),
+            "temperature": unified_kwargs.get("temperature", self.temperature),
             "top_p": kwargs.get("top_p", 0.9),
             "stream": stream
         }
 
         # Add seed if provided (GGUF/llama-cpp supports seed)
-        seed_value = kwargs.get("seed", self.seed)
+        seed_value = unified_kwargs.get("seed")
         if seed_value is not None:
             generation_kwargs["seed"] = seed_value
 
@@ -1305,6 +1778,10 @@ class HuggingFaceProvider(BaseProvider):
             return self._stream_generate_gguf_with_tools(generation_kwargs, tools, has_native_tools, kwargs.get('tool_call_tags'))
         else:
             response = self._single_generate_gguf(generation_kwargs)
+            if media_enrichment:
+                from ..media.enrichment import merge_enrichment_metadata
+
+                response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
 
             # Handle tool execution for both native and prompted responses
             if tools and (response.has_tool_calls() or