abstractcore 2.9.1__py3-none-any.whl → 2.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +7 -27
- abstractcore/apps/extractor.py +33 -100
- abstractcore/apps/intent.py +19 -0
- abstractcore/apps/judge.py +20 -1
- abstractcore/apps/summarizer.py +20 -1
- abstractcore/architectures/detection.py +34 -1
- abstractcore/architectures/response_postprocessing.py +313 -0
- abstractcore/assets/architecture_formats.json +38 -8
- abstractcore/assets/model_capabilities.json +781 -160
- abstractcore/compression/__init__.py +1 -2
- abstractcore/compression/glyph_processor.py +6 -4
- abstractcore/config/main.py +31 -19
- abstractcore/config/manager.py +389 -11
- abstractcore/config/vision_config.py +5 -5
- abstractcore/core/interface.py +151 -3
- abstractcore/core/session.py +16 -10
- abstractcore/download.py +1 -1
- abstractcore/embeddings/manager.py +20 -6
- abstractcore/endpoint/__init__.py +2 -0
- abstractcore/endpoint/app.py +458 -0
- abstractcore/mcp/client.py +3 -1
- abstractcore/media/__init__.py +52 -17
- abstractcore/media/auto_handler.py +42 -22
- abstractcore/media/base.py +44 -1
- abstractcore/media/capabilities.py +12 -33
- abstractcore/media/enrichment.py +105 -0
- abstractcore/media/handlers/anthropic_handler.py +19 -28
- abstractcore/media/handlers/local_handler.py +124 -70
- abstractcore/media/handlers/openai_handler.py +19 -31
- abstractcore/media/processors/__init__.py +4 -2
- abstractcore/media/processors/audio_processor.py +57 -0
- abstractcore/media/processors/office_processor.py +8 -3
- abstractcore/media/processors/pdf_processor.py +46 -3
- abstractcore/media/processors/text_processor.py +22 -24
- abstractcore/media/processors/video_processor.py +58 -0
- abstractcore/media/types.py +97 -4
- abstractcore/media/utils/image_scaler.py +20 -2
- abstractcore/media/utils/video_frames.py +219 -0
- abstractcore/media/vision_fallback.py +136 -22
- abstractcore/processing/__init__.py +32 -3
- abstractcore/processing/basic_deepsearch.py +15 -10
- abstractcore/processing/basic_intent.py +3 -2
- abstractcore/processing/basic_judge.py +3 -2
- abstractcore/processing/basic_summarizer.py +1 -1
- abstractcore/providers/__init__.py +3 -1
- abstractcore/providers/anthropic_provider.py +95 -8
- abstractcore/providers/base.py +1516 -81
- abstractcore/providers/huggingface_provider.py +546 -69
- abstractcore/providers/lmstudio_provider.py +35 -923
- abstractcore/providers/mlx_provider.py +382 -35
- abstractcore/providers/model_capabilities.py +5 -1
- abstractcore/providers/ollama_provider.py +99 -15
- abstractcore/providers/openai_compatible_provider.py +406 -180
- abstractcore/providers/openai_provider.py +188 -44
- abstractcore/providers/openrouter_provider.py +76 -0
- abstractcore/providers/registry.py +61 -5
- abstractcore/providers/streaming.py +138 -33
- abstractcore/providers/vllm_provider.py +92 -817
- abstractcore/server/app.py +461 -13
- abstractcore/server/audio_endpoints.py +139 -0
- abstractcore/server/vision_endpoints.py +1319 -0
- abstractcore/structured/handler.py +316 -41
- abstractcore/tools/common_tools.py +5501 -2012
- abstractcore/tools/comms_tools.py +1641 -0
- abstractcore/tools/core.py +37 -7
- abstractcore/tools/handler.py +4 -9
- abstractcore/tools/parser.py +49 -2
- abstractcore/tools/tag_rewriter.py +2 -1
- abstractcore/tools/telegram_tdlib.py +407 -0
- abstractcore/tools/telegram_tools.py +261 -0
- abstractcore/utils/cli.py +1085 -72
- abstractcore/utils/token_utils.py +2 -0
- abstractcore/utils/truncation.py +29 -0
- abstractcore/utils/version.py +3 -4
- abstractcore/utils/vlm_token_calculator.py +12 -2
- abstractcore-2.11.2.dist-info/METADATA +562 -0
- abstractcore-2.11.2.dist-info/RECORD +133 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
- abstractcore-2.9.1.dist-info/METADATA +0 -1190
- abstractcore-2.9.1.dist-info/RECORD +0 -119
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/providers/huggingface_provider.py

@@ -5,6 +5,7 @@ Supports both transformers models and GGUF models via llama-cpp-python.
 
 import os
 import json
+import threading
 from pathlib import Path
 from typing import List, Dict, Any, Optional, Union, Iterator, Type
 
@@ -39,6 +40,9 @@ from ..exceptions import ModelNotFoundError, format_model_error
 from ..tools import UniversalToolHandler, execute_tools
 from ..events import EventType
 
+
+_MPS_GENERATION_LOCK = threading.Lock()
+
 # Try to import transformers (standard HuggingFace support)
 try:
     from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, pipeline
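Note on the new module-level lock: `_MPS_GENERATION_LOCK` is taken around `generate()` later in this file whenever the provider runs on Apple's MPS backend, so concurrent request threads never drive Metal at the same time. A minimal sketch of the same pattern, independent of AbstractCore (names here are illustrative only):

```python
import threading

# Illustrative stand-in for a backend call that is not safe to run concurrently (e.g. on MPS).
_GENERATION_LOCK = threading.Lock()

def generate_serialized(generate_fn, *args, **kwargs):
    """Run generate_fn under a process-wide lock so overlapping threads execute it one at a time."""
    with _GENERATION_LOCK:
        return generate_fn(*args, **kwargs)
```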
@@ -84,6 +88,22 @@ def _get_local_model_path(model_name: str) -> Optional[str]:
 class HuggingFaceProvider(BaseProvider):
     """HuggingFace provider with dual support for transformers and GGUF models"""
 
+    @staticmethod
+    def _resolve_requested_device(device: Optional[str]) -> Optional[str]:
+        """Resolve the requested device from explicit arg or env override.
+
+        Supported env var: ABSTRACTCORE_HF_DEVICE=cpu|mps|cuda|auto
+        """
+        if isinstance(device, str) and device.strip():
+            return device.strip().lower()
+
+        env_device = os.environ.get("ABSTRACTCORE_HF_DEVICE")
+        if isinstance(env_device, str) and env_device.strip():
+            val = env_device.strip().lower()
+            if val in {"auto", "cpu", "mps", "cuda"}:
+                return val
+        return None
+
     def __init__(self, model: str = "unsloth/Qwen3-4B-Instruct-2507-GGUF",
                  device: Optional[str] = None,
                  n_gpu_layers: Optional[int] = None,
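How the new device resolution is meant to be driven from the caller's side, as far as the diff shows: an explicit `device=` argument wins, otherwise `ABSTRACTCORE_HF_DEVICE` is consulted, otherwise auto-detection runs. A hedged usage sketch; the import path is assumed from the wheel layout above.

```python
import os

# Assumed import path, based on abstractcore/providers/huggingface_provider.py in this wheel.
from abstractcore.providers.huggingface_provider import HuggingFaceProvider

# Force CPU for every HuggingFaceProvider in this process, e.g. in a sandbox without Metal access.
os.environ["ABSTRACTCORE_HF_DEVICE"] = "cpu"
cpu_provider = HuggingFaceProvider(model="unsloth/Qwen3-4B-Instruct-2507-GGUF")

# An explicit constructor argument still takes precedence over the environment override.
cuda_provider = HuggingFaceProvider(model="unsloth/Qwen3-4B-Instruct-2507-GGUF", device="cuda")
```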
@@ -122,7 +142,7 @@ class HuggingFaceProvider(BaseProvider):
         # Store provider-specific configuration
         self.n_gpu_layers = n_gpu_layers
         self.model_type = None  # Will be "transformers" or "gguf"
-        self.device = device
+        self.device = self._resolve_requested_device(device)
 
         # Store transformers-specific parameters
         self.transformers_kwargs = {
@@ -154,7 +174,7 @@ class HuggingFaceProvider(BaseProvider):
             self._setup_device_transformers()
             self._load_transformers_model()
 
-    def
+    def unload_model(self, model_name: str) -> None:
         """
         Unload the model from memory.
 
@@ -187,14 +207,61 @@ class HuggingFaceProvider(BaseProvider):
             if hasattr(self, 'logger'):
                 self.logger.warning(f"Error during unload: {e}")
 
-    def
-        """
+    def supports_prompt_cache(self) -> bool:
+        """GGUF backends can use llama.cpp prompt caching (prefix state cache)."""
+        return getattr(self, "model_type", None) == "gguf"
+
+    def prompt_cache_set(
+        self,
+        key: str,
+        *,
+        make_default: bool = True,
+        ttl_s: Optional[float] = None,
+        capacity_bytes: Optional[int] = None,
+        **kwargs,
+    ) -> bool:
+        """Create/reset a llama.cpp prompt cache for the given key (GGUF only)."""
+        _ = kwargs
+        normalized = self._normalize_prompt_cache_key(key)
+        if normalized is None:
+            return False
+        if not self.supports_prompt_cache():
+            return False
+        if not super().prompt_cache_set(normalized, make_default=make_default):
+            return False
+
         try:
-
+            from llama_cpp.llama_cache import LlamaRAMCache
+        except Exception:
+            return False
+
+        cap = int(capacity_bytes) if isinstance(capacity_bytes, int) and capacity_bytes > 0 else (512 << 20)
+        cache_obj = LlamaRAMCache(capacity_bytes=cap)
+
+        try:
+            self._prompt_cache_store.set(normalized, cache_obj, ttl_s=ttl_s, meta={"backend": "llama_cpp"})
+        except Exception:
+            return False
+
+        # Best-effort: activate this cache on the shared llama instance.
+        try:
+            if getattr(self, "llm", None) is not None and hasattr(self.llm, "set_cache"):
+                self.llm.set_cache(cache_obj)
         except Exception:
-            # Silently handle any cleanup errors - this is expected during shutdown
             pass
 
+        return True
+
+    def prompt_cache_clear(self, key: Optional[str] = None) -> bool:
+        """Clear llama.cpp prompt caches (GGUF only; best-effort)."""
+        cleared = super().prompt_cache_clear(key)
+        try:
+            if getattr(self, "llm", None) is not None and hasattr(self.llm, "set_cache"):
+                self.llm.set_cache(None)
+        except Exception:
+            pass
+        return cleared
+
     def _is_gguf_model(self, model: str) -> bool:
         """Detect if the model is a GGUF model"""
         # Check if it's a .gguf file path
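A hedged sketch of how the new prompt-cache hooks fit together for GGUF models. `prompt_cache_set`, `prompt_cache_clear`, and the per-request `prompt_cache_key` all appear in this diff; the exact `generate()` signature does not, so the call below is an assumption rather than documented API.

```python
from abstractcore.providers.huggingface_provider import HuggingFaceProvider  # assumed import path

provider = HuggingFaceProvider(model="unsloth/Qwen3-4B-Instruct-2507-GGUF")

# Only GGUF-backed models support llama.cpp prompt caching (model_type == "gguf").
if provider.supports_prompt_cache():
    # Allocate a llama.cpp RAM cache for this key (default capacity is 512 MiB, here 256 MiB).
    provider.prompt_cache_set("support-bot", capacity_bytes=256 << 20)

# Requests carrying the same key reuse the cached prefix state; requests without a key run
# with caching disabled (the GGUF path calls self.llm.set_cache(None) in that case).
response = provider.generate(
    "Summarize the open tickets.",
    prompt_cache_key="support-bot",  # assumption: generate() forwards this kwarg to the GGUF path
)

provider.prompt_cache_clear("support-bot")
```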
@@ -238,19 +305,75 @@ class HuggingFaceProvider(BaseProvider):
         return any(vision_keyword in model_lower for vision_keyword in vision_models)
 
     def _setup_device_transformers(self):
-        """Setup device for transformers models
+        """Setup device for transformers models (best-effort).
+
+        We validate explicit device requests even when Transformers isn't available,
+        since Torch availability (MPS/CUDA) may still matter for downstream behavior.
+        """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            self.device = "cpu"
+            return
+
+        requested = str(self.device or "").strip().lower() if isinstance(self.device, str) else ""
+        if requested and requested != "auto":
+            # Respect explicit user/env request, but fall back safely if unavailable.
+            if requested == "mps":
+                if hasattr(torch.backends, "mps") and torch.backends.mps.is_built() and not torch.backends.mps.is_available():
+                    self.logger.warning(
+                        "HuggingFaceProvider requested device=mps but MPS is not available. "
+                        "This usually means the process cannot see Metal devices (sandboxed execution). "
+                        "Falling back to CPU. To silence this, set ABSTRACTCORE_HF_DEVICE=cpu."
+                    )
+                    self.device = "cpu"
+                else:
+                    self.device = "mps"
+            elif requested == "cuda":
+                if torch.cuda.is_available():
+                    self.device = "cuda"
+                else:
+                    self.logger.warning(
+                        "HuggingFaceProvider requested device=cuda but CUDA is not available; falling back to CPU."
+                    )
+                    self.device = "cpu"
+            else:
+                self.device = "cpu"
+            return
+
         if not TRANSFORMERS_AVAILABLE:
+            # Without transformers, default to CPU for safety.
+            self.device = "cpu"
             return
 
-
-
-        elif torch.backends.mps.is_available():
+        # Auto device selection.
+        if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
             self.device = "mps"
         elif torch.cuda.is_available():
             self.device = "cuda"
         else:
             self.device = "cpu"
 
+        # Apple Silicon: MPS built but unavailable is usually a sandbox / Metal visibility issue.
+        try:
+            import platform
+
+            if (
+                self.device == "cpu"
+                and platform.system() == "Darwin"
+                and platform.machine() == "arm64"
+                and hasattr(torch.backends, "mps")
+                and torch.backends.mps.is_built()
+                and not torch.backends.mps.is_available()
+            ):
+                self.logger.warning(
+                    "PyTorch was built with MPS support, but MPS is not available. "
+                    "This often indicates the process cannot access Metal devices (sandboxed execution). "
+                    "Run outside the sandbox or force CPU via ABSTRACTCORE_HF_DEVICE=cpu."
+                )
+        except Exception:
+            pass
+
     def _setup_device_gguf(self):
         """Setup device for GGUF models"""
         # Auto-detect GPU layers if not specified
@@ -396,6 +519,15 @@ class HuggingFaceProvider(BaseProvider):
         # Respect offline-first configuration
         if _config.should_force_local_files_only():
             vision_kwargs['local_files_only'] = True
+
+        # Safer defaults on GPU backends: float16 unless caller provided torch_dtype.
+        try:
+            if self.device in {"mps", "cuda"} and "torch_dtype" not in vision_kwargs:
+                import torch as _torch
+
+                vision_kwargs["torch_dtype"] = _torch.float16
+        except Exception:
+            pass
 
         # Use local cache path if offline mode is enabled and model is cached
         model_path = self.model
@@ -419,6 +551,11 @@ class HuggingFaceProvider(BaseProvider):
         # Move to device (only if not using device_map)
         if self.device in ["cuda", "mps"] and 'device_map' not in self.transformers_kwargs:
             self.model_instance = self.model_instance.to(self.device)
+
+        try:
+            self.model_instance.eval()
+        except Exception:
+            pass
 
         # For vision models, we don't use the standard pipeline
         self.pipeline = None
@@ -737,7 +874,7 @@ class HuggingFaceProvider(BaseProvider):
         # Check if Outlines is required but unavailable
         if self.structured_output_method == "native_outlines" and not OUTLINES_AVAILABLE:
             return GenerateResponse(
-                content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install abstractcore[huggingface]",
+                content="Error: structured_output_method='native_outlines' requires Outlines library. Install with: pip install \"abstractcore[huggingface]\"",
                 model=self.model,
                 finish_reason="error"
             )
@@ -787,6 +924,7 @@ class HuggingFaceProvider(BaseProvider):
 
         # Build input text with tool and media support
         # Handle media content first if present
+        media_enrichment = None
         if media:
             try:
                 from ..media.handlers import LocalMediaHandler
@@ -794,6 +932,7 @@ class HuggingFaceProvider(BaseProvider):
 
                 # Create multimodal message combining text and media
                 multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                media_enrichment = getattr(media_handler, "media_enrichment", None)
 
                 # For local providers, we get text-embedded content
                 if isinstance(multimodal_message, str):
@@ -812,7 +951,7 @@ class HuggingFaceProvider(BaseProvider):
                 else:
                     prompt = str(multimodal_message["content"])
             except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                self.logger.warning("Media processing not available. Install with: pip install \"abstractcore[media]\"")
             except Exception as e:
                 self.logger.warning(f"Failed to process media content: {e}")
 
@@ -821,15 +960,19 @@ class HuggingFaceProvider(BaseProvider):
         # Generation parameters using unified system
         generation_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_new_tokens = self._get_provider_max_tokens_param(generation_kwargs)
-        temperature =
+        temperature = generation_kwargs.get("temperature", self.temperature)
         top_p = kwargs.get("top_p", 0.9)
-        seed_value =
+        seed_value = generation_kwargs.get("seed")
 
         try:
             if stream:
                 return self._stream_generate_transformers_with_tools(input_text, max_new_tokens, temperature, top_p, tools, kwargs.get('tool_call_tags'), seed_value)
             else:
                 response = self._single_generate_transformers(input_text, max_new_tokens, temperature, top_p, seed_value)
+                if media_enrichment:
+                    from ..media.enrichment import merge_enrichment_metadata
+
+                    response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
 
                 # Handle tool execution for prompted models
                 if tools and self.tool_handler.supports_prompted and response.content:
@@ -982,41 +1125,101 @@ class HuggingFaceProvider(BaseProvider):
             )
 
         try:
+            # Server/gateway sometimes call providers with prompt="" + messages=[...] + media=[...].
+            # For multimodal models, the user text and the media must live in the SAME user turn.
+            # Best-effort: if prompt is empty, lift the last user message text into the prompt and
+            # remove that message from the history to avoid duplication.
+            prompt_text = prompt
+            messages_for_context = list(messages) if isinstance(messages, list) else None
+            if (not isinstance(prompt_text, str) or not prompt_text.strip()) and media and messages_for_context:
+                for i in range(len(messages_for_context) - 1, -1, -1):
+                    msg = messages_for_context[i] or {}
+                    role = str(msg.get("role", "") or "").strip().lower()
+                    if role != "user":
+                        continue
+                    content = msg.get("content", "")
+                    lifted = None
+                    if isinstance(content, str) and content.strip():
+                        lifted = content.strip()
+                    elif isinstance(content, list):
+                        # OpenAI-style list content: [{"type":"text","text":"..."}, ...]
+                        for item in content:
+                            if not isinstance(item, dict):
+                                continue
+                            if str(item.get("type", "") or "").strip().lower() == "text":
+                                text_val = item.get("text")
+                                if isinstance(text_val, str) and text_val.strip():
+                                    lifted = text_val.strip()
+                                    break
+                    if lifted:
+                        prompt_text = lifted
+                        del messages_for_context[i]
+                        break
+
             # Build messages for vision model
             chat_messages = []
 
             if system_prompt:
                 chat_messages.append({"role": "system", "content": system_prompt})
 
-            if
-                chat_messages.extend(
+            if messages_for_context:
+                chat_messages.extend(messages_for_context)
 
             # Build user message with media content
             user_content = []
 
             # Add text content
-            if
-                user_content.append({"type": "text", "text":
+            if isinstance(prompt_text, str) and prompt_text.strip():
+                user_content.append({"type": "text", "text": prompt_text.strip()})
 
-            # Add media content (images)
+            # Add media content (images, video)
+            has_video = False
+            try:
+                from ..media.types import MediaType, ContentFormat
+            except Exception:
+                MediaType = None  # type: ignore[assignment]
+                ContentFormat = None  # type: ignore[assignment]
+
             if media:
                 for media_item in media:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    media_type = getattr(media_item, "media_type", None)
+
+                    # Text markers (e.g. provenance / policy annotations) should be preserved for the model.
+                    if MediaType is not None and media_type == MediaType.TEXT:
+                        txt = getattr(media_item, "content", None)
+                        if isinstance(txt, str) and txt.strip():
+                            user_content.append({"type": "text", "text": txt.strip()})
+                        continue
+
+                    # Video inputs
+                    if MediaType is not None and media_type == MediaType.VIDEO:
+                        has_video = True
+                        # The actual video content is provided to the processor via `videos=...`;
+                        # the chat template only needs a `<video>` placeholder token.
+                        user_content.append({"type": "video"})
+                        continue
+
+                    # Image inputs
+                    if MediaType is None or media_type == MediaType.IMAGE:
+                        if getattr(media_item, "file_path", None):
+                            user_content.append({"type": "image", "url": str(media_item.file_path)})
+                            continue
+
+                        content = getattr(media_item, "content", None)
+                        if not content:
+                            continue
+
+                        content_format = getattr(media_item, "content_format", None)
+                        is_base64 = False
+                        if ContentFormat is not None and content_format == ContentFormat.BASE64:
+                            is_base64 = True
+                        elif isinstance(content_format, str) and content_format.strip().lower() == "base64":
+                            is_base64 = True
+
+                        if is_base64:
+                            mime_type = getattr(media_item, "mime_type", "image/png")
+                            data_url = f"data:{mime_type};base64,{content}"
+                            user_content.append({"type": "image", "url": data_url})
 
             # Add user message
             chat_messages.append({
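The prompt-lifting step above is easier to follow on a concrete message list. The function below re-implements just that step as standalone code for illustration; it is not part of the package.

```python
from typing import Any, Dict, List, Optional, Tuple

def lift_last_user_text(prompt: str, messages: List[Dict[str, Any]]) -> Tuple[str, List[Dict[str, Any]]]:
    """If prompt is empty, move the last user message's text into the prompt and drop that message."""
    remaining = list(messages)
    if prompt.strip():
        return prompt, remaining
    for i in range(len(remaining) - 1, -1, -1):
        msg = remaining[i] or {}
        if str(msg.get("role", "")).strip().lower() != "user":
            continue
        content = msg.get("content", "")
        lifted: Optional[str] = None
        if isinstance(content, str) and content.strip():
            lifted = content.strip()
        elif isinstance(content, list):
            # OpenAI-style list content: [{"type": "text", "text": "..."}, ...]
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text" and str(item.get("text", "")).strip():
                    lifted = str(item["text"]).strip()
                    break
        if lifted:
            del remaining[i]
            return lifted, remaining
    return prompt, remaining

prompt, history = lift_last_user_text("", [{"role": "user", "content": "What is in this video?"}])
assert prompt == "What is in this video?" and history == []
```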
@@ -1024,48 +1227,285 @@ class HuggingFaceProvider(BaseProvider):
                 "content": user_content
             })
 
-            # Process messages using the processor
-
-
-
-
-
-
-
+            # Process messages using the processor.
+            #
+            # Some multimodal processors (e.g. LlavaNextVideoProcessor) return a *string*
+            # from apply_chat_template; for those we must call the processor separately
+            # with explicit images/videos tensors and keep video frame counts bounded.
+            if has_video:
+                # Resolve max frames for video sampling (keep small to avoid huge context).
+                max_frames_raw = kwargs.get("video_max_frames", None)
+                if max_frames_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        cfg_video = getattr(get_config_manager().config, "video", None)
+                        max_frames_raw = getattr(cfg_video, "max_frames_native", None) if cfg_video is not None else None
+                        if max_frames_raw is None:
+                            max_frames_raw = getattr(cfg_video, "max_frames", None) if cfg_video is not None else None
+                    except Exception:
+                        max_frames_raw = 3
+                try:
+                    max_video_frames = max(1, int(max_frames_raw))
+                except Exception:
+                    max_video_frames = 3
+
+                sampling_strategy_raw = kwargs.get("video_sampling_strategy", None)
+                if sampling_strategy_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        sampling_strategy_raw = getattr(get_config_manager().config, "video", None).sampling_strategy  # type: ignore[union-attr]
+                    except Exception:
+                        sampling_strategy_raw = "uniform"
+                sampling_strategy = str(sampling_strategy_raw or "uniform").strip().lower()
+                if sampling_strategy not in {"uniform", "keyframes"}:
+                    sampling_strategy = "uniform"
+
+                max_frame_side_raw = kwargs.get("video_max_frame_side", None)
+                if max_frame_side_raw is None:
+                    try:
+                        from ..config.manager import get_config_manager
+
+                        max_frame_side_raw = getattr(get_config_manager().config, "video", None).max_frame_side  # type: ignore[union-attr]
+                    except Exception:
+                        max_frame_side_raw = 1024
+                try:
+                    max_frame_side = int(max_frame_side_raw) if max_frame_side_raw is not None else None
+                except Exception:
+                    max_frame_side = 1024
+                if isinstance(max_frame_side, int) and max_frame_side <= 0:
+                    max_frame_side = None
+
+                # Build multimodal-typed messages for chat_template renderers that expect list content.
+                # NOTE: Many HF native-video VLMs are brittle in multi-turn mode if prior turns
+                # referenced media but we only retained text history (no `<video>` placeholders).
+                # This can cause follow-ups like "and this one?" to over-weight the previous
+                # text-only answer and ignore the newly attached video.
+                #
+                # To make follow-ups robust, collapse prior USER/ASSISTANT turns into a single
+                # text block inside the current user message, and keep exactly one `<video>`
+                # placeholder (the current attachment) in the chat template input.
+                history_lines = []
+                if messages_for_context:
+                    for msg in messages_for_context:
+                        role = str(msg.get("role", "user") or "").strip().lower()
+                        if role not in {"user", "assistant"}:
+                            continue
+                        content = msg.get("content", "")
+                        text = ""
+                        if isinstance(content, str):
+                            text = content
+                        elif isinstance(content, list):
+                            # OpenAI-style list content: [{"type":"text","text":"..."}, ...]
+                            for item in content:
+                                if not isinstance(item, dict):
+                                    continue
+                                if str(item.get("type", "") or "").strip().lower() != "text":
+                                    continue
+                                v = item.get("text")
+                                if isinstance(v, str) and v.strip():
+                                    text = v
+                                    break
+                        else:
+                            text = str(content)
+
+                        text = str(text or "").strip()
+                        if not text:
+                            continue
+                        prefix = "USER" if role == "user" else "ASSISTANT"
+                        history_lines.append(f"{prefix}: {text}")
+
+                if history_lines:
+                    history_block = "Prior chat context (text-only):\n" + "\n".join(history_lines) + "\n\n"
+                    # Cap to avoid pathological prompt growth; keep the most recent tail.
+                    if len(history_block) > 8_000:
+                        history_block = "Prior chat context (text-only; truncated):\n…\n" + history_block[-7_800:]
+                    user_content = [{"type": "text", "text": history_block}] + list(user_content)
+
+                mm_messages = []
+                if system_prompt:
+                    mm_messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+                mm_messages.append({"role": "user", "content": user_content})
+
+                prompt_text = self.processor.apply_chat_template(mm_messages, add_generation_prompt=True)
+
+                # Prepare explicit video inputs for the processor.
+                #
+                # Prefer ffmpeg-sampled frames (our own extraction) over relying on torchvision/torchcodec
+                # decoding inside Transformers, which can vary by platform/codec support (notably for .mov).
+                video_paths = []
+                image_inputs = []
+                for media_item in (media or []):
+                    if MediaType is not None and getattr(media_item, "media_type", None) == MediaType.VIDEO:
+                        video_path = getattr(media_item, "file_path", None) or getattr(media_item, "content", None)
+                        if not isinstance(video_path, str) or not video_path.strip():
+                            raise ValueError("Video MediaContent must provide file_path for HuggingFace video models.")
+                        video_paths.append(video_path)
+                    elif MediaType is not None and getattr(media_item, "media_type", None) == MediaType.IMAGE:
+                        fp = getattr(media_item, "file_path", None)
+                        if isinstance(fp, str) and fp.strip():
+                            try:
+                                from PIL import Image as PILImage
+                            except ImportError as e:
+                                raise RuntimeError(f"PIL is required for HuggingFace image inputs: {e}")
+                            image_inputs.append(PILImage.open(fp).convert("RGB"))
+
+                processor_call: Dict[str, Any] = {"text": prompt_text, "return_tensors": "pt"}
+                if image_inputs:
+                    processor_call["images"] = image_inputs if len(image_inputs) > 1 else image_inputs[0]
+                if video_paths:
+                    # Try ffmpeg frame sampling first.
+                    video_frame_inputs = []
+                    temp_dirs = []
+                    try:
+                        from pathlib import Path
+                        import tempfile
+
+                        from ..media.utils.video_frames import extract_video_frames
+                        from PIL import Image as PILImage
+
+                        for vp in video_paths:
+                            out_dir = Path(tempfile.mkdtemp(prefix="abstractcore_hf_video_frames_"))
+                            temp_dirs.append(out_dir)
+                            frames, _timestamps_s = extract_video_frames(
+                                Path(vp),
+                                max_frames=max_video_frames,
+                                frame_format="jpg",
+                                sampling_strategy=sampling_strategy,
+                                max_side=max_frame_side,
+                                output_dir=out_dir,
+                            )
+                            if not frames:
+                                raise RuntimeError("No frames extracted")
+                            video_frame_inputs.append([PILImage.open(p).convert("RGB") for p in frames])
+
+                        # Single video -> pass list[PIL]; multiple videos -> list[list[PIL]]
+                        processor_call["videos"] = (
+                            video_frame_inputs[0]
+                            if len(video_frame_inputs) == 1
+                            else video_frame_inputs
+                        )
+                    except Exception:
+                        # If anything goes wrong with ffmpeg sampling, fall back to transformers decode.
+                        processor_call["videos"] = video_paths if len(video_paths) > 1 else video_paths[0]
+                        processor_call["videos_kwargs"] = {"do_sample_frames": True, "num_frames": max_video_frames}
+                    finally:
+                        # Cleanup extracted frames directories (frames are already loaded into memory as PIL).
+                        for d in temp_dirs:
+                            try:
+                                import shutil
+
+                                shutil.rmtree(d, ignore_errors=True)
+                            except Exception:
+                                pass
+
+                inputs = self.processor(**processor_call)
+                if hasattr(inputs, "to"):
+                    inputs = inputs.to(self.model_instance.device)
+            else:
+                templated = self.processor.apply_chat_template(
+                    chat_messages,
+                    tokenize=True,
+                    add_generation_prompt=True,
+                    return_dict=True,
+                    return_tensors="pt",
+                )
+                if isinstance(templated, str):
+                    # Processor returned a prompt string; fall back to explicit processor call.
+                    image_inputs = []
+                    for media_item in (media or []):
+                        if MediaType is not None and getattr(media_item, "media_type", None) == MediaType.IMAGE:
+                            fp = getattr(media_item, "file_path", None)
+                            if isinstance(fp, str) and fp.strip():
+                                try:
+                                    from PIL import Image as PILImage
+                                except ImportError as e:
+                                    raise RuntimeError(f"PIL is required for HuggingFace image inputs: {e}")
+                                image_inputs.append(PILImage.open(fp).convert("RGB"))
+
+                    processor_call: Dict[str, Any] = {"text": templated, "return_tensors": "pt"}
+                    if image_inputs:
+                        processor_call["images"] = image_inputs if len(image_inputs) > 1 else image_inputs[0]
+                    inputs = self.processor(**processor_call)
+                    if hasattr(inputs, "to"):
+                        inputs = inputs.to(self.model_instance.device)
+                else:
+                    inputs = templated.to(self.model_instance.device)
 
-
+            temperature_value = kwargs.get("temperature", self.temperature)
+            # For HF multimodal video models, default to greedy decoding unless the caller explicitly
+            # provided a temperature. This avoids premature EOS producing unusably short answers.
+            if has_video and ("temperature" in kwargs) and kwargs.get("temperature") is None:
+                temperature_value = 0.0
+            if temperature_value is None:
+                temperature_value = self.temperature
+
+            max_new_tokens_raw = kwargs.get("max_output_tokens", None)
+            if max_new_tokens_raw is None:
+                max_new_tokens_raw = kwargs.get("max_tokens", None)
+            if max_new_tokens_raw is None:
+                max_new_tokens_raw = self.max_output_tokens or 512
+            try:
+                max_new_tokens_value = max(1, int(max_new_tokens_raw))
+            except Exception:
+                max_new_tokens_value = int(self.max_output_tokens or 512)
+
+            do_sample = True
+            try:
+                if temperature_value is None or float(temperature_value) <= 0:
+                    do_sample = False
+                    temperature_value = 0.0
+            except Exception:
+                do_sample = True
+
             generation_kwargs = {
-                "max_new_tokens":
-                "temperature":
-                "do_sample":
+                "max_new_tokens": max_new_tokens_value,
+                "temperature": temperature_value,
+                "do_sample": do_sample,
                 "pad_token_id": self.processor.tokenizer.eos_token_id,
             }
 
             # Add seed if provided
-            seed_value = kwargs.get("seed", self.seed)
+            seed_value = self._normalize_seed(kwargs.get("seed", self.seed))
             if seed_value is not None:
                 torch.manual_seed(seed_value)
                 if torch.cuda.is_available():
                     torch.cuda.manual_seed_all(seed_value)
 
             # Generate response
-
-
-
-
-
-
-
-                # Move model and inputs to CPU
-                cpu_model = self.model_instance.to('cpu')
-                cpu_inputs = {k: v.to('cpu') if hasattr(v, 'to') else v for k, v in inputs.items()}
-                generated_ids = cpu_model.generate(**cpu_inputs, **generation_kwargs)
-                # Move model back to original device
-                self.model_instance.to(self.model_instance.device)
+            generated_ids = None
+            try:
+                with torch.inference_mode():
+                    use_mps_lock = str(getattr(self, "device", "") or "").strip().lower() == "mps"
+                    if use_mps_lock:
+                        with _MPS_GENERATION_LOCK:
+                            generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
                     else:
-
-
-
+                        generated_ids = self.model_instance.generate(**inputs, **generation_kwargs)
+            except RuntimeError as e:
+                if str(getattr(self, "device", "") or "").strip().lower() == "mps":
+                    raise RuntimeError(
+                        "HuggingFaceProvider vision/video generation failed on MPS. "
+                        "If this persists, force CPU via ABSTRACTCORE_HF_DEVICE=cpu."
+                    ) from e
+                raise
+            finally:
+                # Best-effort: keep MPS memory pressure low between calls.
+                try:
+                    if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
+                        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                            if hasattr(torch.mps, "synchronize"):
+                                torch.mps.synchronize()
+                            torch.mps.empty_cache()
+                except Exception:
+                    pass
+                try:
+                    import gc
+
+                    gc.collect()
+                except Exception:
+                    pass
 
             # Decode response
             output_text = self.processor.decode(
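The ffmpeg-first video path samples a bounded number of frames and hands PIL images to the processor instead of letting Transformers decode the file. A sketch of the same flow outside the provider; the `extract_video_frames` keyword arguments and `(frame_paths, timestamps)` return shape are inferred from the call in this hunk, and `clip.mov` is a hypothetical input.

```python
import tempfile
from pathlib import Path

from PIL import Image

# Helper added in this release (abstractcore/media/utils/video_frames.py); signature inferred from the diff.
from abstractcore.media.utils.video_frames import extract_video_frames

out_dir = Path(tempfile.mkdtemp(prefix="frames_"))
frame_paths, timestamps_s = extract_video_frames(
    Path("clip.mov"),              # hypothetical video file
    max_frames=3,                  # keep the frame count small, as the provider does by default
    frame_format="jpg",
    sampling_strategy="uniform",   # or "keyframes"
    max_side=1024,
    output_dir=out_dir,
)
frames = [Image.open(p).convert("RGB") for p in frame_paths]

# With the frames in memory, a multimodal processor is called with explicit videos=,
# mirroring the provider code above:
#   inputs = processor(text=prompt_text, videos=frames, return_tensors="pt")
```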
@@ -1080,7 +1520,7 @@ class HuggingFaceProvider(BaseProvider):
             input_tokens = inputs["input_ids"].shape[1]
             output_tokens = len(generated_ids[0]) - input_tokens
 
-
+            response = GenerateResponse(
                 content=output_text.strip(),
                 model=self.model,
                 finish_reason="stop",
@@ -1093,15 +1533,25 @@ class HuggingFaceProvider(BaseProvider):
                 },
                 gen_time=gen_time
             )
+            if stream:
+                def _single_chunk_stream() -> Iterator[GenerateResponse]:
+                    yield response
+                return _single_chunk_stream()
+            return response
 
         except Exception as e:
             gen_time = (time.time() - start_time) * 1000 if 'start_time' in locals() else 0.0
-
+            error_resp = GenerateResponse(
                 content=f"Error in vision model generation: {str(e)}",
                 model=self.model,
                 finish_reason="error",
                 gen_time=gen_time
             )
+            if stream:
+                def _error_stream() -> Iterator[GenerateResponse]:
+                    yield error_resp
+                return _error_stream()
+            return error_resp
 
     def _patch_deepseek_for_mps(self):
         """Patch DeepSeek-OCR model to work with MPS instead of CUDA"""
@@ -1177,6 +1627,7 @@ class HuggingFaceProvider(BaseProvider):
             chat_messages.extend(messages)
 
         # Handle media content for the user message - use proper vision format for GGUF models
+        media_enrichment = None
        if media:
             try:
                 from ..architectures.detection import supports_vision
@@ -1222,10 +1673,11 @@ class HuggingFaceProvider(BaseProvider):
                     from ..media.handlers import LocalMediaHandler
                     media_handler = LocalMediaHandler("huggingface", self.model_capabilities, model_name=self.model)
                     multimodal_message = media_handler.create_multimodal_message(prompt, media)
+                    media_enrichment = getattr(media_handler, "media_enrichment", None)
                     user_message_content = multimodal_message if isinstance(multimodal_message, str) else prompt
 
             except ImportError:
-                self.logger.warning("Media processing not available. Install with: pip install abstractcore[media]")
+                self.logger.warning("Media processing not available. Install with: pip install \"abstractcore[media]\"")
                 user_message_content = prompt
             except Exception as e:
                 self.logger.warning(f"Failed to process media content: {e}")
@@ -1235,6 +1687,27 @@ class HuggingFaceProvider(BaseProvider):
 
         chat_messages.append({"role": "user", "content": user_message_content})
 
+        # Prompt caching (GGUF/llama.cpp): best-effort per-key cache selection.
+        prompt_cache_key = kwargs.get("prompt_cache_key")
+        if isinstance(prompt_cache_key, str) and prompt_cache_key.strip():
+            key = prompt_cache_key.strip()
+            cache_obj = self._prompt_cache_store.get(key)
+            if cache_obj is None:
+                self.prompt_cache_set(key, make_default=False)
+                cache_obj = self._prompt_cache_store.get(key)
+            try:
+                if cache_obj is not None and hasattr(self.llm, "set_cache"):
+                    self.llm.set_cache(cache_obj)
+            except Exception:
+                pass
+        else:
+            # Disable cache for this request when no key is provided.
+            try:
+                if hasattr(self.llm, "set_cache"):
+                    self.llm.set_cache(None)
+            except Exception:
+                pass
+
         # Prepare parameters using unified system
         unified_kwargs = self._prepare_generation_kwargs(**kwargs)
         max_output_tokens = self._get_provider_max_tokens_param(unified_kwargs)
@@ -1242,13 +1715,13 @@ class HuggingFaceProvider(BaseProvider):
         generation_kwargs = {
             "messages": chat_messages,
             "max_tokens": max_output_tokens,  # This is max_output_tokens for llama-cpp
-            "temperature":
+            "temperature": unified_kwargs.get("temperature", self.temperature),
             "top_p": kwargs.get("top_p", 0.9),
             "stream": stream
         }
 
         # Add seed if provided (GGUF/llama-cpp supports seed)
-        seed_value =
+        seed_value = unified_kwargs.get("seed")
         if seed_value is not None:
             generation_kwargs["seed"] = seed_value
 
@@ -1305,6 +1778,10 @@ class HuggingFaceProvider(BaseProvider):
             return self._stream_generate_gguf_with_tools(generation_kwargs, tools, has_native_tools, kwargs.get('tool_call_tags'))
         else:
             response = self._single_generate_gguf(generation_kwargs)
+            if media_enrichment:
+                from ..media.enrichment import merge_enrichment_metadata
+
+                response.metadata = merge_enrichment_metadata(response.metadata, media_enrichment)
 
             # Handle tool execution for both native and prompted responses
             if tools and (response.has_tool_calls() or