abstractcore-2.9.1-py3-none-any.whl → abstractcore-2.11.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/extractor.py +33 -100
  3. abstractcore/apps/intent.py +19 -0
  4. abstractcore/apps/judge.py +20 -1
  5. abstractcore/apps/summarizer.py +20 -1
  6. abstractcore/architectures/detection.py +34 -1
  7. abstractcore/architectures/response_postprocessing.py +313 -0
  8. abstractcore/assets/architecture_formats.json +38 -8
  9. abstractcore/assets/model_capabilities.json +781 -160
  10. abstractcore/compression/__init__.py +1 -2
  11. abstractcore/compression/glyph_processor.py +6 -4
  12. abstractcore/config/main.py +31 -19
  13. abstractcore/config/manager.py +389 -11
  14. abstractcore/config/vision_config.py +5 -5
  15. abstractcore/core/interface.py +151 -3
  16. abstractcore/core/session.py +16 -10
  17. abstractcore/download.py +1 -1
  18. abstractcore/embeddings/manager.py +20 -6
  19. abstractcore/endpoint/__init__.py +2 -0
  20. abstractcore/endpoint/app.py +458 -0
  21. abstractcore/mcp/client.py +3 -1
  22. abstractcore/media/__init__.py +52 -17
  23. abstractcore/media/auto_handler.py +42 -22
  24. abstractcore/media/base.py +44 -1
  25. abstractcore/media/capabilities.py +12 -33
  26. abstractcore/media/enrichment.py +105 -0
  27. abstractcore/media/handlers/anthropic_handler.py +19 -28
  28. abstractcore/media/handlers/local_handler.py +124 -70
  29. abstractcore/media/handlers/openai_handler.py +19 -31
  30. abstractcore/media/processors/__init__.py +4 -2
  31. abstractcore/media/processors/audio_processor.py +57 -0
  32. abstractcore/media/processors/office_processor.py +8 -3
  33. abstractcore/media/processors/pdf_processor.py +46 -3
  34. abstractcore/media/processors/text_processor.py +22 -24
  35. abstractcore/media/processors/video_processor.py +58 -0
  36. abstractcore/media/types.py +97 -4
  37. abstractcore/media/utils/image_scaler.py +20 -2
  38. abstractcore/media/utils/video_frames.py +219 -0
  39. abstractcore/media/vision_fallback.py +136 -22
  40. abstractcore/processing/__init__.py +32 -3
  41. abstractcore/processing/basic_deepsearch.py +15 -10
  42. abstractcore/processing/basic_intent.py +3 -2
  43. abstractcore/processing/basic_judge.py +3 -2
  44. abstractcore/processing/basic_summarizer.py +1 -1
  45. abstractcore/providers/__init__.py +3 -1
  46. abstractcore/providers/anthropic_provider.py +95 -8
  47. abstractcore/providers/base.py +1516 -81
  48. abstractcore/providers/huggingface_provider.py +546 -69
  49. abstractcore/providers/lmstudio_provider.py +35 -923
  50. abstractcore/providers/mlx_provider.py +382 -35
  51. abstractcore/providers/model_capabilities.py +5 -1
  52. abstractcore/providers/ollama_provider.py +99 -15
  53. abstractcore/providers/openai_compatible_provider.py +406 -180
  54. abstractcore/providers/openai_provider.py +188 -44
  55. abstractcore/providers/openrouter_provider.py +76 -0
  56. abstractcore/providers/registry.py +61 -5
  57. abstractcore/providers/streaming.py +138 -33
  58. abstractcore/providers/vllm_provider.py +92 -817
  59. abstractcore/server/app.py +461 -13
  60. abstractcore/server/audio_endpoints.py +139 -0
  61. abstractcore/server/vision_endpoints.py +1319 -0
  62. abstractcore/structured/handler.py +316 -41
  63. abstractcore/tools/common_tools.py +5501 -2012
  64. abstractcore/tools/comms_tools.py +1641 -0
  65. abstractcore/tools/core.py +37 -7
  66. abstractcore/tools/handler.py +4 -9
  67. abstractcore/tools/parser.py +49 -2
  68. abstractcore/tools/tag_rewriter.py +2 -1
  69. abstractcore/tools/telegram_tdlib.py +407 -0
  70. abstractcore/tools/telegram_tools.py +261 -0
  71. abstractcore/utils/cli.py +1085 -72
  72. abstractcore/utils/token_utils.py +2 -0
  73. abstractcore/utils/truncation.py +29 -0
  74. abstractcore/utils/version.py +3 -4
  75. abstractcore/utils/vlm_token_calculator.py +12 -2
  76. abstractcore-2.11.2.dist-info/METADATA +562 -0
  77. abstractcore-2.11.2.dist-info/RECORD +133 -0
  78. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/WHEEL +1 -1
  79. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/entry_points.txt +1 -0
  80. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  81. abstractcore-2.9.1.dist-info/RECORD +0 -119
  82. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/licenses/LICENSE +0 -0
  83. {abstractcore-2.9.1.dist-info → abstractcore-2.11.2.dist-info}/top_level.txt +0 -0
abstractcore/server/app.py

@@ -33,6 +33,8 @@ import urllib.parse
  import argparse
  import sys
  import logging
+ import threading
+ import httpx
  from typing import List, Dict, Any, Optional, Literal, Union, Iterator, Tuple, Annotated
  from enum import Enum
  from fastapi import FastAPI, HTTPException, Request, Query, Body
@@ -117,6 +119,26 @@ app.add_middleware(
  allow_headers=["*"],
  )

+ # Optional: OpenAI-compatible vision generation endpoints (/v1/images/*).
+ # These are safe-by-default and require explicit configuration; see `vision_endpoints.py`.
+ try:
+ from .vision_endpoints import router as _vision_router
+
+ app.include_router(_vision_router, prefix="/v1")
+ logger.info("🖼️ Vision endpoints enabled at /v1/images/*")
+ except Exception as e:
+ logger.debug(f"Vision endpoints not loaded: {e}")
+
+ # Optional: OpenAI-compatible audio endpoints (/v1/audio/*).
+ # These delegate to capability plugins (e.g. AbstractVoice) and degrade to 501 when unavailable.
+ try:
+ from .audio_endpoints import router as _audio_router
+
+ app.include_router(_audio_router, prefix="/v1")
+ logger.info("🔊 Audio endpoints enabled at /v1/audio/*")
+ except Exception as e:
+ logger.debug(f"Audio endpoints not loaded: {e}")
+
  # ============================================================================
  # Enhanced Error Handling and Logging Middleware
  # ============================================================================
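The two try/except blocks above mount the vision and audio routers best-effort, so a given deployment may expose neither, one, or both groups of routes. Below is a minimal client-side probe sketch, assuming the gateway runs on http://localhost:8000 and keeps FastAPI's default /openapi.json route enabled (both are assumptions, not guaranteed by this diff):

# Illustrative sketch (not part of the diff): discover which optional routers are mounted.
import httpx

def optional_capabilities(base: str = "http://localhost:8000") -> dict:
    # FastAPI publishes mounted routes in its OpenAPI schema by default.
    paths = httpx.get(f"{base}/openapi.json", timeout=10.0).json().get("paths", {})
    return {
        "vision": any(p.startswith("/v1/images") for p in paths),
        "audio": any(p.startswith("/v1/audio") for p in paths),
    }

if __name__ == "__main__":
    print(optional_capabilities())

Note that the audio routes can be mounted and still answer 501 when the capability plugin (e.g. AbstractVoice) is absent, so a 501 from /v1/audio/* should be treated as "unavailable here" rather than a client error.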
@@ -193,9 +215,14 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
  body=body_json
  )
  except json.JSONDecodeError:
+ raw = body.decode("utf-8", errors="replace")
+ body_text = raw
+ if len(body_text) > 1000:
+ #[WARNING:TRUNCATION] bounded request-body preview for debug logs
+ body_text = body_text[:980].rstrip() + "\n… (truncated)"
  logger.debug(
  "📋 Request Body (Validation Error)",
- body_text=body.decode('utf-8', errors='replace')[:1000] # Limit to 1000 chars
+ body_text=body_text,
  )
  except Exception as e:
  logger.debug(f"Could not read request body for debugging: {e}")
@@ -450,6 +477,14 @@ class ChatCompletionRequest(BaseModel):
  example=False
  )

+ # Unified thinking/reasoning control (AbstractCore-specific feature)
+ thinking: Optional[Union[bool, str]] = Field(
+ default=None,
+ description="Unified thinking/reasoning control (best-effort across providers/models). "
+ "Accepted values: null/'auto'/'on'/'off' or 'low'/'medium'/'high' when supported.",
+ example="off",
+ )
+
  # Tool calling
  tools: Optional[List[Dict[str, Any]]] = Field(
  default=None,
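The `thinking` field is later copied into `gen_kwargs` (see the hunk around old line 2103 below), so OpenAI-style clients can toggle reasoning per request without provider-specific flags. A minimal sketch, assuming the server listens on http://localhost:8000 and exposes the usual /v1/chat/completions route; the model string is hypothetical:

# Illustrative sketch (not part of the diff): per-request thinking control.
import httpx

payload = {
    "model": "ollama/qwen3:4b",  # hypothetical provider/model string
    "messages": [{"role": "user", "content": "Summarize RFC 2119 in one sentence."}],
    "thinking": "off",  # also accepts None, 'auto', 'on', or 'low'/'medium'/'high' where supported
}
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120.0)
print(resp.json()["choices"][0]["message"]["content"])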
@@ -498,6 +533,13 @@ class ChatCompletionRequest(BaseModel):
  example=0.0
  )

+ # OpenAI prompt caching (2025+): forwarded best-effort by providers that support it.
+ prompt_cache_key: Optional[str] = Field(
+ default=None,
+ description="Provider-specific prompt cache key for prefix caching (best-effort).",
+ example="tenantA:session123"
+ )
+
  # Agent format control (AppV2 feature)
  agent_format: Optional[str] = Field(
  default=None,
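As the forwarding hunk further down shows, `prompt_cache_key` only reaches `gen_kwargs` when it is a non-empty string, so opting in is just a matter of sending a stable, tenant-scoped key alongside an otherwise unchanged request. A brief sketch of the request shape (model string and shared prefix are placeholders):

# Illustrative sketch (not part of the diff): reuse a cached prompt prefix across requests.
SHARED_SYSTEM_PROMPT = "You are a support assistant for ACME Corp. ..."  # long, reused prefix

payload = {
    "model": "openai/gpt-4o-mini",  # hypothetical
    "messages": [
        {"role": "system", "content": SHARED_SYSTEM_PROMPT},
        {"role": "user", "content": "Where is my order?"},
    ],
    "prompt_cache_key": "tenantA:session123",  # matches the field's documented example
}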
@@ -508,10 +550,18 @@ class ChatCompletionRequest(BaseModel):
  )

  # Provider-specific parameters (AbstractCore-specific feature)
+ api_key: Optional[str] = Field(
+ default=None,
+ description="API key for the provider (AbstractCore-specific feature). "
+ "Supports all providers requiring authentication: openai, anthropic, openrouter, openai-compatible, huggingface. "
+ "If not specified, falls back to provider-specific environment variables "
+ "(e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY, OPENROUTER_API_KEY).",
+ example=None
+ )
  base_url: Optional[str] = Field(
  default=None,
  description="Base URL for the provider API endpoint (AbstractCore-specific feature). "
- "Useful for openai-compatible provider to connect to custom endpoints. "
+ "Useful for OpenAI-compatible providers (lmstudio, vllm, openrouter, openai-compatible) and custom/proxied endpoints. "
  "Example: 'http://localhost:1234/v1' for LMStudio, 'http://localhost:8080/v1' for llama.cpp. "
  "If not specified, uses provider's default or environment variable.",
  example="http://localhost:1234/v1"
@@ -526,9 +576,17 @@ class ChatCompletionRequest(BaseModel):
  "Values <= 0 are treated as unlimited.",
  example=7200.0,
  )
+ unload_after: bool = Field(
+ default=False,
+ description="If true, call `llm.unload_model(model)` after the request completes (AbstractCore-specific feature). "
+ "This is useful for explicit memory hygiene in single-tenant or batch scenarios. "
+ "WARNING: for providers that unload shared server state (e.g. Ollama), this can disrupt other "
+ "clients and is disabled by default unless explicitly enabled by the server operator.",
+ example=False,
+ )

  class Config:
- schema_extra = {
+ json_schema_extra = {
  "examples": {
  "basic_text": {
  "summary": "Basic Text Chat",
@@ -729,7 +787,25 @@ class ChatCompletionRequest(BaseModel):
  "seed": 12345,
  "frequency_penalty": 0.0,
  "presence_penalty": 0.0,
- "agent_format": "auto"
+ "agent_format": "auto",
+ "api_key": None,
+ "base_url": None
+ }
+ },
+ "openrouter_with_api_key": {
+ "summary": "OpenRouter with Per-Request API Key",
+ "description": "Use OpenRouter with a per-request API key (useful for multi-tenant scenarios)",
+ "value": {
+ "model": "openrouter/anthropic/claude-3.5-sonnet",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Explain quantum computing in simple terms"
+ }
+ ],
+ "api_key": "sk-or-v1-your-openrouter-key",
+ "temperature": 0.7,
+ "max_tokens": 500
  }
  }
  }
@@ -771,7 +847,7 @@ class EmbeddingRequest(BaseModel):
  )

  class Config:
- schema_extra = {
+ json_schema_extra = {
  "example": {
  "input": "this is the story of starship lost in space",
  "model": "huggingface/sentence-transformers/all-MiniLM-L6-v2",
@@ -792,7 +868,7 @@ class ResponsesAPIRequest(BaseModel):
  The endpoint automatically detects the format based on the presence of 'input' vs 'messages' field.
  """
  class Config:
- schema_extra = {
+ json_schema_extra = {
  "oneOf": [
  {
  "title": "OpenAI Responses API Format",
@@ -896,6 +972,80 @@ def convert_openai_responses_to_chat_completion(openai_request: OpenAIResponsesR
  # Helper Functions
  # ============================================================================

+ def _parse_bool_env(var_name: str) -> bool:
+ """Parse a boolean environment variable (1/true/yes/on)."""
+ val = os.getenv(var_name)
+ if val is None:
+ return False
+ return str(val).strip().lower() in {"1", "true", "yes", "on"}
+
+
+ def _parse_boolish(value: Any) -> bool:
+ """Parse a request-supplied bool-ish value (bool/int/str/None)."""
+ if value is None:
+ return False
+ if isinstance(value, bool):
+ return value
+ if isinstance(value, (int, float)):
+ return bool(value)
+ if isinstance(value, str):
+ normalized = value.strip().lower()
+ if normalized in {"1", "true", "yes", "on"}:
+ return True
+ if normalized in {"0", "false", "no", "off", ""}:
+ return False
+ raise ValueError(f"Expected boolean, got {type(value).__name__}: {value!r}")
+
+
+ _OLLAMA_INFLIGHT_LOCK = threading.Lock()
+ _OLLAMA_INFLIGHT_COUNTS: Dict[Tuple[str, str, str], int] = {}
+ _OLLAMA_UNLOAD_REQUESTED: Dict[Tuple[str, str, str], bool] = {}
+
+
+ def _ollama_inflight_key(provider: str, base_url: Optional[str], model: str) -> Tuple[str, str, str]:
+ """Build a stable key for tracking in-flight Ollama requests."""
+ return (provider.strip().lower(), (base_url or "").strip(), model)
+
+
+ def _ollama_inflight_enter(key: Tuple[str, str, str]) -> None:
+ """Increment in-flight counter for an Ollama (provider/base_url/model) key."""
+ with _OLLAMA_INFLIGHT_LOCK:
+ _OLLAMA_INFLIGHT_COUNTS[key] = _OLLAMA_INFLIGHT_COUNTS.get(key, 0) + 1
+
+
+ def _ollama_inflight_exit(key: Tuple[str, str, str], *, unload_after_requested: bool) -> bool:
+ """Decrement in-flight counter and return True if an unload should happen now."""
+ with _OLLAMA_INFLIGHT_LOCK:
+ if unload_after_requested:
+ _OLLAMA_UNLOAD_REQUESTED[key] = True
+
+ current = _OLLAMA_INFLIGHT_COUNTS.get(key, 0)
+ if current <= 1:
+ _OLLAMA_INFLIGHT_COUNTS.pop(key, None)
+ return bool(_OLLAMA_UNLOAD_REQUESTED.pop(key, False))
+
+ _OLLAMA_INFLIGHT_COUNTS[key] = current - 1
+ return False
+
+
+ def _best_effort_unload(llm: Any, *, request_id: str, provider: str, model: str) -> None:
+ """Unload provider resources without failing the request lifecycle."""
+ try:
+ if not hasattr(llm, "unload_model"):
+ raise AttributeError("Provider does not implement unload_model(model_name)")
+ llm.unload_model(model)
+ logger.info("🧹 Provider Unloaded", request_id=request_id, provider=provider, model=model)
+ except Exception as e:
+ logger.warning(
+ "⚠️ Provider unload failed",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ error=str(e),
+ error_type=type(e).__name__,
+ )
+
+
  def parse_model_string(model_string: str) -> tuple[str, str]:
  """Parse model string to extract provider and model."""
  if not model_string:
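The bookkeeping above defers the actual unload until the last concurrent request for the same (provider, base_url, model) key finishes, even if an earlier request was the one that asked for it. A small illustration of that contract, treating the module-level helpers above as importable (they are private to the server app, and the base_url/model values are placeholders):

# Illustrative sketch (not part of the diff): deferred unload with two overlapping requests.
key = _ollama_inflight_key("ollama", "http://localhost:11434", "qwen3:4b")

_ollama_inflight_enter(key)  # request A starts
_ollama_inflight_enter(key)  # request B starts on the same model

# Request A finishes and asked for unload_after, but B is still in flight: no unload yet.
assert _ollama_inflight_exit(key, unload_after_requested=True) is False

# Request B finishes without asking for an unload; A's earlier request is honored now.
assert _ollama_inflight_exit(key, unload_after_requested=False) is True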
@@ -997,12 +1147,205 @@ async def health_check():
  ]
  }

+
+ class PromptCacheProxyBase(BaseModel):
+ """Proxy configuration for forwarding AbstractCore prompt-cache control-plane calls."""
+
+ base_url: Optional[str] = Field(
+ default=None,
+ description=(
+ "Upstream base URL for an AbstractEndpoint instance. Can include an OpenAI-style `/v1` suffix "
+ "(it will be stripped when proxying `/acore/prompt_cache/*`)."
+ ),
+ example="http://localhost:8001/v1",
+ )
+ api_key: Optional[str] = Field(
+ default=None,
+ description="Optional upstream API key (sent as Authorization: Bearer ...).",
+ example=None,
+ )
+
+
+ class PromptCacheSetProxyRequest(PromptCacheProxyBase):
+ key: str
+ make_default: bool = True
+ ttl_s: Optional[float] = None
+
+
+ class PromptCacheUpdateProxyRequest(PromptCacheProxyBase):
+ key: str
+ prompt: Optional[str] = None
+ messages: Optional[List[Dict[str, Any]]] = None
+ system_prompt: Optional[str] = None
+ tools: Optional[List[Dict[str, Any]]] = None
+ add_generation_prompt: bool = False
+ ttl_s: Optional[float] = None
+
+
+ class PromptCacheForkProxyRequest(PromptCacheProxyBase):
+ from_key: str
+ to_key: str
+ make_default: bool = False
+ ttl_s: Optional[float] = None
+
+
+ class PromptCacheClearProxyRequest(PromptCacheProxyBase):
+ key: Optional[str] = None
+
+
+ class PromptCachePrepareModulesProxyRequest(PromptCacheProxyBase):
+ namespace: str
+ modules: List[Dict[str, Any]]
+ make_default: bool = False
+ ttl_s: Optional[float] = None
+ version: int = 1
+
+
+ def _normalize_control_plane_base_url(base_url: str) -> str:
+ u = str(base_url or "").strip().rstrip("/")
+ if u.endswith("/v1"):
+ u = u[:-3]
+ return u.rstrip("/")
+
+
+ def _proxy_prompt_cache_request(
+ *,
+ base_url: Optional[str],
+ api_key: Optional[str],
+ method: str,
+ path: str,
+ json_body: Optional[Dict[str, Any]] = None,
+ timeout_s: float = 30.0,
+ ) -> Dict[str, Any]:
+ if not isinstance(base_url, str) or not base_url.strip():
+ return {
+ "supported": False,
+ "error": "base_url is required to proxy prompt cache control plane calls (use AbstractEndpoint)",
+ }
+
+ upstream_root = _normalize_control_plane_base_url(base_url)
+ url = f"{upstream_root}{path}"
+
+ headers: Dict[str, str] = {}
+ if isinstance(api_key, str) and api_key.strip():
+ headers["Authorization"] = f"Bearer {api_key.strip()}"
+
+ try:
+ with httpx.Client(timeout=timeout_s) as client:
+ if method.upper() == "GET":
+ resp = client.get(url, headers=headers)
+ else:
+ resp = client.post(url, headers=headers, json=json_body or {})
+ except Exception as e:
+ return {"supported": False, "error": str(e)}
+
+ try:
+ payload = resp.json()
+ except Exception:
+ payload = {"error": resp.text}
+
+ if resp.status_code >= 400:
+ return {
+ "supported": False,
+ "status_code": int(resp.status_code),
+ "error": payload,
+ "upstream": url,
+ }
+
+ if isinstance(payload, dict):
+ return payload
+ return {"supported": True, "data": payload}
+
+
+ @app.get("/acore/prompt_cache/stats")
+ def acore_prompt_cache_stats(
+ base_url: Optional[str] = Query(None, description="Upstream AbstractEndpoint base_url (optionally including /v1)"),
+ api_key: Optional[str] = Query(None, description="Optional upstream API key"),
+ ):
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="GET",
+ path="/acore/prompt_cache/stats",
+ json_body=None,
+ )
+
+
+ @app.post("/acore/prompt_cache/set")
+ def acore_prompt_cache_set(req: PromptCacheSetProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/set",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/update")
+ def acore_prompt_cache_update(req: PromptCacheUpdateProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/update",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/fork")
+ def acore_prompt_cache_fork(req: PromptCacheForkProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/fork",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/clear")
+ def acore_prompt_cache_clear(req: PromptCacheClearProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/clear",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/prepare_modules")
+ def acore_prompt_cache_prepare_modules(req: PromptCachePrepareModulesProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/prepare_modules",
+ json_body=body,
+ )
+
+
  @app.get("/v1/models")
  async def list_models(
  provider: Optional[str] = Query(
  None,
  description="Filter by provider (e.g., 'ollama', 'openai', 'anthropic', 'lmstudio')",
- example=""
  ),
  input_type: Optional[ModelInputCapability] = Query(
  None,
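These routes make the server a thin control-plane proxy: the cache itself lives in the upstream AbstractEndpoint addressed by `base_url`, whose optional `/v1` suffix is stripped by `_normalize_control_plane_base_url` before the `/acore/prompt_cache/*` path is appended. A short client sketch against the gateway (both hosts are assumptions):

# Illustrative sketch (not part of the diff): drive the prompt-cache proxy endpoints.
import httpx

GATEWAY = "http://localhost:8000"      # this server (assumption)
UPSTREAM = "http://localhost:8001/v1"  # AbstractEndpoint instance to control (assumption)

with httpx.Client(timeout=30.0) as client:
    # Read upstream cache statistics through the proxy.
    stats = client.get(f"{GATEWAY}/acore/prompt_cache/stats", params={"base_url": UPSTREAM}).json()

    # Create a named cache entry upstream and make it the default for subsequent requests.
    created = client.post(
        f"{GATEWAY}/acore/prompt_cache/set",
        json={"base_url": UPSTREAM, "key": "tenantA:session123", "make_default": True, "ttl_s": 600},
    ).json()

print(stats, created)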
@@ -1316,6 +1659,16 @@ async def create_response(
  detail={"error": {"message": "Request must contain either 'input' (OpenAI format) or 'messages' (legacy format)", "type": "invalid_request"}}
  )

+ # AbstractCore extension: allow opt-in unload-after-request even for OpenAI Responses format.
+ if "unload_after" in request_data:
+ try:
+ chat_request = chat_request.model_copy(update={"unload_after": _parse_boolish(request_data.get("unload_after"))})
+ except Exception as e:
+ raise HTTPException(
+ status_code=422,
+ detail={"error": {"message": f"Invalid unload_after value: {e}", "type": "validation_error"}},
+ )
+
  # Respect user's streaming preference (defaults to False)

  # Process using our standard pipeline
@@ -2023,11 +2376,16 @@ async def process_chat_completion(

  # Detect target format for tool call syntax
  target_format = detect_target_format(f"{provider}/{model}", request, http_request)
+ user_agent_raw = http_request.headers.get("user-agent", "")
+ user_agent = str(user_agent_raw or "")
+ if len(user_agent) > 50:
+ #[WARNING:TRUNCATION] bounded user-agent capture for request logs
+ user_agent = user_agent[:50].rstrip() + "…"
  logger.info(
  "🎯 Target Format Detected",
  request_id=request_id,
  target_format=target_format.value,
- user_agent=http_request.headers.get("user-agent", "")[:50]
+ user_agent=user_agent,
  )

  # Process media from messages
@@ -2052,11 +2410,14 @@ async def process_chat_completion(
  # Validate media files if any were found
  if all_media_files:
  validate_media_files(all_media_files)
+ #[WARNING:TRUNCATION] bounded filename preview for request logs
+ files_preview = [os.path.basename(f) for f in all_media_files[:5]]
  logger.info(
  "📎 Media Files Processed",
  request_id=request_id,
  file_count=len(all_media_files),
- files=[os.path.basename(f) for f in all_media_files[:5]] # Log first 5 filenames
+ files=files_preview,
+ files_truncated=len(all_media_files) > 5,
  )

  # Create LLM instance
@@ -2067,6 +2428,13 @@ async def process_chat_completion(
  # Enable trace capture (trace_id) without retaining full trace buffers by default.
  provider_kwargs["enable_tracing"] = True
  provider_kwargs.setdefault("max_traces", 0)
+ if request.api_key:
+ provider_kwargs["api_key"] = request.api_key
+ logger.debug(
+ "🔑 Custom API Key Provided",
+ request_id=request_id,
+ provider=provider
+ )
  if request.base_url:
  provider_kwargs["base_url"] = request.base_url
  logger.info(
@@ -2079,7 +2447,28 @@ async def process_chat_completion(
  # Note: BaseProvider treats non-positive values as "unlimited".
  provider_kwargs["timeout"] = request.timeout_s

+ provider_normalized = provider.strip().lower()
+ unload_after_requested = bool(getattr(request, "unload_after", False))
+ allow_unsafe_unload_after = _parse_bool_env("ABSTRACTCORE_ALLOW_UNSAFE_UNLOAD_AFTER")
+ if unload_after_requested and provider_normalized == "ollama" and not allow_unsafe_unload_after:
+ raise HTTPException(
+ status_code=403,
+ detail={
+ "error": {
+ "message": (
+ "unload_after=true is disabled for provider 'ollama' because it can unload shared server "
+ "state and disrupt other clients. Set ABSTRACTCORE_ALLOW_UNSAFE_UNLOAD_AFTER=1 to enable."
+ ),
+ "type": "forbidden",
+ }
+ },
+ )
+
  llm = create_llm(provider, model=model, **provider_kwargs)
+ ollama_key: Optional[Tuple[str, str, str]] = None
+ if provider_normalized == "ollama":
+ ollama_key = _ollama_inflight_key(provider, request.base_url, model)
+ _ollama_inflight_enter(ollama_key)

  # Convert messages
  messages = convert_to_abstractcore_messages(processed_messages)
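Putting the request flag and the server policy together: an `unload_after=true` request against an Ollama-backed model is rejected with 403 unless the operator exports ABSTRACTCORE_ALLOW_UNSAFE_UNLOAD_AFTER=1 before starting the server. A client-side sketch (host, endpoint path, and model string are assumptions):

# Illustrative sketch (not part of the diff): request an unload after a one-off completion.
import httpx

payload = {
    "model": "ollama/qwen3:4b",  # hypothetical model string
    "messages": [{"role": "user", "content": "One-off batch question"}],
    "unload_after": True,  # ask the server to free the model once the request completes
}
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=300.0)

if resp.status_code == 403:
    # Server policy: unsafe unload is disabled unless the operator opted in via the env var.
    print(resp.json()["detail"]["error"]["message"])
else:
    print(resp.json()["choices"][0]["message"]["content"])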
@@ -2103,6 +2492,8 @@ async def process_chat_completion(
  gen_kwargs["trace_metadata"] = trace_metadata

  # Add optional parameters
+ if request.thinking is not None:
+ gen_kwargs["thinking"] = request.thinking
  if request.stop:
  gen_kwargs["stop"] = request.stop
  if request.seed:
@@ -2111,6 +2502,8 @@ async def process_chat_completion(
  gen_kwargs["frequency_penalty"] = request.frequency_penalty
  if request.presence_penalty:
  gen_kwargs["presence_penalty"] = request.presence_penalty
+ if isinstance(request.prompt_cache_key, str) and request.prompt_cache_key.strip():
+ gen_kwargs["prompt_cache_key"] = request.prompt_cache_key.strip()

  # Generate response
  # Only cleanup files created by this request (with our specific prefixes)
@@ -2128,7 +2521,16 @@ async def process_chat_completion(
  if request.stream:
  return StreamingResponse(
  generate_streaming_response(
- llm, gen_kwargs, provider, model, syntax_rewriter, request_id, temp_files_to_cleanup
+ llm,
+ gen_kwargs,
+ provider,
+ model,
+ syntax_rewriter,
+ request_id,
+ temp_files_to_cleanup,
+ unload_after=unload_after_requested,
+ ollama_key=ollama_key,
+ allow_unsafe_unload_after=allow_unsafe_unload_after,
  ),
  media_type="text/event-stream",
  headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
@@ -2148,9 +2550,22 @@ async def process_chat_completion(
  )
  return openai_response
  finally:
- # Cleanup temporary files (base64 and downloaded images) with delay to avoid race conditions
- import threading
+ if not request.stream:
+ if provider_normalized == "ollama" and ollama_key is not None:
+ should_unload = _ollama_inflight_exit(ollama_key, unload_after_requested=unload_after_requested)
+ if should_unload and allow_unsafe_unload_after:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)
+ elif should_unload:
+ logger.warning(
+ "⚠️ Unload requested but disabled by server policy",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ )
+ elif unload_after_requested:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)

+ # Cleanup temporary files (base64 and downloaded images) with delay to avoid race conditions
  def delayed_cleanup():
  """Cleanup temporary files after a short delay to avoid race conditions"""
  time.sleep(1) # Short delay to ensure generation is complete
@@ -2170,6 +2585,8 @@ async def process_chat_completion(
  cleanup_thread = threading.Thread(target=delayed_cleanup, daemon=True)
  cleanup_thread.start()

+ except HTTPException:
+ raise
  except Exception as e:
  logger.error(
  "❌ Chat completion failed",
@@ -2189,9 +2606,14 @@ def generate_streaming_response(
  model: str,
  syntax_rewriter: ToolCallSyntaxRewriter,
  request_id: str,
- temp_files_to_cleanup: List[str] = None
+ temp_files_to_cleanup: List[str] = None,
+ *,
+ unload_after: bool = False,
+ ollama_key: Optional[Tuple[str, str, str]] = None,
+ allow_unsafe_unload_after: bool = False,
  ) -> Iterator[str]:
  """Generate OpenAI-compatible streaming response with syntax rewriting."""
+ provider_normalized = provider.strip().lower()
  try:
  chat_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
  created_time = int(time.time())
@@ -2324,6 +2746,32 @@ def generate_streaming_response(
  )
  error_chunk = {"error": {"message": str(e), "type": "server_error"}}
  yield f"data: {json.dumps(error_chunk)}\n\n"
+ finally:
+ if provider_normalized == "ollama" and ollama_key is not None:
+ try:
+ should_unload = _ollama_inflight_exit(ollama_key, unload_after_requested=unload_after)
+ except Exception as e:
+ logger.warning(
+ "⚠️ Failed to update in-flight unload state",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ error=str(e),
+ error_type=type(e).__name__,
+ )
+ should_unload = False
+
+ if should_unload and allow_unsafe_unload_after:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)
+ elif should_unload:
+ logger.warning(
+ "⚠️ Unload requested but disabled by server policy",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ )
+ elif unload_after:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)

  def convert_to_openai_response(
  response,