abstractcore 2.9.1__py3-none-any.whl → 2.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/deepsearch.py +9 -4
  3. abstractcore/apps/extractor.py +33 -100
  4. abstractcore/apps/intent.py +19 -0
  5. abstractcore/apps/judge.py +20 -1
  6. abstractcore/apps/summarizer.py +20 -1
  7. abstractcore/architectures/detection.py +34 -1
  8. abstractcore/architectures/response_postprocessing.py +313 -0
  9. abstractcore/assets/architecture_formats.json +38 -8
  10. abstractcore/assets/model_capabilities.json +882 -160
  11. abstractcore/compression/__init__.py +1 -2
  12. abstractcore/compression/glyph_processor.py +6 -4
  13. abstractcore/config/main.py +52 -20
  14. abstractcore/config/manager.py +390 -12
  15. abstractcore/config/vision_config.py +5 -5
  16. abstractcore/core/interface.py +151 -3
  17. abstractcore/core/session.py +16 -10
  18. abstractcore/download.py +1 -1
  19. abstractcore/embeddings/manager.py +20 -6
  20. abstractcore/endpoint/__init__.py +2 -0
  21. abstractcore/endpoint/app.py +458 -0
  22. abstractcore/mcp/client.py +3 -1
  23. abstractcore/media/__init__.py +52 -17
  24. abstractcore/media/auto_handler.py +42 -22
  25. abstractcore/media/base.py +44 -1
  26. abstractcore/media/capabilities.py +12 -33
  27. abstractcore/media/enrichment.py +105 -0
  28. abstractcore/media/handlers/anthropic_handler.py +19 -28
  29. abstractcore/media/handlers/local_handler.py +124 -70
  30. abstractcore/media/handlers/openai_handler.py +19 -31
  31. abstractcore/media/processors/__init__.py +4 -2
  32. abstractcore/media/processors/audio_processor.py +57 -0
  33. abstractcore/media/processors/office_processor.py +8 -3
  34. abstractcore/media/processors/pdf_processor.py +46 -3
  35. abstractcore/media/processors/text_processor.py +22 -24
  36. abstractcore/media/processors/video_processor.py +58 -0
  37. abstractcore/media/types.py +97 -4
  38. abstractcore/media/utils/image_scaler.py +20 -2
  39. abstractcore/media/utils/video_frames.py +219 -0
  40. abstractcore/media/vision_fallback.py +136 -22
  41. abstractcore/processing/__init__.py +32 -3
  42. abstractcore/processing/basic_deepsearch.py +15 -10
  43. abstractcore/processing/basic_intent.py +3 -2
  44. abstractcore/processing/basic_judge.py +3 -2
  45. abstractcore/processing/basic_summarizer.py +1 -1
  46. abstractcore/providers/__init__.py +3 -1
  47. abstractcore/providers/anthropic_provider.py +95 -8
  48. abstractcore/providers/base.py +1516 -81
  49. abstractcore/providers/huggingface_provider.py +546 -69
  50. abstractcore/providers/lmstudio_provider.py +30 -916
  51. abstractcore/providers/mlx_provider.py +382 -35
  52. abstractcore/providers/model_capabilities.py +5 -1
  53. abstractcore/providers/ollama_provider.py +99 -15
  54. abstractcore/providers/openai_compatible_provider.py +406 -180
  55. abstractcore/providers/openai_provider.py +188 -44
  56. abstractcore/providers/openrouter_provider.py +76 -0
  57. abstractcore/providers/registry.py +61 -5
  58. abstractcore/providers/streaming.py +138 -33
  59. abstractcore/providers/vllm_provider.py +92 -817
  60. abstractcore/server/app.py +478 -28
  61. abstractcore/server/audio_endpoints.py +139 -0
  62. abstractcore/server/vision_endpoints.py +1319 -0
  63. abstractcore/structured/handler.py +316 -41
  64. abstractcore/tools/common_tools.py +5501 -2012
  65. abstractcore/tools/comms_tools.py +1641 -0
  66. abstractcore/tools/core.py +37 -7
  67. abstractcore/tools/handler.py +4 -9
  68. abstractcore/tools/parser.py +49 -2
  69. abstractcore/tools/tag_rewriter.py +2 -1
  70. abstractcore/tools/telegram_tdlib.py +407 -0
  71. abstractcore/tools/telegram_tools.py +261 -0
  72. abstractcore/utils/cli.py +1085 -72
  73. abstractcore/utils/structured_logging.py +29 -8
  74. abstractcore/utils/token_utils.py +2 -0
  75. abstractcore/utils/truncation.py +29 -0
  76. abstractcore/utils/version.py +3 -4
  77. abstractcore/utils/vlm_token_calculator.py +12 -2
  78. abstractcore-2.11.4.dist-info/METADATA +562 -0
  79. abstractcore-2.11.4.dist-info/RECORD +133 -0
  80. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/WHEEL +1 -1
  81. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/entry_points.txt +1 -0
  82. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  83. abstractcore-2.9.1.dist-info/RECORD +0 -119
  84. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/licenses/LICENSE +0 -0
  85. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,8 @@ import urllib.parse
  import argparse
  import sys
  import logging
+ import threading
+ import httpx
  from typing import List, Dict, Any, Optional, Literal, Union, Iterator, Tuple, Annotated
  from enum import Enum
  from fastapi import FastAPI, HTTPException, Request, Query, Body
@@ -60,25 +62,27 @@ from ..tools.syntax_rewriter import (
  # Configuration
  # ============================================================================

- # Initialize with default logging configuration (can be overridden later)
+ # Initialize with default logging configuration (can be overridden later).
+ #
+ # IMPORTANT: default console verbosity is controlled by AbstractCore's centralized logging defaults
+ # (and env overrides like ABSTRACTCORE_CONSOLE_LOG_LEVEL). The server must not force INFO-level
+ # console logs on startup.
  debug_mode = os.getenv("ABSTRACTCORE_DEBUG", "false").lower() == "true"

- # Initial logging setup (will be reconfigured if --debug is used)
- # Check environment variable for debug mode
- initial_console_level = logging.DEBUG if debug_mode else logging.INFO
- configure_logging(
- console_level=initial_console_level,
- file_level=logging.DEBUG,
- log_dir="logs",
- verbatim_enabled=True,
- console_json=False,
- file_json=True
- )
+ if debug_mode:
+ configure_logging(
+ console_level=logging.DEBUG,
+ file_level=logging.DEBUG,
+ log_dir="logs",
+ verbatim_enabled=True,
+ console_json=False,
+ file_json=True,
+ )

  # Get initial logger
  logger = get_logger("server")

- # Log initial startup with debug mode status
+ # Log initial startup with debug mode status (may be suppressed by console level).
  logger.info("🚀 AbstractCore Server Initializing", version=__version__, debug_mode=debug_mode)

  def reconfigure_for_debug():
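With the forced INFO-level startup configuration removed, console verbosity is decided entirely by the environment before import. A minimal launcher sketch, assuming the `abstractcore.server.app` module path and the variable names mentioned in the comments above (`ABSTRACTCORE_DEBUG`, `ABSTRACTCORE_CONSOLE_LOG_LEVEL`); treating WARNING as the quiet default is an assumption, not something this diff guarantees:

```python
# Hypothetical launcher: app.py reads ABSTRACTCORE_DEBUG at import time, so the
# environment must be prepared before the server module is imported.
import os

os.environ.setdefault("ABSTRACTCORE_DEBUG", "false")                # "true" re-enables verbose debug logging
os.environ.setdefault("ABSTRACTCORE_CONSOLE_LOG_LEVEL", "WARNING")  # assumed centralized console override

import uvicorn
from abstractcore.server.app import app  # imported only after the environment is set

uvicorn.run(app, host="0.0.0.0", port=8000, log_level="error")
```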
@@ -117,6 +121,26 @@ app.add_middleware(
  allow_headers=["*"],
  )

+ # Optional: OpenAI-compatible vision generation endpoints (/v1/images/*).
+ # These are safe-by-default and require explicit configuration; see `vision_endpoints.py`.
+ try:
+ from .vision_endpoints import router as _vision_router
+
+ app.include_router(_vision_router, prefix="/v1")
+ logger.info("🖼️ Vision endpoints enabled at /v1/images/*")
+ except Exception as e:
+ logger.debug(f"Vision endpoints not loaded: {e}")
+
+ # Optional: OpenAI-compatible audio endpoints (/v1/audio/*).
+ # These delegate to capability plugins (e.g. AbstractVoice) and degrade to 501 when unavailable.
+ try:
+ from .audio_endpoints import router as _audio_router
+
+ app.include_router(_audio_router, prefix="/v1")
+ logger.info("🔊 Audio endpoints enabled at /v1/audio/*")
+ except Exception as e:
+ logger.debug(f"Audio endpoints not loaded: {e}")
+
  # ============================================================================
  # Enhanced Error Handling and Logging Middleware
  # ============================================================================
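Because both routers are mounted opportunistically, clients should not assume `/v1/images/*` or `/v1/audio/*` exist. A client-side probe sketch, assuming `httpx` and a server on localhost:8000; the specific sub-paths are illustrative and not confirmed by this diff:

```python
# Hypothetical capability probe: a missing router surfaces as 404, a mounted router
# whose capability plugin is unavailable degrades to 501; treat both as "disabled".
import httpx

def endpoint_enabled(base_url: str, path: str) -> bool:
    try:
        resp = httpx.post(f"{base_url}{path}", json={}, timeout=5.0)
    except httpx.HTTPError:
        return False
    return resp.status_code not in (404, 501)

BASE = "http://localhost:8000"
print("images:", endpoint_enabled(BASE, "/v1/images/generations"))  # assumed OpenAI-style sub-path
print("audio:", endpoint_enabled(BASE, "/v1/audio/speech"))         # assumed OpenAI-style sub-path
```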
@@ -193,9 +217,14 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
  body=body_json
  )
  except json.JSONDecodeError:
+ raw = body.decode("utf-8", errors="replace")
+ body_text = raw
+ if len(body_text) > 1000:
+ #[WARNING:TRUNCATION] bounded request-body preview for debug logs
+ body_text = body_text[:980].rstrip() + "\n… (truncated)"
  logger.debug(
  "📋 Request Body (Validation Error)",
- body_text=body.decode('utf-8', errors='replace')[:1000] # Limit to 1000 chars
+ body_text=body_text,
  )
  except Exception as e:
  logger.debug(f"Could not read request body for debugging: {e}")
@@ -450,6 +479,14 @@ class ChatCompletionRequest(BaseModel):
  example=False
  )

+ # Unified thinking/reasoning control (AbstractCore-specific feature)
+ thinking: Optional[Union[bool, str]] = Field(
+ default=None,
+ description="Unified thinking/reasoning control (best-effort across providers/models). "
+ "Accepted values: null/'auto'/'on'/'off' or 'low'/'medium'/'high' when supported.",
+ example="off",
+ )
+
  # Tool calling
  tools: Optional[List[Dict[str, Any]]] = Field(
  default=None,
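As an illustration of the new field, a request sketch that disables thinking for a reasoning-capable model; the endpoint, provider and model strings are placeholders, and the effect is best-effort per the field description:

```python
# Hypothetical request: "thinking" rides alongside the standard OpenAI fields and
# is interpreted best-effort by the selected provider/model.
import httpx

payload = {
    "model": "ollama/qwen3:4b",   # placeholder provider/model string
    "messages": [{"role": "user", "content": "Summarize RFC 2616 in two sentences."}],
    "thinking": "off",            # also accepts None/'auto'/'on'/'low'/'medium'/'high'
}
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120.0)
print(resp.json()["choices"][0]["message"]["content"])
```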
@@ -498,6 +535,13 @@ class ChatCompletionRequest(BaseModel):
  example=0.0
  )

+ # OpenAI prompt caching (2025+): forwarded best-effort by providers that support it.
+ prompt_cache_key: Optional[str] = Field(
+ default=None,
+ description="Provider-specific prompt cache key for prefix caching (best-effort).",
+ example="tenantA:session123"
+ )
+
  # Agent format control (AppV2 feature)
  agent_format: Optional[str] = Field(
  default=None,
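A sketch of how a multi-turn client might reuse one cache key so providers that support prefix caching can skip re-processing the shared prefix; the key format is free-form and the model string is a placeholder:

```python
# Hypothetical usage: keep prompt_cache_key stable across turns of one session so a
# supporting provider can reuse the cached prefix; other providers simply ignore it.
import httpx

session_key = "tenantA:session123"
for question in ["What is a vector index?", "And how does HNSW differ?"]:
    resp = httpx.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "openai/gpt-4o-mini",   # placeholder model string
            "messages": [{"role": "user", "content": question}],
            "prompt_cache_key": session_key,  # forwarded best-effort
        },
        timeout=60.0,
    )
    print(resp.json()["choices"][0]["message"]["content"][:80])
```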
@@ -508,10 +552,18 @@ class ChatCompletionRequest(BaseModel):
  )

  # Provider-specific parameters (AbstractCore-specific feature)
+ api_key: Optional[str] = Field(
+ default=None,
+ description="API key for the provider (AbstractCore-specific feature). "
+ "Supports all providers requiring authentication: openai, anthropic, openrouter, openai-compatible, huggingface. "
+ "If not specified, falls back to provider-specific environment variables "
+ "(e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY, OPENROUTER_API_KEY).",
+ example=None
+ )
  base_url: Optional[str] = Field(
  default=None,
  description="Base URL for the provider API endpoint (AbstractCore-specific feature). "
- "Useful for openai-compatible provider to connect to custom endpoints. "
+ "Useful for OpenAI-compatible providers (lmstudio, vllm, openrouter, openai-compatible) and custom/proxied endpoints. "
  "Example: 'http://localhost:1234/v1' for LMStudio, 'http://localhost:8080/v1' for llama.cpp. "
  "If not specified, uses provider's default or environment variable.",
  example="http://localhost:1234/v1"
@@ -526,9 +578,17 @@ class ChatCompletionRequest(BaseModel):
  "Values <= 0 are treated as unlimited.",
  example=7200.0,
  )
+ unload_after: bool = Field(
+ default=False,
+ description="If true, call `llm.unload_model(model)` after the request completes (AbstractCore-specific feature). "
+ "This is useful for explicit memory hygiene in single-tenant or batch scenarios. "
+ "WARNING: for providers that unload shared server state (e.g. Ollama), this can disrupt other "
+ "clients and is disabled by default unless explicitly enabled by the server operator.",
+ example=False,
+ )

  class Config:
- schema_extra = {
+ json_schema_extra = {
  "examples": {
  "basic_text": {
  "summary": "Basic Text Chat",
@@ -729,7 +789,25 @@ class ChatCompletionRequest(BaseModel):
  "seed": 12345,
  "frequency_penalty": 0.0,
  "presence_penalty": 0.0,
- "agent_format": "auto"
+ "agent_format": "auto",
+ "api_key": None,
+ "base_url": None
+ }
+ },
+ "openrouter_with_api_key": {
+ "summary": "OpenRouter with Per-Request API Key",
+ "description": "Use OpenRouter with a per-request API key (useful for multi-tenant scenarios)",
+ "value": {
+ "model": "openrouter/anthropic/claude-3.5-sonnet",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Explain quantum computing in simple terms"
+ }
+ ],
+ "api_key": "sk-or-v1-your-openrouter-key",
+ "temperature": 0.7,
+ "max_tokens": 500
  }
  }
  }
@@ -771,7 +849,7 @@ class EmbeddingRequest(BaseModel):
  )

  class Config:
- schema_extra = {
+ json_schema_extra = {
  "example": {
  "input": "this is the story of starship lost in space",
  "model": "huggingface/sentence-transformers/all-MiniLM-L6-v2",
@@ -792,7 +870,7 @@ class ResponsesAPIRequest(BaseModel):
  The endpoint automatically detects the format based on the presence of 'input' vs 'messages' field.
  """
  class Config:
- schema_extra = {
+ json_schema_extra = {
  "oneOf": [
  {
  "title": "OpenAI Responses API Format",
@@ -896,6 +974,80 @@ def convert_openai_responses_to_chat_completion(openai_request: OpenAIResponsesR
  # Helper Functions
  # ============================================================================

+ def _parse_bool_env(var_name: str) -> bool:
+ """Parse a boolean environment variable (1/true/yes/on)."""
+ val = os.getenv(var_name)
+ if val is None:
+ return False
+ return str(val).strip().lower() in {"1", "true", "yes", "on"}
+
+
+ def _parse_boolish(value: Any) -> bool:
+ """Parse a request-supplied bool-ish value (bool/int/str/None)."""
+ if value is None:
+ return False
+ if isinstance(value, bool):
+ return value
+ if isinstance(value, (int, float)):
+ return bool(value)
+ if isinstance(value, str):
+ normalized = value.strip().lower()
+ if normalized in {"1", "true", "yes", "on"}:
+ return True
+ if normalized in {"0", "false", "no", "off", ""}:
+ return False
+ raise ValueError(f"Expected boolean, got {type(value).__name__}: {value!r}")
+
+
+ _OLLAMA_INFLIGHT_LOCK = threading.Lock()
+ _OLLAMA_INFLIGHT_COUNTS: Dict[Tuple[str, str, str], int] = {}
+ _OLLAMA_UNLOAD_REQUESTED: Dict[Tuple[str, str, str], bool] = {}
+
+
+ def _ollama_inflight_key(provider: str, base_url: Optional[str], model: str) -> Tuple[str, str, str]:
+ """Build a stable key for tracking in-flight Ollama requests."""
+ return (provider.strip().lower(), (base_url or "").strip(), model)
+
+
+ def _ollama_inflight_enter(key: Tuple[str, str, str]) -> None:
+ """Increment in-flight counter for an Ollama (provider/base_url/model) key."""
+ with _OLLAMA_INFLIGHT_LOCK:
+ _OLLAMA_INFLIGHT_COUNTS[key] = _OLLAMA_INFLIGHT_COUNTS.get(key, 0) + 1
+
+
+ def _ollama_inflight_exit(key: Tuple[str, str, str], *, unload_after_requested: bool) -> bool:
+ """Decrement in-flight counter and return True if an unload should happen now."""
+ with _OLLAMA_INFLIGHT_LOCK:
+ if unload_after_requested:
+ _OLLAMA_UNLOAD_REQUESTED[key] = True
+
+ current = _OLLAMA_INFLIGHT_COUNTS.get(key, 0)
+ if current <= 1:
+ _OLLAMA_INFLIGHT_COUNTS.pop(key, None)
+ return bool(_OLLAMA_UNLOAD_REQUESTED.pop(key, False))
+
+ _OLLAMA_INFLIGHT_COUNTS[key] = current - 1
+ return False
+
+
+ def _best_effort_unload(llm: Any, *, request_id: str, provider: str, model: str) -> None:
+ """Unload provider resources without failing the request lifecycle."""
+ try:
+ if not hasattr(llm, "unload_model"):
+ raise AttributeError("Provider does not implement unload_model(model_name)")
+ llm.unload_model(model)
+ logger.info("🧹 Provider Unloaded", request_id=request_id, provider=provider, model=model)
+ except Exception as e:
+ logger.warning(
+ "⚠️ Provider unload failed",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ error=str(e),
+ error_type=type(e).__name__,
+ )
+
+
  def parse_model_string(model_string: str) -> tuple[str, str]:
  """Parse model string to extract provider and model."""
  if not model_string:
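The in-flight bookkeeping above implements a deferred-unload rule: an unload request is remembered per (provider, base_url, model) key, but only honored when the last concurrent request for that key exits. A self-contained sketch of the same counting rule (not the module's actual helpers, just a restatement of their logic):

```python
# Self-contained sketch of the deferred-unload rule: the unload wish is recorded,
# but only the last in-flight request for the same key triggers it on exit.
import threading
from typing import Dict, Tuple

_LOCK = threading.Lock()
_COUNTS: Dict[Tuple[str, str, str], int] = {}
_UNLOAD: Dict[Tuple[str, str, str], bool] = {}

def enter(key: Tuple[str, str, str]) -> None:
    with _LOCK:
        _COUNTS[key] = _COUNTS.get(key, 0) + 1

def exit_(key: Tuple[str, str, str], unload_requested: bool) -> bool:
    with _LOCK:
        if unload_requested:
            _UNLOAD[key] = True
        current = _COUNTS.get(key, 0)
        if current <= 1:
            _COUNTS.pop(key, None)
            return _UNLOAD.pop(key, False)   # last one out decides
        _COUNTS[key] = current - 1
        return False

key = ("ollama", "http://localhost:11434", "llama3")
enter(key); enter(key)                       # two overlapping requests
print(exit_(key, unload_requested=True))     # False: another request is still running
print(exit_(key, unload_requested=False))    # True: deferred unload fires on the last exit
```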
@@ -997,12 +1149,205 @@ async def health_check():
  ]
  }

+
+ class PromptCacheProxyBase(BaseModel):
+ """Proxy configuration for forwarding AbstractCore prompt-cache control-plane calls."""
+
+ base_url: Optional[str] = Field(
+ default=None,
+ description=(
+ "Upstream base URL for an AbstractEndpoint instance. Can include an OpenAI-style `/v1` suffix "
+ "(it will be stripped when proxying `/acore/prompt_cache/*`)."
+ ),
+ example="http://localhost:8001/v1",
+ )
+ api_key: Optional[str] = Field(
+ default=None,
+ description="Optional upstream API key (sent as Authorization: Bearer ...).",
+ example=None,
+ )
+
+
+ class PromptCacheSetProxyRequest(PromptCacheProxyBase):
+ key: str
+ make_default: bool = True
+ ttl_s: Optional[float] = None
+
+
+ class PromptCacheUpdateProxyRequest(PromptCacheProxyBase):
+ key: str
+ prompt: Optional[str] = None
+ messages: Optional[List[Dict[str, Any]]] = None
+ system_prompt: Optional[str] = None
+ tools: Optional[List[Dict[str, Any]]] = None
+ add_generation_prompt: bool = False
+ ttl_s: Optional[float] = None
+
+
+ class PromptCacheForkProxyRequest(PromptCacheProxyBase):
+ from_key: str
+ to_key: str
+ make_default: bool = False
+ ttl_s: Optional[float] = None
+
+
+ class PromptCacheClearProxyRequest(PromptCacheProxyBase):
+ key: Optional[str] = None
+
+
+ class PromptCachePrepareModulesProxyRequest(PromptCacheProxyBase):
+ namespace: str
+ modules: List[Dict[str, Any]]
+ make_default: bool = False
+ ttl_s: Optional[float] = None
+ version: int = 1
+
+
+ def _normalize_control_plane_base_url(base_url: str) -> str:
+ u = str(base_url or "").strip().rstrip("/")
+ if u.endswith("/v1"):
+ u = u[:-3]
+ return u.rstrip("/")
+
+
+ def _proxy_prompt_cache_request(
+ *,
+ base_url: Optional[str],
+ api_key: Optional[str],
+ method: str,
+ path: str,
+ json_body: Optional[Dict[str, Any]] = None,
+ timeout_s: float = 30.0,
+ ) -> Dict[str, Any]:
+ if not isinstance(base_url, str) or not base_url.strip():
+ return {
+ "supported": False,
+ "error": "base_url is required to proxy prompt cache control plane calls (use AbstractEndpoint)",
+ }
+
+ upstream_root = _normalize_control_plane_base_url(base_url)
+ url = f"{upstream_root}{path}"
+
+ headers: Dict[str, str] = {}
+ if isinstance(api_key, str) and api_key.strip():
+ headers["Authorization"] = f"Bearer {api_key.strip()}"
+
+ try:
+ with httpx.Client(timeout=timeout_s) as client:
+ if method.upper() == "GET":
+ resp = client.get(url, headers=headers)
+ else:
+ resp = client.post(url, headers=headers, json=json_body or {})
+ except Exception as e:
+ return {"supported": False, "error": str(e)}
+
+ try:
+ payload = resp.json()
+ except Exception:
+ payload = {"error": resp.text}
+
+ if resp.status_code >= 400:
+ return {
+ "supported": False,
+ "status_code": int(resp.status_code),
+ "error": payload,
+ "upstream": url,
+ }
+
+ if isinstance(payload, dict):
+ return payload
+ return {"supported": True, "data": payload}
+
+
+ @app.get("/acore/prompt_cache/stats")
+ def acore_prompt_cache_stats(
+ base_url: Optional[str] = Query(None, description="Upstream AbstractEndpoint base_url (optionally including /v1)"),
+ api_key: Optional[str] = Query(None, description="Optional upstream API key"),
+ ):
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="GET",
+ path="/acore/prompt_cache/stats",
+ json_body=None,
+ )
+
+
+ @app.post("/acore/prompt_cache/set")
+ def acore_prompt_cache_set(req: PromptCacheSetProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/set",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/update")
+ def acore_prompt_cache_update(req: PromptCacheUpdateProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/update",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/fork")
+ def acore_prompt_cache_fork(req: PromptCacheForkProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/fork",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/clear")
+ def acore_prompt_cache_clear(req: PromptCacheClearProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/clear",
+ json_body=body,
+ )
+
+
+ @app.post("/acore/prompt_cache/prepare_modules")
+ def acore_prompt_cache_prepare_modules(req: PromptCachePrepareModulesProxyRequest):
+ body = req.model_dump(exclude_none=True)
+ base_url = body.pop("base_url", None)
+ api_key = body.pop("api_key", None)
+ return _proxy_prompt_cache_request(
+ base_url=base_url,
+ api_key=api_key,
+ method="POST",
+ path="/acore/prompt_cache/prepare_modules",
+ json_body=body,
+ )
+
+
  @app.get("/v1/models")
  async def list_models(
  provider: Optional[str] = Query(
  None,
  description="Filter by provider (e.g., 'ollama', 'openai', 'anthropic', 'lmstudio')",
- example=""
  ),
  input_type: Optional[ModelInputCapability] = Query(
  None,
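End to end, the proxy means a client only talks to this server and names the upstream AbstractEndpoint per call. A sketch of seeding and inspecting a cache entry; the upstream URL, key and gateway address are placeholders:

```python
# Hypothetical control-plane calls routed through the proxy endpoints above; the
# server strips a trailing /v1 from base_url and forwards to /acore/prompt_cache/*.
import httpx

GATEWAY = "http://localhost:8000"
UPSTREAM = {"base_url": "http://localhost:8001/v1"}  # placeholder AbstractEndpoint

seeded = httpx.post(
    f"{GATEWAY}/acore/prompt_cache/set",
    json={**UPSTREAM, "key": "tenantA:system-prompt", "make_default": True},
    timeout=30.0,
).json()
print("set:", seeded)

stats = httpx.get(
    f"{GATEWAY}/acore/prompt_cache/stats",
    params=UPSTREAM,
    timeout=30.0,
).json()
print("stats:", stats)
```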
@@ -1316,6 +1661,16 @@ async def create_response(
  detail={"error": {"message": "Request must contain either 'input' (OpenAI format) or 'messages' (legacy format)", "type": "invalid_request"}}
  )

+ # AbstractCore extension: allow opt-in unload-after-request even for OpenAI Responses format.
+ if "unload_after" in request_data:
+ try:
+ chat_request = chat_request.model_copy(update={"unload_after": _parse_boolish(request_data.get("unload_after"))})
+ except Exception as e:
+ raise HTTPException(
+ status_code=422,
+ detail={"error": {"message": f"Invalid unload_after value: {e}", "type": "validation_error"}},
+ )
+
  # Respect user's streaming preference (defaults to False)

  # Process using our standard pipeline
@@ -2023,11 +2378,16 @@ async def process_chat_completion(

  # Detect target format for tool call syntax
  target_format = detect_target_format(f"{provider}/{model}", request, http_request)
+ user_agent_raw = http_request.headers.get("user-agent", "")
+ user_agent = str(user_agent_raw or "")
+ if len(user_agent) > 50:
+ #[WARNING:TRUNCATION] bounded user-agent capture for request logs
+ user_agent = user_agent[:50].rstrip() + "…"
  logger.info(
  "🎯 Target Format Detected",
  request_id=request_id,
  target_format=target_format.value,
- user_agent=http_request.headers.get("user-agent", "")[:50]
+ user_agent=user_agent,
  )

  # Process media from messages
@@ -2052,11 +2412,14 @@ async def process_chat_completion(
  # Validate media files if any were found
  if all_media_files:
  validate_media_files(all_media_files)
+ #[WARNING:TRUNCATION] bounded filename preview for request logs
+ files_preview = [os.path.basename(f) for f in all_media_files[:5]]
  logger.info(
  "📎 Media Files Processed",
  request_id=request_id,
  file_count=len(all_media_files),
- files=[os.path.basename(f) for f in all_media_files[:5]] # Log first 5 filenames
+ files=files_preview,
+ files_truncated=len(all_media_files) > 5,
  )

  # Create LLM instance
@@ -2067,6 +2430,13 @@ async def process_chat_completion(
  # Enable trace capture (trace_id) without retaining full trace buffers by default.
  provider_kwargs["enable_tracing"] = True
  provider_kwargs.setdefault("max_traces", 0)
+ if request.api_key:
+ provider_kwargs["api_key"] = request.api_key
+ logger.debug(
+ "🔑 Custom API Key Provided",
+ request_id=request_id,
+ provider=provider
+ )
  if request.base_url:
  provider_kwargs["base_url"] = request.base_url
  logger.info(
@@ -2079,7 +2449,28 @@ async def process_chat_completion(
  # Note: BaseProvider treats non-positive values as "unlimited".
  provider_kwargs["timeout"] = request.timeout_s

+ provider_normalized = provider.strip().lower()
+ unload_after_requested = bool(getattr(request, "unload_after", False))
+ allow_unsafe_unload_after = _parse_bool_env("ABSTRACTCORE_ALLOW_UNSAFE_UNLOAD_AFTER")
+ if unload_after_requested and provider_normalized == "ollama" and not allow_unsafe_unload_after:
+ raise HTTPException(
+ status_code=403,
+ detail={
+ "error": {
+ "message": (
+ "unload_after=true is disabled for provider 'ollama' because it can unload shared server "
+ "state and disrupt other clients. Set ABSTRACTCORE_ALLOW_UNSAFE_UNLOAD_AFTER=1 to enable."
+ ),
+ "type": "forbidden",
+ }
+ },
+ )
+
  llm = create_llm(provider, model=model, **provider_kwargs)
+ ollama_key: Optional[Tuple[str, str, str]] = None
+ if provider_normalized == "ollama":
+ ollama_key = _ollama_inflight_key(provider, request.base_url, model)
+ _ollama_inflight_enter(ollama_key)

  # Convert messages
  messages = convert_to_abstractcore_messages(processed_messages)
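From the client's perspective the Ollama policy is visible as an explicit 403 rather than a silent no-op. A defensive client sketch that retries without the flag when policy refuses it (endpoint and model string are placeholders):

```python
# Hypothetical client handling: if unload_after is refused by server policy (HTTP 403),
# retry the same request without it instead of failing the whole job.
import httpx

def chat(payload: dict) -> httpx.Response:
    return httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=300.0)

payload = {
    "model": "ollama/llama3.1:8b",  # placeholder model string
    "messages": [{"role": "user", "content": "ping"}],
    "unload_after": True,
}
resp = chat(payload)
if resp.status_code == 403:
    resp = chat({k: v for k, v in payload.items() if k != "unload_after"})
print(resp.status_code)
```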
@@ -2103,6 +2494,8 @@ async def process_chat_completion(
  gen_kwargs["trace_metadata"] = trace_metadata

  # Add optional parameters
+ if request.thinking is not None:
+ gen_kwargs["thinking"] = request.thinking
  if request.stop:
  gen_kwargs["stop"] = request.stop
  if request.seed:
@@ -2111,6 +2504,8 @@ async def process_chat_completion(
  gen_kwargs["frequency_penalty"] = request.frequency_penalty
  if request.presence_penalty:
  gen_kwargs["presence_penalty"] = request.presence_penalty
+ if isinstance(request.prompt_cache_key, str) and request.prompt_cache_key.strip():
+ gen_kwargs["prompt_cache_key"] = request.prompt_cache_key.strip()

  # Generate response
  # Only cleanup files created by this request (with our specific prefixes)
@@ -2128,7 +2523,16 @@ async def process_chat_completion(
  if request.stream:
  return StreamingResponse(
  generate_streaming_response(
- llm, gen_kwargs, provider, model, syntax_rewriter, request_id, temp_files_to_cleanup
+ llm,
+ gen_kwargs,
+ provider,
+ model,
+ syntax_rewriter,
+ request_id,
+ temp_files_to_cleanup,
+ unload_after=unload_after_requested,
+ ollama_key=ollama_key,
+ allow_unsafe_unload_after=allow_unsafe_unload_after,
  ),
  media_type="text/event-stream",
  headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
@@ -2148,9 +2552,22 @@ async def process_chat_completion(
  )
  return openai_response
  finally:
- # Cleanup temporary files (base64 and downloaded images) with delay to avoid race conditions
- import threading
+ if not request.stream:
+ if provider_normalized == "ollama" and ollama_key is not None:
+ should_unload = _ollama_inflight_exit(ollama_key, unload_after_requested=unload_after_requested)
+ if should_unload and allow_unsafe_unload_after:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)
+ elif should_unload:
+ logger.warning(
+ "⚠️ Unload requested but disabled by server policy",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ )
+ elif unload_after_requested:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)

+ # Cleanup temporary files (base64 and downloaded images) with delay to avoid race conditions
  def delayed_cleanup():
  """Cleanup temporary files after a short delay to avoid race conditions"""
  time.sleep(1) # Short delay to ensure generation is complete
@@ -2170,6 +2587,8 @@ async def process_chat_completion(
  cleanup_thread = threading.Thread(target=delayed_cleanup, daemon=True)
  cleanup_thread.start()

+ except HTTPException:
+ raise
  except Exception as e:
  logger.error(
  "❌ Chat completion failed",
@@ -2189,9 +2608,14 @@ def generate_streaming_response(
  model: str,
  syntax_rewriter: ToolCallSyntaxRewriter,
  request_id: str,
- temp_files_to_cleanup: List[str] = None
+ temp_files_to_cleanup: List[str] = None,
+ *,
+ unload_after: bool = False,
+ ollama_key: Optional[Tuple[str, str, str]] = None,
+ allow_unsafe_unload_after: bool = False,
  ) -> Iterator[str]:
  """Generate OpenAI-compatible streaming response with syntax rewriting."""
+ provider_normalized = provider.strip().lower()
  try:
  chat_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
  created_time = int(time.time())
@@ -2324,6 +2748,32 @@ def generate_streaming_response(
  )
  error_chunk = {"error": {"message": str(e), "type": "server_error"}}
  yield f"data: {json.dumps(error_chunk)}\n\n"
+ finally:
+ if provider_normalized == "ollama" and ollama_key is not None:
+ try:
+ should_unload = _ollama_inflight_exit(ollama_key, unload_after_requested=unload_after)
+ except Exception as e:
+ logger.warning(
+ "⚠️ Failed to update in-flight unload state",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ error=str(e),
+ error_type=type(e).__name__,
+ )
+ should_unload = False
+
+ if should_unload and allow_unsafe_unload_after:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)
+ elif should_unload:
+ logger.warning(
+ "⚠️ Unload requested but disabled by server policy",
+ request_id=request_id,
+ provider=provider,
+ model=model,
+ )
+ elif unload_after:
+ _best_effort_unload(llm, request_id=request_id, provider=provider, model=model)

  def convert_to_openai_response(
  response,
@@ -2407,7 +2857,7 @@ def convert_to_openai_response(
  def run_server(host: str = "0.0.0.0", port: int = 8000):
  """Run the server"""
  import uvicorn
- uvicorn.run(app, host=host, port=port)
+ uvicorn.run(app, host=host, port=port, log_level="error")

  # ============================================================================
  # Server Runner Function
@@ -2476,7 +2926,7 @@ Debug Mode:
  "app": app,
  "host": args.host,
  "port": args.port,
- "log_level": "debug" if debug_mode else "info"
+ "log_level": "debug" if debug_mode else "error",
  }

  # In debug mode, enable more detailed uvicorn logging