abstractcore 2.9.1__py3-none-any.whl → 2.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/deepsearch.py +9 -4
  3. abstractcore/apps/extractor.py +33 -100
  4. abstractcore/apps/intent.py +19 -0
  5. abstractcore/apps/judge.py +20 -1
  6. abstractcore/apps/summarizer.py +20 -1
  7. abstractcore/architectures/detection.py +34 -1
  8. abstractcore/architectures/response_postprocessing.py +313 -0
  9. abstractcore/assets/architecture_formats.json +38 -8
  10. abstractcore/assets/model_capabilities.json +882 -160
  11. abstractcore/compression/__init__.py +1 -2
  12. abstractcore/compression/glyph_processor.py +6 -4
  13. abstractcore/config/main.py +52 -20
  14. abstractcore/config/manager.py +390 -12
  15. abstractcore/config/vision_config.py +5 -5
  16. abstractcore/core/interface.py +151 -3
  17. abstractcore/core/session.py +16 -10
  18. abstractcore/download.py +1 -1
  19. abstractcore/embeddings/manager.py +20 -6
  20. abstractcore/endpoint/__init__.py +2 -0
  21. abstractcore/endpoint/app.py +458 -0
  22. abstractcore/mcp/client.py +3 -1
  23. abstractcore/media/__init__.py +52 -17
  24. abstractcore/media/auto_handler.py +42 -22
  25. abstractcore/media/base.py +44 -1
  26. abstractcore/media/capabilities.py +12 -33
  27. abstractcore/media/enrichment.py +105 -0
  28. abstractcore/media/handlers/anthropic_handler.py +19 -28
  29. abstractcore/media/handlers/local_handler.py +124 -70
  30. abstractcore/media/handlers/openai_handler.py +19 -31
  31. abstractcore/media/processors/__init__.py +4 -2
  32. abstractcore/media/processors/audio_processor.py +57 -0
  33. abstractcore/media/processors/office_processor.py +8 -3
  34. abstractcore/media/processors/pdf_processor.py +46 -3
  35. abstractcore/media/processors/text_processor.py +22 -24
  36. abstractcore/media/processors/video_processor.py +58 -0
  37. abstractcore/media/types.py +97 -4
  38. abstractcore/media/utils/image_scaler.py +20 -2
  39. abstractcore/media/utils/video_frames.py +219 -0
  40. abstractcore/media/vision_fallback.py +136 -22
  41. abstractcore/processing/__init__.py +32 -3
  42. abstractcore/processing/basic_deepsearch.py +15 -10
  43. abstractcore/processing/basic_intent.py +3 -2
  44. abstractcore/processing/basic_judge.py +3 -2
  45. abstractcore/processing/basic_summarizer.py +1 -1
  46. abstractcore/providers/__init__.py +3 -1
  47. abstractcore/providers/anthropic_provider.py +95 -8
  48. abstractcore/providers/base.py +1516 -81
  49. abstractcore/providers/huggingface_provider.py +546 -69
  50. abstractcore/providers/lmstudio_provider.py +30 -916
  51. abstractcore/providers/mlx_provider.py +382 -35
  52. abstractcore/providers/model_capabilities.py +5 -1
  53. abstractcore/providers/ollama_provider.py +99 -15
  54. abstractcore/providers/openai_compatible_provider.py +406 -180
  55. abstractcore/providers/openai_provider.py +188 -44
  56. abstractcore/providers/openrouter_provider.py +76 -0
  57. abstractcore/providers/registry.py +61 -5
  58. abstractcore/providers/streaming.py +138 -33
  59. abstractcore/providers/vllm_provider.py +92 -817
  60. abstractcore/server/app.py +478 -28
  61. abstractcore/server/audio_endpoints.py +139 -0
  62. abstractcore/server/vision_endpoints.py +1319 -0
  63. abstractcore/structured/handler.py +316 -41
  64. abstractcore/tools/common_tools.py +5501 -2012
  65. abstractcore/tools/comms_tools.py +1641 -0
  66. abstractcore/tools/core.py +37 -7
  67. abstractcore/tools/handler.py +4 -9
  68. abstractcore/tools/parser.py +49 -2
  69. abstractcore/tools/tag_rewriter.py +2 -1
  70. abstractcore/tools/telegram_tdlib.py +407 -0
  71. abstractcore/tools/telegram_tools.py +261 -0
  72. abstractcore/utils/cli.py +1085 -72
  73. abstractcore/utils/structured_logging.py +29 -8
  74. abstractcore/utils/token_utils.py +2 -0
  75. abstractcore/utils/truncation.py +29 -0
  76. abstractcore/utils/version.py +3 -4
  77. abstractcore/utils/vlm_token_calculator.py +12 -2
  78. abstractcore-2.11.4.dist-info/METADATA +562 -0
  79. abstractcore-2.11.4.dist-info/RECORD +133 -0
  80. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/WHEEL +1 -1
  81. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/entry_points.txt +1 -0
  82. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  83. abstractcore-2.9.1.dist-info/RECORD +0 -119
  84. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/licenses/LICENSE +0 -0
  85. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/top_level.txt +0 -0
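
The bulk of these changes lands in abstractcore/utils/cli.py, diffed below. For orientation only, here is a minimal sketch of the provider/session calls that the new CLI code exercises; the call names and keyword arguments mirror the diff, but the top-level import path, the model choice, and the token values are assumptions rather than a verified public API.

    # Sketch only: mirrors calls visible in the cli.py diff below; not a verified example.
    from abstractcore import create_llm, BasicSession          # import path assumed
    from abstractcore.tools.common_tools import list_files, read_file

    provider = create_llm(
        "ollama",                   # provider/model pair taken from the Usage examples below
        model="qwen3-coder:30b",
        max_tokens=16384,           # context budget; the CLI auto-detects this when omitted
        max_output_tokens=2048,     # per-response output cap (added to the CLI in this version range)
    )
    session = BasicSession(
        provider,
        system_prompt="You are a helpful AI assistant.",
        tools=[list_files, read_file],
    )
    # stream / media / thinking / max_output_tokens are forwarded per call by the new CLI code;
    # thinking support is best-effort and provider/model dependent, as the diff's comments note.
    response = session.generate("What is Python?", stream=False, max_output_tokens=512)
    print(getattr(response, "content", response))
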
abstractcore/utils/cli.py CHANGED
@@ -16,14 +16,23 @@ AbstractCore framework directly.
16
16
 
17
17
  Usage:
18
18
  python -m abstractcore.utils.cli --provider ollama --model qwen3-coder:30b
19
- python -m abstractcore.utils.cli --provider openai --model gpt-4o-mini --stream
20
- python -m abstractcore.utils.cli --provider anthropic --model claude-3-5-haiku-20241022 --prompt "What is Python?"
19
+ python -m abstractcore.utils.cli --provider openai --model gpt-5-mini --stream
20
+ python -m abstractcore.utils.cli --provider anthropic --model claude-haiku-4-5 --prompt "What is Python?"
21
+ python -m abstractcore.utils.cli --provider lmstudio --model qwen/qwen3-4b-2507 --base-url http://localhost:1234/v1
22
+ python -m abstractcore.utils.cli --provider openrouter --model openai/gpt-4o-mini
21
23
  """
22
24
 
23
25
  import argparse
26
+ import os
24
27
  import sys
25
28
  import time
26
- from typing import Optional
29
+ import uuid
30
+ import locale
31
+ from datetime import datetime
32
+ from pathlib import Path
33
+ from typing import Optional, Any, Dict, Iterator, List, Union
34
+
35
+ from .truncation import preview_text
27
36
 
28
37
  # Enable command history and arrow key navigation
29
38
  try:
@@ -42,11 +51,30 @@ from ..tools.common_tools import list_files, read_file, write_file, execute_comm
42
51
  from ..processing import BasicExtractor, BasicJudge, BasicIntentAnalyzer
43
52
 
44
53
 
54
+ class _NoPromptCacheProvider:
55
+ """Proxy that forces `prompt_cache_key=None` for every call (to avoid polluting KV caches)."""
56
+
57
+ def __init__(self, provider: Any):
58
+ self._provider = provider
59
+
60
+ def generate(self, *args: Any, **kwargs: Any):
61
+ kwargs["prompt_cache_key"] = None
62
+ return self._provider.generate(*args, **kwargs)
63
+
64
+ async def agenerate(self, *args: Any, **kwargs: Any):
65
+ kwargs["prompt_cache_key"] = None
66
+ return await self._provider.agenerate(*args, **kwargs)
67
+
68
+ def __getattr__(self, name: str) -> Any:
69
+ return getattr(self._provider, name)
70
+
71
+
45
72
  class SimpleCLI:
46
73
  """Simplified CLI REPL for AbstractCore"""
47
74
 
48
75
  def __init__(self, provider: str, model: str, stream: bool = False,
49
- max_tokens: int = None, debug: bool = False, show_banner: bool = True, **kwargs):
76
+ max_tokens: int = None, max_output_tokens: int = None,
77
+ debug: bool = False, show_banner: bool = True, **kwargs):
50
78
  self.provider_name = provider
51
79
  self.model_name = model
52
80
  self.stream_mode = stream
@@ -55,6 +83,7 @@ class SimpleCLI:
55
83
  self.kwargs = kwargs
56
84
 
57
85
  # Auto-detect max_tokens from model capabilities if not specified
86
+ self.max_tokens_auto = max_tokens is None
58
87
  if max_tokens is None:
59
88
  try:
60
89
  from ..architectures.detection import get_model_capabilities
@@ -68,18 +97,41 @@ class SimpleCLI:
68
97
  print(f"⚠️ Failed to auto-detect max_tokens, using fallback: {max_tokens} ({e})")
69
98
 
70
99
  self.max_tokens = max_tokens
100
+ self.max_output_tokens_auto = max_output_tokens is None
101
+ # Unified thinking/reasoning control (best-effort, provider/model dependent).
102
+ # - None: auto (provider/model default)
103
+ # - bool: on/off
104
+ # - str: "low"|"medium"|"high" when supported
105
+ self.thinking: Optional[Union[bool, str]] = None
106
+ # Whether to display model-supplied reasoning/thinking separately.
107
+ # - None: auto (show when thinking != off)
108
+ # - bool: force on/off
109
+ self.show_reasoning: Optional[bool] = None
71
110
 
72
111
  # Initialize command history with persistent storage
73
112
  self._setup_command_history()
74
113
 
75
114
  # Initialize provider and session with tools
76
- self.provider = create_llm(provider, model=model, max_tokens=max_tokens, **kwargs)
115
+ provider_kwargs = dict(kwargs)
116
+ provider_kwargs["max_tokens"] = max_tokens
117
+ if max_output_tokens is not None:
118
+ provider_kwargs["max_output_tokens"] = max_output_tokens
119
+ self.provider = create_llm(provider, model=model, **provider_kwargs)
120
+ # Store the effective max_output_tokens (provider may auto-select based on model capabilities).
121
+ self.max_output_tokens = getattr(self.provider, "max_output_tokens", max_output_tokens or 2048)
77
122
  self.session = BasicSession(
78
123
  self.provider,
79
124
  system_prompt="You are a helpful AI assistant with vision capabilities. When users provide images or media files, analyze and describe them directly. You also have access to file operation tools.",
80
125
  tools=[list_files, read_file, write_file, execute_command, search_files]
81
126
  )
82
127
 
128
+ # Prompt caching (best-effort; provider-dependent).
129
+ self.country_code = self._get_country_code()
130
+ self.prompt_cache_mode = "off" # off | key | kv
131
+ self.prompt_cache_key: Optional[str] = None
132
+ self.prompt_cache_file: Optional[str] = None
133
+ self._init_prompt_caching(show_banner=show_banner)
134
+
83
135
  # Only show banner in interactive mode
84
136
  if show_banner:
85
137
  print("=" * 70)
@@ -89,7 +141,7 @@ class SimpleCLI:
89
141
  print(f"📝 Model: {model}")
90
142
  print(f"🌊 Streaming: {'ON' if stream else 'OFF'} | 🐛 Debug: {'ON' if debug else 'OFF'}")
91
143
  print()
92
- print("💬 Quick Commands: /help /save /load /status /history /quit")
144
+ print("💬 Quick Commands: /help /session /cache /status /history /quit")
93
145
  print("🛠️ Available Tools: list_files, search_files, read_file, write_file, execute_command")
94
146
  print()
95
147
  print("💡 Type '/help' for comprehensive command guide")
@@ -158,7 +210,8 @@ class SimpleCLI:
158
210
  print("─" * 50)
159
211
  print(" /help Show this comprehensive help")
160
212
  print(" /quit Exit the CLI")
161
- print(" /clear Clear the screen (like unix terminal)")
213
+ print(" /clear Clear prompt cache + context (like mlx-chat)")
214
+ print(" /cls Clear the screen (like unix terminal)")
162
215
  print(" /reset Reset conversation history")
163
216
  print(" /status Show system status and capabilities")
164
217
 
@@ -175,17 +228,25 @@ class SimpleCLI:
175
228
  print(" • /system - Show current prompt")
176
229
  print(" • /system <text> - Set new prompt")
177
230
 
178
- print("\n💾 SESSION PERSISTENCE")
231
+ print("\n💾 SESSION & CACHE")
179
232
  print("─" * 50)
180
- print(" /save <file> [options] Save session with optional analytics")
181
- print(" • /save chat.json")
182
- print(" • /save analyzed --summary --assessment --facts")
233
+ print(" /session save <name> [options] Save session to <name>.json with optional analytics")
234
+ print(" • /session save chat")
235
+ print(" • /session save analyzed --summary --assessment --facts")
183
236
  print(" Options:")
184
237
  print(" --summary Generate conversation summary")
185
238
  print(" --assessment Evaluate conversation quality")
186
239
  print(" --facts Extract knowledge as facts")
187
- print(" /load <file> Load saved session (replaces current)")
188
- print(" • /load chat.json")
240
+ print(" /session load <name> Load session from <name>.json (replaces current)")
241
+ print(" • /session load chat")
242
+ print(" /session clear Clear session + cache (same as /clear)")
243
+ print(" /save /load Aliases for /session save|load (sessions only)")
244
+ print(" /cache save <name> Save prompt/KV cache to <name>.safetensors (MLX only, model-locked)")
245
+ print(" • /cache save chat_cache")
246
+ print(" --q8 Quantize cache before saving (smaller, lossy)")
247
+ print(" /cache load <name> Load prompt/KV cache from <name>.safetensors (MLX only, model-locked)")
248
+ print(" • /cache load chat_cache")
249
+ print(" /cache clear Clear prompt cache only (KV mode rebuilds from transcript)")
189
250
 
190
251
  print("\n📊 ANALYTICS & INSIGHTS")
191
252
  print("─" * 50)
@@ -201,8 +262,15 @@ class SimpleCLI:
201
262
  print("\n⚙️ CONFIGURATION")
202
263
  print("─" * 50)
203
264
  print(" /model <provider:model> Switch LLM provider/model")
204
- print(" • /model openai:gpt-4o-mini")
205
- print(" • /model anthropic:claude-3-5-haiku")
265
+ print(" • /model openai:gpt-5-mini")
266
+ print(" • /model anthropic:claude-haiku-4-5")
267
+ print(" • /model openrouter:openai/gpt-4o-mini")
268
+ print(" /max-tokens <n|auto> Set context token budget")
269
+ print(" /max-output-tokens <n|auto> Set max output tokens per response")
270
+ print(" /thinking <mode> Set thinking/reasoning mode (best-effort)")
271
+ print(" • /thinking auto|on|off|low|medium|high")
272
+ print(" /show-reasoning <mode> Display reasoning separately (auto/on/off)")
273
+ print(" • /show-reasoning auto|on|off")
206
274
  print(" /stream Toggle streaming mode on/off")
207
275
  print(" /debug Toggle debug info (timing, detection)")
208
276
 
@@ -231,7 +299,7 @@ class SimpleCLI:
231
299
  print(" • Search inside files: 'Find all TODO comments in Python files'")
232
300
  print(" • Request file operations: 'Read the README.md file'")
233
301
  print(" • Attach files: 'What's in this image? @photo.jpg'")
234
- print(" • Save important conversations: '/save project_discussion --summary'")
302
+ print(" • Save important conversations: '/session save project_discussion --summary'")
235
303
  print(" • Switch models for different tasks: '/model ollama:qwen3-coder:30b'")
236
304
  print(" • Use /status to check token usage and model capabilities")
237
305
 
@@ -240,13 +308,17 @@ class SimpleCLI:
240
308
  print("=" * 70 + "\n")
241
309
 
242
310
  elif cmd == 'clear':
243
- # Clear the screen like in unix terminal
244
- import os
245
- os.system('cls' if os.name == 'nt' else 'clear')
311
+ self.handle_clear()
312
+
313
+ elif cmd == 'cls':
314
+ self._clear_screen()
246
315
 
247
316
  elif cmd == 'reset':
248
- self.session.clear_history(keep_system=True)
249
- print("🧹 Chat history reset")
317
+ if self.prompt_cache_mode == "kv":
318
+ self.handle_clear()
319
+ else:
320
+ self.session.clear_history(keep_system=True)
321
+ print("🧹 Chat history reset")
250
322
 
251
323
  elif cmd == 'stream':
252
324
  self.stream_mode = not self.stream_mode
@@ -260,6 +332,134 @@ class SimpleCLI:
260
332
  elif cmd == 'status':
261
333
  self.handle_status()
262
334
 
335
+ elif cmd.startswith('thinking'):
336
+ parts = cmd.split(maxsplit=1)
337
+ if len(parts) == 1:
338
+ current = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
339
+ print(f"🧠 thinking: {current}")
340
+ print("❓ Usage: /thinking <auto|on|off|low|medium|high>")
341
+ return True
342
+
343
+ raw = parts[1].strip().lower()
344
+ if raw in {"auto", "none", "null"}:
345
+ self.thinking = None
346
+ elif raw in {"on", "true", "1", "yes"}:
347
+ self.thinking = True
348
+ elif raw in {"off", "false", "0", "no"}:
349
+ self.thinking = False
350
+ elif raw in {"low", "medium", "high"}:
351
+ self.thinking = raw
352
+ else:
353
+ print("❓ Usage: /thinking <auto|on|off|low|medium|high>")
354
+ return True
355
+
356
+ current = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
357
+ print(f"✅ thinking set to: {current}")
358
+ return True
359
+
360
+ elif cmd.startswith('show-reasoning') or cmd.startswith('reasoning'):
361
+ parts = cmd.split(maxsplit=1)
362
+ if len(parts) == 1:
363
+ current = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
364
+ print(f"🧠 show-reasoning: {current}")
365
+ print("❓ Usage: /show-reasoning <auto|on|off>")
366
+ return True
367
+
368
+ raw = parts[1].strip().lower()
369
+ if raw in {"auto", "none", "null"}:
370
+ self.show_reasoning = None
371
+ elif raw in {"on", "true", "1", "yes"}:
372
+ self.show_reasoning = True
373
+ elif raw in {"off", "false", "0", "no"}:
374
+ self.show_reasoning = False
375
+ else:
376
+ print("❓ Usage: /show-reasoning <auto|on|off>")
377
+ return True
378
+
379
+ current = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
380
+ print(f"✅ show-reasoning set to: {current}")
381
+ return True
382
+
383
+ elif cmd.startswith('max-tokens'):
384
+ parts = cmd.split()
385
+ if len(parts) == 1:
386
+ print(f"💾 max_tokens (context budget): {self.max_tokens:,} ({'auto' if self.max_tokens_auto else 'manual'})")
387
+ print("❓ Usage: /max-tokens <n|auto>")
388
+ else:
389
+ raw_value = parts[1].strip().lower()
390
+ if raw_value in {"auto", "-1"}:
391
+ try:
392
+ from ..architectures.detection import get_model_capabilities
393
+ capabilities = get_model_capabilities(self.model_name)
394
+ detected = capabilities.get('max_tokens', 16384)
395
+ except Exception:
396
+ detected = 16384
397
+ self.max_tokens = int(detected)
398
+ self.max_tokens_auto = True
399
+ else:
400
+ try:
401
+ new_max = int(raw_value)
402
+ if new_max <= 0:
403
+ raise ValueError
404
+ self.max_tokens = new_max
405
+ self.max_tokens_auto = False
406
+ except ValueError:
407
+ print("❓ Usage: /max-tokens <n|auto> (n must be a positive integer)")
408
+ return True
409
+
410
+ # Apply to current provider (best-effort; mostly used for token budgeting/compaction).
411
+ try:
412
+ setattr(self.provider, "max_tokens", self.max_tokens)
413
+ except Exception:
414
+ pass
415
+
416
+ # Safety clamp: output should not exceed total budget.
417
+ if isinstance(self.max_output_tokens, int) and self.max_output_tokens > int(self.max_tokens):
418
+ self.max_output_tokens = int(self.max_tokens)
419
+ try:
420
+ setattr(self.provider, "max_output_tokens", self.max_output_tokens)
421
+ except Exception:
422
+ pass
423
+
424
+ print(f"✅ max_tokens set to {self.max_tokens:,}")
425
+
426
+ elif cmd.startswith('max-output-tokens'):
427
+ parts = cmd.split()
428
+ if len(parts) == 1:
429
+ print(f"✍️ max_output_tokens (per response): {self.max_output_tokens:,} ({'auto' if self.max_output_tokens_auto else 'manual'})")
430
+ print("❓ Usage: /max-output-tokens <n|auto>")
431
+ else:
432
+ raw_value = parts[1].strip().lower()
433
+ if raw_value in {"auto", "-1"}:
434
+ try:
435
+ from ..architectures.detection import get_model_capabilities
436
+ capabilities = get_model_capabilities(self.model_name)
437
+ detected = capabilities.get('max_output_tokens', getattr(self.provider, "max_output_tokens", 2048))
438
+ except Exception:
439
+ detected = getattr(self.provider, "max_output_tokens", 2048)
440
+ self.max_output_tokens = int(detected)
441
+ self.max_output_tokens_auto = True
442
+ else:
443
+ try:
444
+ new_max = int(raw_value)
445
+ if new_max <= 0:
446
+ raise ValueError
447
+ self.max_output_tokens = new_max
448
+ self.max_output_tokens_auto = False
449
+ except ValueError:
450
+ print("❓ Usage: /max-output-tokens <n|auto> (n must be a positive integer)")
451
+ return True
452
+
453
+ # Safety clamp: output should not exceed total budget.
454
+ if isinstance(self.max_tokens, int) and self.max_output_tokens > int(self.max_tokens):
455
+ self.max_output_tokens = int(self.max_tokens)
456
+
457
+ try:
458
+ setattr(self.provider, "max_output_tokens", self.max_output_tokens)
459
+ except Exception:
460
+ pass
461
+ print(f"✅ max_output_tokens set to {self.max_output_tokens:,}")
462
+
263
463
  elif cmd.startswith('history'):
264
464
  # Parse /history [n] command
265
465
  parts = cmd.split()
@@ -282,13 +482,46 @@ class SimpleCLI:
282
482
  self.model_name = model_spec
283
483
 
284
484
  print(f"🔄 Switching to {self.provider_name}:{self.model_name}...")
485
+ # If token limits were auto-detected, re-detect them for the new model.
486
+ next_max_tokens = self.max_tokens
487
+ if self.max_tokens_auto:
488
+ try:
489
+ from ..architectures.detection import get_model_capabilities
490
+ capabilities = get_model_capabilities(self.model_name)
491
+ next_max_tokens = int(capabilities.get('max_tokens', 16384))
492
+ except Exception:
493
+ next_max_tokens = 16384
494
+
495
+ next_max_output_tokens = self.max_output_tokens
496
+ if self.max_output_tokens_auto:
497
+ try:
498
+ from ..architectures.detection import get_model_capabilities
499
+ capabilities = get_model_capabilities(self.model_name)
500
+ next_max_output_tokens = int(capabilities.get('max_output_tokens', self.max_output_tokens))
501
+ except Exception:
502
+ next_max_output_tokens = self.max_output_tokens
503
+
504
+ # Safety clamp: output should not exceed total budget.
505
+ if isinstance(next_max_tokens, int) and isinstance(next_max_output_tokens, int):
506
+ if next_max_output_tokens > next_max_tokens:
507
+ next_max_output_tokens = next_max_tokens
508
+
285
509
  self.provider = create_llm(self.provider_name, model=self.model_name,
286
- max_tokens=self.max_tokens, **self.kwargs)
510
+ max_tokens=next_max_tokens,
511
+ max_output_tokens=next_max_output_tokens,
512
+ **self.kwargs)
513
+ self.max_tokens = next_max_tokens
514
+ self.max_output_tokens = getattr(self.provider, "max_output_tokens", next_max_output_tokens)
287
515
  self.session = BasicSession(
288
516
  self.provider,
289
517
  system_prompt="You are a helpful AI assistant with vision capabilities. When users provide images or media files, analyze and describe them directly. You also have access to file operation tools.",
290
518
  tools=[list_files, read_file, write_file, execute_command, search_files]
291
519
  )
520
+ # Reset caching state for the new provider+model.
521
+ self.prompt_cache_key = None
522
+ self.prompt_cache_file = None
523
+ self.prompt_cache_mode = "off"
524
+ self._init_prompt_caching(show_banner=False)
292
525
  print("✅ Model switched")
293
526
  except Exception as e:
294
527
  print(f"❌ Failed to switch: {e}")
@@ -345,12 +578,87 @@ class SimpleCLI:
345
578
  else:
346
579
  self.handle_system_show()
347
580
 
581
+ elif cmd.startswith('session'):
582
+ # /session save|load|clear ...
583
+ parts = cmd.split()
584
+ if len(parts) < 2:
585
+ print("❓ Usage: /session <save|load|clear> ...")
586
+ print(" Examples:")
587
+ print(" /session save my_conversation")
588
+ print(" /session save analyzed_session --summary --assessment --facts")
589
+ print(" /session load my_conversation")
590
+ print(" /session clear")
591
+ return True
592
+
593
+ action = parts[1].strip().lower()
594
+ if action == "save":
595
+ if len(parts) < 3:
596
+ print("❓ Usage: /session save <name> [--summary] [--assessment] [--facts]")
597
+ return True
598
+ filename = parts[2]
599
+ options = {
600
+ 'summary': '--summary' in parts[3:],
601
+ 'assessment': '--assessment' in parts[3:],
602
+ 'facts': '--facts' in parts[3:],
603
+ }
604
+ self.handle_save(filename, **options)
605
+ return True
606
+
607
+ if action == "load":
608
+ if len(parts) != 3:
609
+ print("❓ Usage: /session load <name>")
610
+ return True
611
+ self.handle_load(parts[2])
612
+ return True
613
+
614
+ if action == "clear":
615
+ self.handle_clear()
616
+ return True
617
+
618
+ print("❓ Usage: /session <save|load|clear> ...")
619
+ return True
620
+
621
+ elif cmd.startswith('cache'):
622
+ # /cache save|load|clear ...
623
+ parts = cmd.split()
624
+ if len(parts) < 2:
625
+ print("❓ Usage: /cache <save|load|clear> ...")
626
+ print(" Examples:")
627
+ print(" /cache save chat_cache")
628
+ print(" /cache load chat_cache")
629
+ print(" /cache clear")
630
+ return True
631
+
632
+ action = parts[1].strip().lower()
633
+ if action == "save":
634
+ if len(parts) < 3:
635
+ print("❓ Usage: /cache save <name> [--q8]")
636
+ return True
637
+ filename = parts[2]
638
+ self.handle_save_prompt_cache(filename, q8=("--q8" in parts[3:]))
639
+ return True
640
+
641
+ if action == "load":
642
+ if len(parts) != 3:
643
+ print("❓ Usage: /cache load <name>")
644
+ return True
645
+ self.handle_load_prompt_cache(parts[2])
646
+ return True
647
+
648
+ if action == "clear":
649
+ self.handle_cache_clear()
650
+ return True
651
+
652
+ print("❓ Usage: /cache <save|load|clear> ...")
653
+ return True
654
+
348
655
  elif cmd.startswith('save'):
349
656
  # Parse /save <file> [--summary] [--assessment] [--facts] command
350
657
  parts = cmd.split()
351
658
  if len(parts) < 2:
352
659
  print("❓ Usage: /save <filename> [--summary] [--assessment] [--facts]")
353
- print(" Example: /save my_conversation.json")
660
+ print(" Example: /save my_conversation")
661
+ print(" Hint: use /cache save <name> for prompt caches")
354
662
  print(" Example: /save analyzed_session --summary --assessment --facts")
355
663
  else:
356
664
  filename = parts[1]
@@ -366,7 +674,8 @@ class SimpleCLI:
366
674
  parts = cmd.split()
367
675
  if len(parts) != 2:
368
676
  print("❓ Usage: /load <filename>")
369
- print(" Example: /load my_conversation.json")
677
+ print(" Example: /load my_conversation")
678
+ print(" Hint: use /cache load <name> for prompt caches")
370
679
  else:
371
680
  filename = parts[1]
372
681
  self.handle_load(filename)
@@ -390,6 +699,423 @@ class SimpleCLI:
390
699
 
391
700
  return True
392
701
 
702
+ def _clear_screen(self) -> None:
703
+ os.system('cls' if os.name == 'nt' else 'clear')
704
+
705
+ def _print_error(self, msg: str) -> None:
706
+ red = "\033[31m"
707
+ reset = "\033[0m"
708
+ print(f"{red}{msg}{reset}")
709
+
710
+ def _print_warn(self, msg: str) -> None:
711
+ yellow = "\033[33m"
712
+ reset = "\033[0m"
713
+ print(f"{yellow}{msg}{reset}")
714
+
715
+ def _force_extension(self, filename: str, ext: str) -> str:
716
+ """Ensure `filename` ends with `ext` by replacing any existing suffix (best-effort)."""
717
+ ext = str(ext or "").strip()
718
+ if not ext:
719
+ return filename
720
+ if not ext.startswith("."):
721
+ ext = f".{ext}"
722
+ try:
723
+ p = Path(filename)
724
+ except Exception:
725
+ return f"{filename}{ext}"
726
+ if p.suffix:
727
+ return str(p.with_suffix(ext))
728
+ return f"{p}{ext}"
729
+
730
+ def _resolve_session_path(self, filename: str) -> Optional[str]:
731
+ """Resolve a session file path (prefers exact match, then `.json`)."""
732
+ if not isinstance(filename, str) or not filename.strip():
733
+ return None
734
+ raw = filename.strip()
735
+ candidates = [raw]
736
+ forced = self._force_extension(raw, ".json")
737
+ if forced != raw:
738
+ candidates.append(forced)
739
+ for cand in candidates:
740
+ if os.path.exists(cand):
741
+ return cand
742
+ return None
743
+
744
+ def _resolve_cache_path(self, filename: str) -> Optional[str]:
745
+ """Resolve a cache file path (prefers exact match, then `.safetensors` / `.safetensor`)."""
746
+ if not isinstance(filename, str) or not filename.strip():
747
+ return None
748
+ raw = filename.strip()
749
+ candidates = [raw]
750
+ forced = self._force_extension(raw, ".safetensors")
751
+ if forced != raw:
752
+ candidates.append(forced)
753
+ forced_alt = self._force_extension(raw, ".safetensor")
754
+ if forced_alt not in candidates:
755
+ candidates.append(forced_alt)
756
+ for cand in candidates:
757
+ if os.path.exists(cand):
758
+ return cand
759
+ return None
760
+
761
+ def _kv_cache_token_count(self, key: str) -> Optional[int]:
762
+ """Best-effort token count for the active KV cache key (MLX)."""
763
+ if not isinstance(key, str) or not key.strip():
764
+ return None
765
+ try:
766
+ cache_obj = getattr(self.provider, "_prompt_cache_store").get(key.strip())
767
+ except Exception:
768
+ cache_obj = None
769
+ if cache_obj is None:
770
+ return None
771
+ try:
772
+ tok = getattr(self.provider, "_prompt_cache_backend_token_count")(cache_obj)
773
+ return int(tok) if isinstance(tok, int) else None
774
+ except Exception:
775
+ return None
776
+
777
+ def _kv_refresh_tools_if_needed(self, *, reason: str, force: bool = False) -> bool:
778
+ """Re-inject tool specs into the active KV cache when recency or origin requires it."""
779
+ if self.prompt_cache_mode != "kv":
780
+ return False
781
+ if not self._is_mlx_provider():
782
+ return False
783
+ if not self._supports_prompt_cache():
784
+ return False
785
+ if not getattr(self.session, "tools", None):
786
+ return False
787
+
788
+ key = self.prompt_cache_key
789
+ if not isinstance(key, str) or not key.strip():
790
+ return False
791
+
792
+ # Long-context models can “forget” early tool specs; re-inject near the end when the cache is very large.
793
+ threshold_default = 50_000
794
+ try:
795
+ threshold = int(os.getenv("ABSTRACTCORE_CLI_KV_REFRESH_TOOLS_AT", str(threshold_default)))
796
+ except Exception:
797
+ threshold = threshold_default
798
+ if threshold < 0:
799
+ threshold = threshold_default
800
+
801
+ tok = self._kv_cache_token_count(key)
802
+ should = bool(force) or (isinstance(tok, int) and tok >= threshold)
803
+ if not should:
804
+ return False
805
+
806
+ try:
807
+ getattr(self.provider, "prompt_cache_update")(
808
+ key,
809
+ system_prompt=None, # tools-only system message for recency
810
+ tools=self.session.tools,
811
+ add_generation_prompt=False,
812
+ )
813
+ except Exception as e:
814
+ self._print_warn(f"⚠️ Could not refresh tools into KV cache ({reason}): {e}")
815
+ return False
816
+
817
+ if not self.single_prompt_mode:
818
+ extra = f" (~{tok:,} tokens)" if isinstance(tok, int) and tok > 0 else ""
819
+ print(f"🧰 Tools refreshed into KV cache ({reason}){extra}")
820
+ return True
821
+
822
+ def _get_country_code(self) -> str:
823
+ val = os.getenv("ABSTRACTCORE_CLI_COUNTRY")
824
+ if isinstance(val, str) and val.strip():
825
+ cc = val.strip().upper()
826
+ return cc if len(cc) == 2 else cc[:2]
827
+
828
+ # Best-effort locale fallback (e.g. "en_US" -> "US")
829
+ try:
830
+ loc = locale.getlocale()[0] or ""
831
+ except Exception:
832
+ loc = ""
833
+ if isinstance(loc, str) and "_" in loc:
834
+ cc = loc.split("_", 1)[1].strip().upper()
835
+ if cc:
836
+ return cc[:2]
837
+
838
+ return "FR"
839
+
840
+ def _timestamp_user_message(self, text: str) -> str:
841
+ ts = datetime.now().strftime("%Y/%m/%d %H:%M")
842
+ return f"[{ts} {self.country_code}] {text}"
843
+
844
+ def _supports_prompt_cache(self) -> bool:
845
+ try:
846
+ fn = getattr(self.provider, "supports_prompt_cache", None)
847
+ return bool(fn and fn())
848
+ except Exception:
849
+ return False
850
+
851
+ def _is_mlx_provider(self) -> bool:
852
+ return str(self.provider_name or "").strip().lower() == "mlx"
853
+
854
+ def _analysis_provider(self) -> Any:
855
+ """Provider to use for internal CLI analytics (never mutates KV prompt cache)."""
856
+ if self.prompt_cache_mode != "kv":
857
+ return self.provider
858
+ return _NoPromptCacheProvider(self.provider)
859
+
860
+ def _init_prompt_caching(self, *, show_banner: bool) -> None:
861
+ if not self._supports_prompt_cache():
862
+ self.prompt_cache_mode = "off"
863
+ return
864
+
865
+ # Default policy:
866
+ # - MLX: local KV cache (append-only) with explicit prefill (system+tools).
867
+ # - Other providers: key-only hint (pass-through / best-effort).
868
+ if self._is_mlx_provider():
869
+ self.prompt_cache_mode = "kv"
870
+ else:
871
+ self.prompt_cache_mode = "key"
872
+
873
+ self.prompt_cache_key = f"cli:{uuid.uuid4().hex[:12]}"
874
+ try:
875
+ ok = bool(getattr(self.provider, "prompt_cache_set")(self.prompt_cache_key, make_default=True))
876
+ except Exception:
877
+ ok = False
878
+
879
+ if not ok:
880
+ self.prompt_cache_mode = "off"
881
+ self.prompt_cache_key = None
882
+ return
883
+
884
+ if self.prompt_cache_mode == "kv":
885
+ # Prefill stable modules once so each turn can be appended safely.
886
+ try:
887
+ getattr(self.provider, "prompt_cache_update")(
888
+ self.prompt_cache_key,
889
+ system_prompt=self.session.system_prompt,
890
+ tools=self.session.tools,
891
+ add_generation_prompt=False,
892
+ )
893
+ except Exception as e:
894
+ self._print_warn(f"⚠️ Prompt cache prefill failed; falling back to key-only mode: {e}")
895
+ self.prompt_cache_mode = "key"
896
+
897
+ if show_banner:
898
+ if self.prompt_cache_mode == "kv":
899
+ print(f"🧠 Prompt caching: ON (KV local) key={self.prompt_cache_key}")
900
+ elif self.prompt_cache_mode == "key":
901
+ print(f"🧠 Prompt caching: ON (key hint) key={self.prompt_cache_key}")
902
+
903
+ def handle_clear(self) -> None:
904
+ """Clear prompt cache and context (best-effort)."""
905
+ # Clear session transcript (keep system prompt for user visibility).
906
+ self.session.clear_history(keep_system=True)
907
+
908
+ if not self._supports_prompt_cache():
909
+ print("🧹 Context cleared (prompt caching unsupported)")
910
+ return
911
+
912
+ # Clear provider-side in-process caches (best-effort).
913
+ try:
914
+ getattr(self.provider, "prompt_cache_clear")(None)
915
+ except Exception:
916
+ pass
917
+
918
+ # Re-init caching for this run.
919
+ self.prompt_cache_key = None
920
+ self.prompt_cache_file = None
921
+ self._init_prompt_caching(show_banner=False)
922
+
923
+ if self.prompt_cache_mode == "off":
924
+ print("🧹 Context cleared (prompt caching disabled)")
925
+ else:
926
+ print("🧹 Context + prompt cache cleared")
927
+
928
+ def handle_cache_clear(self) -> None:
929
+ """Clear prompt cache only (best-effort)."""
930
+ if not self._supports_prompt_cache():
931
+ print("🧹 Prompt cache cleared (prompt caching unsupported)")
932
+ return
933
+
934
+ # In KV mode the cache is the source-of-truth for model context; clearing it without clearing
935
+ # or resending history would desync the model and the transcript. Rebuild from transcript.
936
+ if self.prompt_cache_mode == "kv":
937
+ self._print_warn("⚠️ KV cache cleared; rebuilding from current session transcript")
938
+ try:
939
+ self._rebuild_kv_cache_from_session()
940
+ return
941
+ except Exception as e:
942
+ self._print_error(f"❌ KV cache rebuild failed: {e}")
943
+ self._print_warn("⚠️ Falling back to session-managed mode (no KV)")
944
+ self.prompt_cache_mode = "key"
945
+
946
+ # Key-only / remote mode: clear provider-side caches (best-effort) and rotate key.
947
+ try:
948
+ getattr(self.provider, "prompt_cache_clear")(None)
949
+ except Exception:
950
+ pass
951
+
952
+ self.prompt_cache_key = None
953
+ self.prompt_cache_file = None
954
+ self._init_prompt_caching(show_banner=False)
955
+
956
+ if self.prompt_cache_mode == "off":
957
+ print("🧹 Prompt cache cleared (prompt caching disabled)")
958
+ else:
959
+ print("🧹 Prompt cache cleared")
960
+
961
+ def handle_save_prompt_cache(self, filename: str, *, q8: bool = False) -> None:
962
+ """Save MLX prompt cache to disk (writes a `.safetensors` file; model-locked)."""
963
+ if not self._is_mlx_provider():
964
+ self._print_error("❌ KV cache save is only supported for provider 'mlx'")
965
+ return
966
+ if not self._supports_prompt_cache():
967
+ self._print_error("❌ This provider does not support prompt caching")
968
+ return
969
+ filename = self._force_extension(filename, ".safetensors")
970
+
971
+ key = self.prompt_cache_key
972
+ if not isinstance(key, str) or not key.strip():
973
+ self._print_error("❌ No active prompt cache key; start chatting first or /clear to re-init caching")
974
+ return
975
+
976
+ try:
977
+ cache_obj = getattr(self.provider, "_prompt_cache_store").get(key)
978
+ except Exception:
979
+ cache_obj = None
980
+
981
+ if cache_obj is None:
982
+ self._print_error("❌ Prompt cache is empty; nothing to save yet")
983
+ return
984
+
985
+ try:
986
+ from mlx_lm.models.cache import save_prompt_cache
987
+ except Exception:
988
+ self._print_error("❌ MLX cache saving requires mlx-lm (install: `pip install \"abstractcore[mlx]\"`)")
989
+ return
990
+
991
+ meta: Dict[str, str] = {
992
+ "format": "abstractcore-cli-prompt-cache/v1",
993
+ "provider": str(self.provider_name),
994
+ "model": str(getattr(self.provider, "model", self.model_name)),
995
+ "saved_at": datetime.now().isoformat(),
996
+ }
997
+ try:
998
+ tok = getattr(self.provider, "_prompt_cache_backend_token_count")(cache_obj)
999
+ if isinstance(tok, int) and tok >= 0:
1000
+ meta["token_count"] = str(tok)
1001
+ except Exception:
1002
+ pass
1003
+
1004
+ cache_to_save = cache_obj
1005
+ if q8:
1006
+ try:
1007
+ cache_to_save = [layer.to_quantized(group_size=64, bits=8) for layer in cache_obj]
1008
+ meta["quantized"] = "q8"
1009
+ except Exception as e:
1010
+ self._print_warn(f"⚠️ q8 quantization failed; saving full-precision cache: {e}")
1011
+
1012
+ try:
1013
+ save_prompt_cache(filename, cache_to_save, metadata=meta)
1014
+ self.prompt_cache_file = filename
1015
+ extra = ""
1016
+ if "token_count" in meta:
1017
+ extra = f" ({meta['token_count']} tokens)"
1018
+ print(f"💾 Cache saved to {filename}{extra}")
1019
+ except Exception as e:
1020
+ self._print_error(f"❌ Failed to save prompt cache: {e}")
1021
+
1022
+ def handle_load_prompt_cache(self, filename: str) -> None:
1023
+ """Load MLX prompt cache from disk (reads a `.safetensors` file; model-locked)."""
1024
+ if not self._is_mlx_provider():
1025
+ self._print_error("❌ KV cache load is only supported for provider 'mlx'")
1026
+ return
1027
+ if not self._supports_prompt_cache():
1028
+ self._print_error("❌ This provider does not support prompt caching")
1029
+ return
1030
+ resolved = self._resolve_cache_path(filename)
1031
+ if not resolved:
1032
+ self._print_error(f"❌ File not found: {self._force_extension(filename, '.safetensors')}")
1033
+ return
1034
+
1035
+ try:
1036
+ from mlx_lm.models.cache import load_prompt_cache
1037
+ except Exception:
1038
+ self._print_error("❌ MLX cache loading requires mlx-lm (install: `pip install \"abstractcore[mlx]\"`)")
1039
+ return
1040
+
1041
+ try:
1042
+ loaded_cache, meta = load_prompt_cache(resolved, return_metadata=True)
1043
+ except Exception as e:
1044
+ self._print_error(f"❌ Failed to load prompt cache: {e}")
1045
+ return
1046
+
1047
+ required_model = None
1048
+ if isinstance(meta, dict):
1049
+ required_model = meta.get("model") or meta.get("model_id")
1050
+ current_model = str(getattr(self.provider, "model", self.model_name))
1051
+
1052
+ if isinstance(required_model, str) and required_model.strip() and required_model.strip() != current_model:
1053
+ self._print_error(
1054
+ "❌ Prompt cache model mismatch:\n"
1055
+ f" cache expects: {required_model}\n"
1056
+ f" current model: {current_model}\n"
1057
+ f" hint: run `/model mlx:{required_model}` then `/cache load {self._force_extension(filename, '.safetensors')}`"
1058
+ )
1059
+ return
1060
+ if not isinstance(required_model, str) or not required_model.strip():
1061
+ # Best-effort structural check: layer count mismatch is a strong signal of wrong model.
1062
+ try:
1063
+ expected = getattr(self.provider, "_prompt_cache_backend_create")()
1064
+ if isinstance(expected, (list, tuple)) and isinstance(loaded_cache, (list, tuple)):
1065
+ if len(expected) != len(loaded_cache):
1066
+ self._print_error(
1067
+ "❌ Prompt cache appears incompatible with the current model (layer count mismatch).\n"
1068
+ f" cache layers: {len(loaded_cache)}\n"
1069
+ f" model layers: {len(expected)}\n"
1070
+ f" hint: regenerate the cache with this model, or switch model and retry"
1071
+ )
1072
+ return
1073
+ except Exception:
1074
+ pass
1075
+ self._print_warn("⚠️ Cache metadata has no model id; cannot fully verify compatibility (proceeding best-effort)")
1076
+
1077
+ # Clear existing caches and install the loaded cache under a fresh key.
1078
+ try:
1079
+ getattr(self.provider, "prompt_cache_clear")(None)
1080
+ except Exception:
1081
+ pass
1082
+
1083
+ new_key = f"cli:{uuid.uuid4().hex[:12]}"
1084
+ try:
1085
+ getattr(self.provider, "prompt_cache_set")(new_key, make_default=True)
1086
+ except Exception:
1087
+ pass
1088
+
1089
+ try:
1090
+ getattr(self.provider, "_prompt_cache_store").set(
1091
+ new_key,
1092
+ loaded_cache,
1093
+ meta={"backend": "mlx", "loaded_from": resolved, **(meta if isinstance(meta, dict) else {})},
1094
+ )
1095
+ except Exception as e:
1096
+ self._print_error(f"❌ Failed to install loaded cache into provider store: {e}")
1097
+ return
1098
+
1099
+ self.prompt_cache_mode = "kv"
1100
+ self.prompt_cache_key = new_key
1101
+ self.prompt_cache_file = resolved
1102
+
1103
+ # Reset transcript; the cache becomes the source of truth for context.
1104
+ self.session.clear_history(keep_system=False)
1105
+ token_note = ""
1106
+ if isinstance(meta, dict) and isinstance(meta.get("token_count"), str) and meta.get("token_count"):
1107
+ token_note = f" ({meta.get('token_count')} tokens)"
1108
+ print(f"📂 Cache loaded from {resolved}{token_note} (key={new_key})")
1109
+
1110
+ cache_format = meta.get("format") if isinstance(meta, dict) else None
1111
+ force_refresh = cache_format != "abstractcore-cli-prompt-cache/v1"
1112
+ if force_refresh and not self.single_prompt_mode:
1113
+ self._print_warn(
1114
+ "⚠️ Loaded cache has no AbstractCore CLI metadata; it may not include tool specs.\n"
1115
+ " Injecting current CLI tool definitions into the KV cache for recency."
1116
+ )
1117
+ self._kv_refresh_tools_if_needed(reason="cache load", force=force_refresh)
1118
+
393
1119
  def handle_compact(self, focus: Optional[str] = None):
394
1120
  """Handle /compact [focus] command - compact chat history with optional focus"""
395
1121
  messages = self.session.get_messages()
@@ -419,10 +1145,17 @@ class SimpleCLI:
419
1145
  start_time = time.time()
420
1146
 
421
1147
  # Perform in-place compaction with optional focus
422
- self.session.force_compact(
423
- preserve_recent=4, # Keep last 6 messages (3 exchanges)
424
- focus=focus or "key information and ongoing context"
1148
+ compacted = self.session.compact(
1149
+ preserve_recent=4, # Keep last 4 messages (2 exchanges)
1150
+ focus=focus or "key information and ongoing context",
1151
+ compact_provider=compact_provider,
1152
+ reason="user_requested",
425
1153
  )
1154
+ # Replace current session with compacted version (in-place).
1155
+ try:
1156
+ self.session._replace_with_compacted(compacted)
1157
+ except Exception:
1158
+ self.session = compacted
426
1159
 
427
1160
  duration = time.time() - start_time
428
1161
 
@@ -439,10 +1172,10 @@ class SimpleCLI:
439
1172
  else:
440
1173
  print(f" {i+1}. ⚙️ System prompt")
441
1174
  elif msg.role == 'user':
442
- preview = msg.content[:50] + "..." if len(msg.content) > 50 else msg.content
1175
+ preview = preview_text(msg.content, max_chars=50)
443
1176
  print(f" {i+1}. 👤 {preview}")
444
1177
  elif msg.role == 'assistant':
445
- preview = msg.content[:50] + "..." if len(msg.content) > 50 else msg.content
1178
+ preview = preview_text(msg.content, max_chars=50)
446
1179
  print(f" {i+1}. 🤖 {preview}")
447
1180
 
448
1181
  print(" 💡 Note: Token count may increase initially due to detailed summary")
@@ -463,7 +1196,7 @@ class SimpleCLI:
463
1196
  print("🔍 Extracting facts from conversation history...")
464
1197
 
465
1198
  # Create fact extractor using current provider for consistency
466
- extractor = BasicExtractor(self.provider)
1199
+ extractor = BasicExtractor(self._analysis_provider())
467
1200
 
468
1201
  # Format conversation history as text
469
1202
  conversation_text = self._format_conversation_for_extraction(messages)
@@ -539,7 +1272,7 @@ class SimpleCLI:
539
1272
  print("⚖️ Evaluating conversation quality...")
540
1273
 
541
1274
  # Create judge using current provider for consistency
542
- judge = BasicJudge(self.provider)
1275
+ judge = BasicJudge(self._analysis_provider())
543
1276
 
544
1277
  # Format conversation history as text
545
1278
  conversation_text = self._format_conversation_for_extraction(messages)
@@ -653,7 +1386,7 @@ class SimpleCLI:
653
1386
  print("🎯 Analyzing conversation intents for all participants...")
654
1387
 
655
1388
  # Create intent analyzer using current provider for consistency
656
- analyzer = BasicIntentAnalyzer(self.provider)
1389
+ analyzer = BasicIntentAnalyzer(self._analysis_provider())
657
1390
 
658
1391
  # Convert session messages to the format expected by intent analyzer
659
1392
  conversation_messages = [msg for msg in messages if msg.role != 'system']
@@ -717,7 +1450,7 @@ class SimpleCLI:
717
1450
  # Truncate long response approaches for readability
718
1451
  response_approach = analysis.suggested_response_approach
719
1452
  if len(response_approach) > 200:
720
- response_approach = response_approach[:197] + "..."
1453
+ response_approach = preview_text(response_approach, max_chars=200)
721
1454
  print(f" {response_approach}")
722
1455
 
723
1456
  # Analysis metadata
@@ -861,18 +1594,26 @@ class SimpleCLI:
861
1594
  break
862
1595
  else:
863
1596
  # No existing system message, add one at the beginning
864
- self.session.messages.insert(0, self.session.add_message('system', new_prompt))
1597
+ created = self.session.add_message('system', new_prompt)
1598
+ # add_message appends; move the created system message to the front for correct ordering.
1599
+ try:
1600
+ self.session.messages.remove(created)
1601
+ except Exception:
1602
+ pass
1603
+ self.session.messages.insert(0, created)
865
1604
 
866
1605
  print("✅ System prompt updated!")
867
1606
  print(f"📝 Old: {old_prompt[:100]}{'...' if len(old_prompt) > 100 else ''}")
868
1607
  print(f"📝 New: {new_prompt[:100]}{'...' if len(new_prompt) > 100 else ''}")
869
1608
 
1609
+ if self.prompt_cache_mode == "kv":
1610
+ self._print_warn("⚠️ KV prompt cache invalidated by system prompt change; clearing cache and context")
1611
+ self.handle_clear()
1612
+
870
1613
  def handle_save(self, filename: str, summary: bool = False, assessment: bool = False, facts: bool = False):
871
1614
  """Handle /save <file> command - save current session to file with optional analytics"""
872
1615
  try:
873
- # Ensure .json extension for consistency
874
- if not filename.endswith('.json'):
875
- filename = f"{filename}.json"
1616
+ filename = self._force_extension(filename, ".json")
876
1617
 
877
1618
  print(f"💾 Saving session to {filename}...")
878
1619
 
@@ -882,11 +1623,12 @@ class SimpleCLI:
882
1623
 
883
1624
  # Generate optional analytics if requested
884
1625
  analytics_generated = []
1626
+ analysis_provider = self._analysis_provider()
885
1627
 
886
1628
  if summary:
887
1629
  print(" 🔄 Generating summary...")
888
1630
  try:
889
- self.session.generate_summary(focus="key discussion points")
1631
+ self.session.generate_summary(focus="key discussion points", compact_provider=analysis_provider)
890
1632
  analytics_generated.append("summary")
891
1633
  print(" ✅ Summary generated")
892
1634
  except Exception as e:
@@ -894,20 +1636,38 @@ class SimpleCLI:
894
1636
 
895
1637
  if assessment:
896
1638
  print(" 🔄 Generating assessment...")
1639
+ original_provider = None
897
1640
  try:
1641
+ original_provider = self.session.provider
1642
+ self.session.provider = analysis_provider
898
1643
  self.session.generate_assessment()
1644
+ self.session.provider = original_provider
899
1645
  analytics_generated.append("assessment")
900
1646
  print(" ✅ Assessment generated")
901
1647
  except Exception as e:
1648
+ try:
1649
+ if original_provider is not None:
1650
+ self.session.provider = original_provider
1651
+ except Exception:
1652
+ pass
902
1653
  print(f" ⚠️ Assessment generation failed: {e}")
903
1654
 
904
1655
  if facts:
905
1656
  print(" 🔄 Extracting facts...")
1657
+ original_provider = None
906
1658
  try:
1659
+ original_provider = self.session.provider
1660
+ self.session.provider = analysis_provider
907
1661
  self.session.extract_facts()
1662
+ self.session.provider = original_provider
908
1663
  analytics_generated.append("facts")
909
1664
  print(" ✅ Facts extracted")
910
1665
  except Exception as e:
1666
+ try:
1667
+ if original_provider is not None:
1668
+ self.session.provider = original_provider
1669
+ except Exception:
1670
+ pass
911
1671
  print(f" ⚠️ Fact extraction failed: {e}")
912
1672
 
913
1673
  # Save using enhanced serialization
@@ -935,17 +1695,15 @@ class SimpleCLI:
935
1695
  def handle_load(self, filename: str):
936
1696
  """Handle /load <file> command - load session from file"""
937
1697
  try:
938
- # Ensure .json extension for consistency
939
- if not filename.endswith('.json'):
940
- filename = f"{filename}.json"
1698
+ resolved = self._resolve_session_path(filename) or self._force_extension(filename, ".json")
941
1699
 
942
1700
  # Check if file exists
943
1701
  import os
944
- if not os.path.exists(filename):
945
- print(f"❌ File not found: {filename}")
1702
+ if not os.path.exists(resolved):
1703
+ print(f"❌ File not found: {resolved}")
946
1704
  return
947
1705
 
948
- print(f"📂 Loading session from {filename}...")
1706
+ print(f"📂 Loading session from {resolved}...")
949
1707
 
950
1708
  # Store current session info for comparison
951
1709
  old_messages = len(self.session.get_messages())
@@ -955,17 +1713,27 @@ class SimpleCLI:
955
1713
  from ..tools.common_tools import list_files, read_file, write_file, execute_command, search_files
956
1714
  tools = [list_files, read_file, write_file, execute_command, search_files]
957
1715
 
958
- loaded_session = BasicSession.load(filename, provider=self.provider, tools=tools)
1716
+ loaded_session = BasicSession.load(resolved, provider=self.provider, tools=tools)
959
1717
 
960
1718
  # Replace current session
961
1719
  self.session = loaded_session
962
-
1720
+
1721
+ # If we're in local KV cache mode (MLX), rebuild the cache from the loaded transcript so
1722
+ # the model context matches what the user sees.
1723
+ if self._is_mlx_provider() and self._supports_prompt_cache():
1724
+ try:
1725
+ self.prompt_cache_mode = "kv"
1726
+ self._rebuild_kv_cache_from_session()
1727
+ except Exception as e:
1728
+ self._print_warn(f"⚠️ KV cache rebuild from session failed; continuing without KV mode: {e}")
1729
+ self.prompt_cache_mode = "key"
1730
+
963
1731
  # Get new session info
964
1732
  new_messages = len(self.session.get_messages())
965
1733
  new_tokens = self.session.get_token_estimate()
966
1734
 
967
1735
  print(f"✅ Session loaded successfully!")
968
- print(f" 📁 File: {filename}")
1736
+ print(f" 📁 File: {resolved}")
969
1737
  print(f" 📝 Messages: {old_messages} → {new_messages}")
970
1738
  print(f" 🔢 Tokens: ~{old_tokens:,} → ~{new_tokens:,}")
971
1739
  print(f" 🤖 Provider: {self.provider_name}:{self.model_name} (current)")
@@ -994,6 +1762,69 @@ class SimpleCLI:
994
1762
  import traceback
995
1763
  traceback.print_exc()
996
1764
 
1765
+ def _rebuild_kv_cache_from_session(self) -> None:
1766
+ """Best-effort rebuild of the local KV prompt cache from the current session transcript."""
1767
+ if not self._is_mlx_provider():
1768
+ return
1769
+ if not self._supports_prompt_cache():
1770
+ return
1771
+
1772
+ # Fresh cache key for the rebuilt state.
1773
+ try:
1774
+ getattr(self.provider, "prompt_cache_clear")(None)
1775
+ except Exception:
1776
+ pass
1777
+
1778
+ key = f"cli:{uuid.uuid4().hex[:12]}"
1779
+ ok = False
1780
+ try:
1781
+ ok = bool(getattr(self.provider, "prompt_cache_set")(key, make_default=True))
1782
+ except Exception:
1783
+ ok = False
1784
+
1785
+ if not ok:
1786
+ self.prompt_cache_mode = "off"
1787
+ self.prompt_cache_key = None
1788
+ raise RuntimeError("provider failed to create a prompt cache")
1789
+
1790
+ # Prefill stable modules.
1791
+ try:
1792
+ getattr(self.provider, "prompt_cache_update")(
1793
+ key,
1794
+ system_prompt=self.session.system_prompt,
1795
+ tools=self.session.tools,
1796
+ add_generation_prompt=False,
1797
+ )
1798
+ except Exception as e:
1799
+ raise RuntimeError(f"failed to prefill system/tools: {e}") from e
1800
+
1801
+ # Append any additional transcript messages (excluding the main system prompt we just prefixed).
1802
+ messages_to_append: List[Dict[str, Any]] = []
1803
+ for msg in self.session.get_messages():
1804
+ role = getattr(msg, "role", None)
1805
+ content = getattr(msg, "content", None)
1806
+ if role == "system":
1807
+ if isinstance(self.session.system_prompt, str) and content == self.session.system_prompt and not str(content).startswith("[CONVERSATION HISTORY]"):
1808
+ continue
1809
+ if role and content is not None:
1810
+ messages_to_append.append({"role": role, "content": content})
1811
+
1812
+ if messages_to_append:
1813
+ try:
1814
+ getattr(self.provider, "prompt_cache_update")(
1815
+ key,
1816
+ messages=messages_to_append,
1817
+ add_generation_prompt=False,
1818
+ )
1819
+ except Exception as e:
1820
+ raise RuntimeError(f"failed to append transcript messages: {e}") from e
1821
+
1822
+ self.prompt_cache_key = key
1823
+ self.prompt_cache_file = None
1824
+ self.prompt_cache_mode = "kv"
1825
+ print(f"🧠 KV prompt cache rebuilt from session (key={key}, messages={len(messages_to_append)})")
1826
+ self._kv_refresh_tools_if_needed(reason="session rebuild", force=False)
1827
+
997
1828
  def handle_tooltag_test(self, opening_tag: str, closing_tag: str):
998
1829
  """Handle /tooltag command - demonstrate tool call format handling"""
999
1830
  print(f"🏷️ Tool call format testing: {opening_tag}...{closing_tag}")
@@ -1010,6 +1841,29 @@ class SimpleCLI:
1010
1841
  print(f"🔧 Provider: {self.provider_name}")
1011
1842
  print(f"🤖 Model: {self.model_name}")
1012
1843
  print(f"🌊 Streaming: {'Enabled' if self.stream_mode else 'Disabled'}")
1844
+ thinking_label = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
1845
+ print(f"🧠 Thinking: {thinking_label}")
1846
+ show_reasoning_label = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
1847
+ print(f"🧠 Show reasoning: {show_reasoning_label}")
1848
+ if self.prompt_cache_mode != "off":
1849
+ cache_details = f"mode={self.prompt_cache_mode}"
1850
+ if self.prompt_cache_key:
1851
+ cache_details += f" key={self.prompt_cache_key}"
1852
+ if self.prompt_cache_file:
1853
+ cache_details += f" file={self.prompt_cache_file}"
1854
+ print(f"🧠 Prompt caching: {cache_details}")
1855
+ try:
1856
+ if hasattr(self.provider, "get_prompt_cache_stats"):
1857
+ stats = self.provider.get_prompt_cache_stats()
1858
+ if isinstance(stats, dict):
1859
+ entries = stats.get("entries")
1860
+ max_entries = stats.get("max_entries")
1861
+ if entries is not None and max_entries is not None:
1862
+ print(f" Cache store: {entries}/{max_entries} entries")
1863
+ except Exception:
1864
+ pass
1865
+ else:
1866
+ print("🧠 Prompt caching: off")
1013
1867
 
1014
1868
  # Debug status - show both CLI and system logging
1015
1869
  print(f"🐛 CLI Debug: {'Enabled' if self.debug_mode else 'Disabled'}")
@@ -1035,7 +1889,8 @@ class SimpleCLI:
1035
1889
 
1036
1890
  # Token usage
1037
1891
  current_tokens = self.session.get_token_estimate()
1038
- print(f"💾 Token Usage: {current_tokens:,} / {self.max_tokens:,} tokens ({(current_tokens/self.max_tokens*100):.1f}%)")
1892
+ print(f"💾 Context Usage: {current_tokens:,} / {self.max_tokens:,} tokens ({(current_tokens/self.max_tokens*100):.1f}%)")
1893
+ print(f"✍️ Max Output Tokens: {self.max_output_tokens:,}")
1039
1894
 
1040
1895
  # Model capabilities
1041
1896
  try:
@@ -1050,6 +1905,11 @@ class SimpleCLI:
1050
1905
  print(f" Vision Support: {'Yes' if capabilities.get('vision_support', False) else 'No'}")
1051
1906
  print(f" Audio Support: {'Yes' if capabilities.get('audio_support', False) else 'No'}")
1052
1907
  print(f" Thinking Support: {'Yes' if capabilities.get('thinking_support', False) else 'No'}")
1908
+ reasoning_levels = capabilities.get("reasoning_levels")
1909
+ if isinstance(reasoning_levels, list) and reasoning_levels:
1910
+ levels_str = ", ".join([str(x) for x in reasoning_levels if isinstance(x, str) and x.strip()])
1911
+ if levels_str:
1912
+ print(f" Reasoning Levels: {levels_str}")
1053
1913
 
1054
1914
  # Show aliases if any
1055
1915
  aliases = capabilities.get('aliases', [])
@@ -1129,23 +1989,37 @@ class SimpleCLI:
  if not clean_input and media_files:
  clean_input = "Please analyze the attached file(s)."

+ clean_input = self._timestamp_user_message(clean_input)
+
  if self.debug_mode:
  print(f"🔍 Sending to {self.provider_name}:{self.model_name}")
  if media_files:
  print(f"🔍 Media files: {media_files}")

- # Generate response with media support
- response = self.session.generate(
- clean_input,
- stream=self.stream_mode,
- media=media_files if media_files else None
- )
+ if self.prompt_cache_mode == "kv":
+ response = self._generate_response_kv(
+ clean_input,
+ media=media_files if media_files else None,
+ )
+ else:
+ # Generate response with media support (session-managed history)
+ gen_kwargs: Dict[str, Any] = {
+ "stream": self.stream_mode,
+ "media": media_files if media_files else None,
+ "max_output_tokens": self.max_output_tokens,
+ }
+ if self.thinking is not None:
+ gen_kwargs["thinking"] = self.thinking
+ response = self.session.generate(clean_input, **gen_kwargs)

  if self.stream_mode:
- if not self.single_prompt_mode:
+ show_reasoning = self._should_show_reasoning() and not self.single_prompt_mode
+ buffer_for_reasoning_first = self._should_buffer_stream_for_reasoning_first()
+ if not self.single_prompt_mode and not buffer_for_reasoning_first:
  print("🤖 Assistant: ", end="", flush=True)
  full_content = ""
  display_buffer = "" # Buffer for cleaned display content
+ reasoning_parts: List[str] = []

  for chunk in response:
  if hasattr(chunk, 'content') and chunk.content:
@@ -1170,17 +2044,34 @@ class SimpleCLI:
  '```tool_code'
  ])

- if not has_tool_marker:
- print(chunk_text, end="", flush=True)
+ # If we want reasoning-first display, buffer output (no live streaming).
+ if buffer_for_reasoning_first:
  display_buffer += chunk_text
  else:
- # Buffer the chunk, we'll process after streaming
- display_buffer += chunk_text
+ if not has_tool_marker:
+ print(chunk_text, end="", flush=True)
+ display_buffer += chunk_text
+ else:
+ # Buffer the chunk, we'll process after streaming
+ display_buffer += chunk_text
+
+ # Best-effort: capture streamed reasoning metadata (OpenAI-compatible deltas, etc.).
+ if hasattr(chunk, "metadata") and isinstance(getattr(chunk, "metadata"), dict):
+ r = chunk.metadata.get("reasoning")
+ if isinstance(r, str) and r.strip():
+ reasoning_parts.append(r.strip())

- print() # New line after streaming
+ if not buffer_for_reasoning_first:
+ print() # New line after streaming

  # Parse and execute tool calls from full content
  clean_content, tool_calls = self._parse_and_strip_tool_calls(full_content)
+ if self.prompt_cache_mode == "kv":
+ # Maintain transcript for UX; model context lives in KV cache.
+ try:
+ self.session.add_message("assistant", clean_content.strip() or full_content)
+ except Exception:
+ pass

  # If we buffered tool call content, we should have shown clean content
  # For now, if there's significant difference, show the clean version
@@ -1189,12 +2080,38 @@ class SimpleCLI:
  # This happens when tool calls appear mid-stream
  if self.debug_mode:
  print(f"\n🔍 Cleaned content differs from streamed content")
-
+
+ combined = "\n\n".join(reasoning_parts).strip() if reasoning_parts else ""
+ if show_reasoning and combined:
+ self._print_reasoning_block(combined)
+
+ # Reasoning-first UX: show the final answer after reasoning (buffered).
+ if buffer_for_reasoning_first:
+ if clean_content.strip():
+ print(f"🤖 Assistant: {clean_content}")
+ elif tool_calls and not self.single_prompt_mode:
+ print("🤖 Assistant: ", end="")
+ elif self.single_prompt_mode:
+ print(clean_content or full_content)
+ else:
+ print(f"🤖 Assistant: {clean_content or full_content}")
+
  self._execute_tool_calls(tool_calls)
  else:
  # Non-streaming: parse content, display clean version, execute tools
  clean_content, tool_calls = self._parse_and_strip_tool_calls(response.content)
+ if self.prompt_cache_mode == "kv":
+ try:
+ self.session.add_message("assistant", clean_content.strip() or response.content)
+ except Exception:
+ pass

+ meta = getattr(response, "metadata", None)
+ if self._should_show_reasoning() and not self.single_prompt_mode and isinstance(meta, dict):
+ r = meta.get("reasoning")
+ if isinstance(r, str) and r.strip():
+ self._print_reasoning_block(r.strip())
+
  # Display only the clean content (without tool call syntax)
  if clean_content.strip():
  if self.single_prompt_mode:
@@ -1204,14 +2121,14 @@ class SimpleCLI:
  elif tool_calls:
  # Only tool calls, no text response
  if not self.single_prompt_mode:
- print(f"🤖 Assistant: ", end="")
+ print("🤖 Assistant: ", end="")
  else:
  # Empty response
  if self.single_prompt_mode:
  print(response.content)
  else:
  print(f"🤖 Assistant: {response.content}")
-
+
  # Execute tool calls
  self._execute_tool_calls(tool_calls)

@@ -1227,6 +2144,96 @@ class SimpleCLI:
  import traceback
  traceback.print_exc()

+ def _should_show_reasoning(self) -> bool:
+ """Decide whether to display reasoning in the CLI output."""
+ if self.show_reasoning is not None:
+ return bool(self.show_reasoning)
+ # Auto: show when present unless explicitly disabled.
+ if self.thinking is False:
+ return False
+ return True
+
+ def _should_buffer_stream_for_reasoning_first(self) -> bool:
+ """Decide whether to buffer streaming output to show reasoning before the answer."""
+ if self.single_prompt_mode:
+ return False
+ if not self._should_show_reasoning():
+ return False
+
+ # If the user explicitly enabled reasoning display or requested thinking, honor reasoning-first UX.
+ if self.show_reasoning is True:
+ return True
+ if self.thinking is not None and self.thinking is not False:
+ return True
+
+ # Auto mode: only buffer when the model is expected to emit a separate reasoning channel.
+ try:
+ from ..architectures.detection import detect_architecture, get_architecture_format, get_model_capabilities
+
+ caps = get_model_capabilities(self.model_name)
+ arch = detect_architecture(self.model_name)
+ arch_fmt = get_architecture_format(arch)
+ except Exception:
+ caps = {}
+ arch_fmt = {}
+
+ resp_fmt = str((caps or {}).get("response_format") or "").strip().lower()
+ if resp_fmt == "harmony":
+ return True
+
+ for src in (caps, arch_fmt):
+ if isinstance(src, dict):
+ f = src.get("thinking_output_field")
+ if isinstance(f, str) and f.strip():
+ return True
+
+ return False
+
+ def _print_reasoning_block(self, reasoning: str) -> None:
+ """Print reasoning in a visually distinct style (best-effort)."""
+ import sys
+
+ text = reasoning.strip()
+ if not text:
+ return
+
+ print("🧠 Reasoning:")
+ if sys.stdout.isatty():
+ # Grey + italic (best-effort; not all terminals support italics).
+ print(f"\x1b[90m\x1b[3m{text}\x1b[0m")
+ else:
+ print(text)
+
+ def _generate_response_kv(self, prompt: str, *, media: Optional[list] = None):
+ """Generate response using append-only KV cache mode (local providers only)."""
+ # Maintain a local transcript for UX, but do not send it to the model; the KV cache is source-of-truth.
+ try:
+ self.session.add_message("user", prompt)
+ except Exception:
+ pass
+
+ gen_kwargs: Dict[str, Any] = {
+ "prompt": prompt,
+ "messages": None,
+ "system_prompt": None,
+ "tools": None, # tools were prefixed into the cache during prefill
+ "media": media,
+ "stream": bool(self.stream_mode),
+ "max_output_tokens": self.max_output_tokens,
+ }
+ if self.thinking is not None:
+ gen_kwargs["thinking"] = self.thinking
+ # Preserve session-level generation parameters for consistency.
+ try:
+ if getattr(self.session, "temperature", None) is not None:
+ gen_kwargs["temperature"] = self.session.temperature
+ if isinstance(getattr(self.session, "seed", None), int) and self.session.seed >= 0:
+ gen_kwargs["seed"] = self.session.seed
+ except Exception:
+ pass
+
+ return self.provider.generate(**gen_kwargs)
+
  def _parse_and_strip_tool_calls(self, content: str):
  """
  Parse tool calls from content and return (clean_content, tool_calls).
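Note: the streaming hunks above accumulate answer text from chunk.content and reasoning text from a chunk.metadata["reasoning"] string, then print the reasoning before the answer when buffering is enabled. A simplified, self-contained sketch of that separation; the Chunk dataclass below is a stand-in for illustration, not the package's actual chunk type:

    from dataclasses import dataclass, field
    from typing import Any, Dict, List, Tuple

    @dataclass
    class Chunk:  # stand-in for a streamed chunk (assumption, not the real type)
        content: str = ""
        metadata: Dict[str, Any] = field(default_factory=dict)

    def split_stream(chunks: List[Chunk]) -> Tuple[str, str]:
        answer_parts: List[str] = []
        reasoning_parts: List[str] = []
        for chunk in chunks:
            if chunk.content:
                answer_parts.append(chunk.content)
            r = chunk.metadata.get("reasoning")
            if isinstance(r, str) and r.strip():
                reasoning_parts.append(r.strip())
        return "".join(answer_parts), "\n\n".join(reasoning_parts)

    answer, reasoning = split_stream([
        Chunk("Hello", {"reasoning": "User greeted; reply briefly."}),
        Chunk(", world."),
    ])
    print(reasoning)  # shown before the answer in the reasoning-first flow
    print(answer)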
@@ -1337,7 +2344,7 @@ class SimpleCLI:
  if not self.single_prompt_mode:
  args_str = str(tool_args) if tool_args else "{}"
  if len(args_str) > 100:
- args_str = args_str[:97] + "..."
+ args_str = preview_text(args_str, max_chars=100)
  print(f"**{tool_name}({args_str})**")

  # Execute the tool
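Note: preview_text() is imported from elsewhere in the package and its implementation is not part of this diff. Assuming it is an ellipsis-truncating helper, a sketch with the same call shape would reproduce the old inline behavior ([:97] + "...") that this hunk replaces:

    # Assumed behavior only; the real preview_text() may differ (e.g., word-aware truncation).
    def preview_text(text: str, max_chars: int = 100) -> str:
        return text if len(text) <= max_chars else text[: max_chars - 3] + "..."

    print(preview_text("x" * 150, max_chars=100))  # 97 x's followed by "..."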
@@ -1435,14 +2442,18 @@ def main():
  epilog="""
  Examples:
  python -m abstractcore.utils.cli --provider ollama --model qwen3-coder:30b
- python -m abstractcore.utils.cli --provider openai --model gpt-4o-mini --stream
- python -m abstractcore.utils.cli --provider anthropic --model claude-3-5-haiku-20241022
+ python -m abstractcore.utils.cli --provider openai --model gpt-5-mini --stream
+ python -m abstractcore.utils.cli --provider anthropic --model claude-haiku-4-5
+ python -m abstractcore.utils.cli --provider lmstudio --model qwen/qwen3-4b-2507 --base-url http://localhost:1234/v1
+ python -m abstractcore.utils.cli --provider openrouter --model openai/gpt-4o-mini
  python -m abstractcore.utils.cli --prompt "What is Python?" # Uses configured defaults

  Key Commands:
  /help Show comprehensive command guide
- /save <file> [--summary --assessment --facts] Save session with analytics
- /load <file> Load saved session
+ /session save <name> [--summary --assessment --facts] Save session JSON (writes .json)
+ /session load <name> Load saved session JSON (reads .json)
+ /cache save <name> Save MLX prompt/KV cache (writes .safetensors)
+ /cache load <name> Load MLX prompt/KV cache (reads .safetensors)
  /status Show system status and capabilities
  /history [n] Show conversation history
  /model <provider:model> Switch LLM provider/model
@@ -1471,18 +2482,19 @@ build custom solutions using the AbstractCore framework directly.

  # Optional arguments (no longer required - will use configured defaults)
  parser.add_argument('--provider',
- choices=['openai', 'anthropic', 'ollama', 'huggingface', 'mlx', 'lmstudio'],
+ choices=['openai', 'anthropic', 'openrouter', 'openai-compatible', 'vllm', 'ollama', 'huggingface', 'mlx', 'lmstudio'],
  help='LLM provider to use (optional - uses configured default)')
  parser.add_argument('--model', help='Model name to use (optional - uses configured default)')

  # Optional arguments
  parser.add_argument('--stream', action='store_true', help='Enable streaming mode')
  parser.add_argument('--debug', action='store_true', help='Enable debug mode')
- parser.add_argument('--max-tokens', type=int, default=None, help='Maximum tokens (default: auto-detect from model capabilities)')
+ parser.add_argument('--max-tokens', type=int, default=None, help='Maximum total context tokens (default: auto-detect from model capabilities)')
+ parser.add_argument('--max-output-tokens', type=int, default=None, help='Maximum output tokens per response (default: provider/model default)')
  parser.add_argument('--prompt', help='Execute single prompt and exit')

  # Provider-specific
- parser.add_argument('--base-url', help='Base URL (ollama, lmstudio)')
+ parser.add_argument('--base-url', help='Base URL override (OpenAI-compatible /v1 servers, proxies, Ollama)')
  parser.add_argument('--api-key', help='API key')
  parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)')

@@ -1554,6 +2566,7 @@ build custom solutions using the AbstractCore framework directly.
  model=model,
  stream=stream_mode,
  max_tokens=args.max_tokens,
+ max_output_tokens=args.max_output_tokens,
  debug=args.debug,
  show_banner=not args.prompt, # Hide banner in single-prompt mode
  **kwargs
@@ -1567,4 +2580,4 @@ build custom solutions using the AbstractCore framework directly.


  if __name__ == "__main__":
- main()
+ main()