abstractcore 2.9.1__py3-none-any.whl → 2.11.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +7 -27
- abstractcore/apps/deepsearch.py +9 -4
- abstractcore/apps/extractor.py +33 -100
- abstractcore/apps/intent.py +19 -0
- abstractcore/apps/judge.py +20 -1
- abstractcore/apps/summarizer.py +20 -1
- abstractcore/architectures/detection.py +34 -1
- abstractcore/architectures/response_postprocessing.py +313 -0
- abstractcore/assets/architecture_formats.json +38 -8
- abstractcore/assets/model_capabilities.json +882 -160
- abstractcore/compression/__init__.py +1 -2
- abstractcore/compression/glyph_processor.py +6 -4
- abstractcore/config/main.py +52 -20
- abstractcore/config/manager.py +390 -12
- abstractcore/config/vision_config.py +5 -5
- abstractcore/core/interface.py +151 -3
- abstractcore/core/session.py +16 -10
- abstractcore/download.py +1 -1
- abstractcore/embeddings/manager.py +20 -6
- abstractcore/endpoint/__init__.py +2 -0
- abstractcore/endpoint/app.py +458 -0
- abstractcore/mcp/client.py +3 -1
- abstractcore/media/__init__.py +52 -17
- abstractcore/media/auto_handler.py +42 -22
- abstractcore/media/base.py +44 -1
- abstractcore/media/capabilities.py +12 -33
- abstractcore/media/enrichment.py +105 -0
- abstractcore/media/handlers/anthropic_handler.py +19 -28
- abstractcore/media/handlers/local_handler.py +124 -70
- abstractcore/media/handlers/openai_handler.py +19 -31
- abstractcore/media/processors/__init__.py +4 -2
- abstractcore/media/processors/audio_processor.py +57 -0
- abstractcore/media/processors/office_processor.py +8 -3
- abstractcore/media/processors/pdf_processor.py +46 -3
- abstractcore/media/processors/text_processor.py +22 -24
- abstractcore/media/processors/video_processor.py +58 -0
- abstractcore/media/types.py +97 -4
- abstractcore/media/utils/image_scaler.py +20 -2
- abstractcore/media/utils/video_frames.py +219 -0
- abstractcore/media/vision_fallback.py +136 -22
- abstractcore/processing/__init__.py +32 -3
- abstractcore/processing/basic_deepsearch.py +15 -10
- abstractcore/processing/basic_intent.py +3 -2
- abstractcore/processing/basic_judge.py +3 -2
- abstractcore/processing/basic_summarizer.py +1 -1
- abstractcore/providers/__init__.py +3 -1
- abstractcore/providers/anthropic_provider.py +95 -8
- abstractcore/providers/base.py +1516 -81
- abstractcore/providers/huggingface_provider.py +546 -69
- abstractcore/providers/lmstudio_provider.py +30 -916
- abstractcore/providers/mlx_provider.py +382 -35
- abstractcore/providers/model_capabilities.py +5 -1
- abstractcore/providers/ollama_provider.py +99 -15
- abstractcore/providers/openai_compatible_provider.py +406 -180
- abstractcore/providers/openai_provider.py +188 -44
- abstractcore/providers/openrouter_provider.py +76 -0
- abstractcore/providers/registry.py +61 -5
- abstractcore/providers/streaming.py +138 -33
- abstractcore/providers/vllm_provider.py +92 -817
- abstractcore/server/app.py +478 -28
- abstractcore/server/audio_endpoints.py +139 -0
- abstractcore/server/vision_endpoints.py +1319 -0
- abstractcore/structured/handler.py +316 -41
- abstractcore/tools/common_tools.py +5501 -2012
- abstractcore/tools/comms_tools.py +1641 -0
- abstractcore/tools/core.py +37 -7
- abstractcore/tools/handler.py +4 -9
- abstractcore/tools/parser.py +49 -2
- abstractcore/tools/tag_rewriter.py +2 -1
- abstractcore/tools/telegram_tdlib.py +407 -0
- abstractcore/tools/telegram_tools.py +261 -0
- abstractcore/utils/cli.py +1085 -72
- abstractcore/utils/structured_logging.py +29 -8
- abstractcore/utils/token_utils.py +2 -0
- abstractcore/utils/truncation.py +29 -0
- abstractcore/utils/version.py +3 -4
- abstractcore/utils/vlm_token_calculator.py +12 -2
- abstractcore-2.11.4.dist-info/METADATA +562 -0
- abstractcore-2.11.4.dist-info/RECORD +133 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/WHEEL +1 -1
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/entry_points.txt +1 -0
- abstractcore-2.9.1.dist-info/METADATA +0 -1190
- abstractcore-2.9.1.dist-info/RECORD +0 -119
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/top_level.txt +0 -0
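The file-level summary above can be reproduced locally without any registry tooling, since wheels are plain zip archives. A minimal sketch, assuming both wheel files have already been downloaded into the working directory; the filenames below are illustrative, not taken from any registry API:

import zipfile

OLD = "abstractcore-2.9.1-py3-none-any.whl"   # assumed local filename
NEW = "abstractcore-2.11.4-py3-none-any.whl"  # assumed local filename

def entries(path):
    # Map archive member name -> uncompressed size; wheels are ordinary zip files.
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.file_size for info in zf.infolist()}

old, new = entries(OLD), entries(NEW)
for name in sorted(old.keys() | new.keys()):
    if name not in old:
        print(f"added    {name}")
    elif name not in new:
        print(f"removed  {name}")
    elif old[name] != new[name]:
        print(f"changed  {name} ({old[name]} -> {new[name]} bytes)")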
abstractcore/utils/cli.py
CHANGED
|
@@ -16,14 +16,23 @@ AbstractCore framework directly.
|
|
|
16
16
|
|
|
17
17
|
Usage:
|
|
18
18
|
python -m abstractcore.utils.cli --provider ollama --model qwen3-coder:30b
|
|
19
|
-
python -m abstractcore.utils.cli --provider openai --model gpt-
|
|
20
|
-
python -m abstractcore.utils.cli --provider anthropic --model claude-
|
|
19
|
+
python -m abstractcore.utils.cli --provider openai --model gpt-5-mini --stream
|
|
20
|
+
python -m abstractcore.utils.cli --provider anthropic --model claude-haiku-4-5 --prompt "What is Python?"
|
|
21
|
+
python -m abstractcore.utils.cli --provider lmstudio --model qwen/qwen3-4b-2507 --base-url http://localhost:1234/v1
|
|
22
|
+
python -m abstractcore.utils.cli --provider openrouter --model openai/gpt-4o-mini
|
|
21
23
|
"""
|
|
22
24
|
|
|
23
25
|
import argparse
|
|
26
|
+
import os
|
|
24
27
|
import sys
|
|
25
28
|
import time
|
|
26
|
-
|
|
29
|
+
import uuid
|
|
30
|
+
import locale
|
|
31
|
+
from datetime import datetime
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Optional, Any, Dict, Iterator, List, Union
|
|
34
|
+
|
|
35
|
+
from .truncation import preview_text
|
|
27
36
|
|
|
28
37
|
# Enable command history and arrow key navigation
|
|
29
38
|
try:
|
|
@@ -42,11 +51,30 @@ from ..tools.common_tools import list_files, read_file, write_file, execute_comm
|
|
|
42
51
|
from ..processing import BasicExtractor, BasicJudge, BasicIntentAnalyzer
|
|
43
52
|
|
|
44
53
|
|
|
54
|
+
class _NoPromptCacheProvider:
|
|
55
|
+
"""Proxy that forces `prompt_cache_key=None` for every call (to avoid polluting KV caches)."""
|
|
56
|
+
|
|
57
|
+
def __init__(self, provider: Any):
|
|
58
|
+
self._provider = provider
|
|
59
|
+
|
|
60
|
+
def generate(self, *args: Any, **kwargs: Any):
|
|
61
|
+
kwargs["prompt_cache_key"] = None
|
|
62
|
+
return self._provider.generate(*args, **kwargs)
|
|
63
|
+
|
|
64
|
+
async def agenerate(self, *args: Any, **kwargs: Any):
|
|
65
|
+
kwargs["prompt_cache_key"] = None
|
|
66
|
+
return await self._provider.agenerate(*args, **kwargs)
|
|
67
|
+
|
|
68
|
+
def __getattr__(self, name: str) -> Any:
|
|
69
|
+
return getattr(self._provider, name)
|
|
70
|
+
|
|
71
|
+
|
|
45
72
|
class SimpleCLI:
|
|
46
73
|
"""Simplified CLI REPL for AbstractCore"""
|
|
47
74
|
|
|
48
75
|
def __init__(self, provider: str, model: str, stream: bool = False,
|
|
49
|
-
max_tokens: int = None,
|
|
76
|
+
max_tokens: int = None, max_output_tokens: int = None,
|
|
77
|
+
debug: bool = False, show_banner: bool = True, **kwargs):
|
|
50
78
|
self.provider_name = provider
|
|
51
79
|
self.model_name = model
|
|
52
80
|
self.stream_mode = stream
|
|
@@ -55,6 +83,7 @@ class SimpleCLI:
|
|
|
55
83
|
self.kwargs = kwargs
|
|
56
84
|
|
|
57
85
|
# Auto-detect max_tokens from model capabilities if not specified
|
|
86
|
+
self.max_tokens_auto = max_tokens is None
|
|
58
87
|
if max_tokens is None:
|
|
59
88
|
try:
|
|
60
89
|
from ..architectures.detection import get_model_capabilities
|
|
@@ -68,18 +97,41 @@ class SimpleCLI:
|
|
|
68
97
|
print(f"⚠️ Failed to auto-detect max_tokens, using fallback: {max_tokens} ({e})")
|
|
69
98
|
|
|
70
99
|
self.max_tokens = max_tokens
|
|
100
|
+
self.max_output_tokens_auto = max_output_tokens is None
|
|
101
|
+
# Unified thinking/reasoning control (best-effort, provider/model dependent).
|
|
102
|
+
# - None: auto (provider/model default)
|
|
103
|
+
# - bool: on/off
|
|
104
|
+
# - str: "low"|"medium"|"high" when supported
|
|
105
|
+
self.thinking: Optional[Union[bool, str]] = None
|
|
106
|
+
# Whether to display model-supplied reasoning/thinking separately.
|
|
107
|
+
# - None: auto (show when thinking != off)
|
|
108
|
+
# - bool: force on/off
|
|
109
|
+
self.show_reasoning: Optional[bool] = None
|
|
71
110
|
|
|
72
111
|
# Initialize command history with persistent storage
|
|
73
112
|
self._setup_command_history()
|
|
74
113
|
|
|
75
114
|
# Initialize provider and session with tools
|
|
76
|
-
|
|
115
|
+
provider_kwargs = dict(kwargs)
|
|
116
|
+
provider_kwargs["max_tokens"] = max_tokens
|
|
117
|
+
if max_output_tokens is not None:
|
|
118
|
+
provider_kwargs["max_output_tokens"] = max_output_tokens
|
|
119
|
+
self.provider = create_llm(provider, model=model, **provider_kwargs)
|
|
120
|
+
# Store the effective max_output_tokens (provider may auto-select based on model capabilities).
|
|
121
|
+
self.max_output_tokens = getattr(self.provider, "max_output_tokens", max_output_tokens or 2048)
|
|
77
122
|
self.session = BasicSession(
|
|
78
123
|
self.provider,
|
|
79
124
|
system_prompt="You are a helpful AI assistant with vision capabilities. When users provide images or media files, analyze and describe them directly. You also have access to file operation tools.",
|
|
80
125
|
tools=[list_files, read_file, write_file, execute_command, search_files]
|
|
81
126
|
)
|
|
82
127
|
|
|
128
|
+
# Prompt caching (best-effort; provider-dependent).
|
|
129
|
+
self.country_code = self._get_country_code()
|
|
130
|
+
self.prompt_cache_mode = "off" # off | key | kv
|
|
131
|
+
self.prompt_cache_key: Optional[str] = None
|
|
132
|
+
self.prompt_cache_file: Optional[str] = None
|
|
133
|
+
self._init_prompt_caching(show_banner=show_banner)
|
|
134
|
+
|
|
83
135
|
# Only show banner in interactive mode
|
|
84
136
|
if show_banner:
|
|
85
137
|
print("=" * 70)
|
|
@@ -89,7 +141,7 @@ class SimpleCLI:
|
|
|
89
141
|
print(f"📝 Model: {model}")
|
|
90
142
|
print(f"🌊 Streaming: {'ON' if stream else 'OFF'} | 🐛 Debug: {'ON' if debug else 'OFF'}")
|
|
91
143
|
print()
|
|
92
|
-
print("💬 Quick Commands: /help /
|
|
144
|
+
print("💬 Quick Commands: /help /session /cache /status /history /quit")
|
|
93
145
|
print("🛠️ Available Tools: list_files, search_files, read_file, write_file, execute_command")
|
|
94
146
|
print()
|
|
95
147
|
print("💡 Type '/help' for comprehensive command guide")
|
|
@@ -158,7 +210,8 @@ class SimpleCLI:
|
|
|
158
210
|
print("─" * 50)
|
|
159
211
|
print(" /help Show this comprehensive help")
|
|
160
212
|
print(" /quit Exit the CLI")
|
|
161
|
-
print(" /clear Clear
|
|
213
|
+
print(" /clear Clear prompt cache + context (like mlx-chat)")
|
|
214
|
+
print(" /cls Clear the screen (like unix terminal)")
|
|
162
215
|
print(" /reset Reset conversation history")
|
|
163
216
|
print(" /status Show system status and capabilities")
|
|
164
217
|
|
|
@@ -175,17 +228,25 @@ class SimpleCLI:
|
|
|
175
228
|
print(" • /system - Show current prompt")
|
|
176
229
|
print(" • /system <text> - Set new prompt")
|
|
177
230
|
|
|
178
|
-
print("\n💾 SESSION
|
|
231
|
+
print("\n💾 SESSION & CACHE")
|
|
179
232
|
print("─" * 50)
|
|
180
|
-
print(" /save <
|
|
181
|
-
print(" • /save chat
|
|
182
|
-
print(" • /save analyzed --summary --assessment --facts")
|
|
233
|
+
print(" /session save <name> [options] Save session to <name>.json with optional analytics")
|
|
234
|
+
print(" • /session save chat")
|
|
235
|
+
print(" • /session save analyzed --summary --assessment --facts")
|
|
183
236
|
print(" Options:")
|
|
184
237
|
print(" --summary Generate conversation summary")
|
|
185
238
|
print(" --assessment Evaluate conversation quality")
|
|
186
239
|
print(" --facts Extract knowledge as facts")
|
|
187
|
-
print(" /load <
|
|
188
|
-
print(" • /load chat
|
|
240
|
+
print(" /session load <name> Load session from <name>.json (replaces current)")
|
|
241
|
+
print(" • /session load chat")
|
|
242
|
+
print(" /session clear Clear session + cache (same as /clear)")
|
|
243
|
+
print(" /save /load Aliases for /session save|load (sessions only)")
|
|
244
|
+
print(" /cache save <name> Save prompt/KV cache to <name>.safetensors (MLX only, model-locked)")
|
|
245
|
+
print(" • /cache save chat_cache")
|
|
246
|
+
print(" --q8 Quantize cache before saving (smaller, lossy)")
|
|
247
|
+
print(" /cache load <name> Load prompt/KV cache from <name>.safetensors (MLX only, model-locked)")
|
|
248
|
+
print(" • /cache load chat_cache")
|
|
249
|
+
print(" /cache clear Clear prompt cache only (KV mode rebuilds from transcript)")
|
|
189
250
|
|
|
190
251
|
print("\n📊 ANALYTICS & INSIGHTS")
|
|
191
252
|
print("─" * 50)
|
|
@@ -201,8 +262,15 @@ class SimpleCLI:
|
|
|
201
262
|
print("\n⚙️ CONFIGURATION")
|
|
202
263
|
print("─" * 50)
|
|
203
264
|
print(" /model <provider:model> Switch LLM provider/model")
|
|
204
|
-
print(" • /model openai:gpt-
|
|
205
|
-
print(" • /model anthropic:claude-
|
|
265
|
+
print(" • /model openai:gpt-5-mini")
|
|
266
|
+
print(" • /model anthropic:claude-haiku-4-5")
|
|
267
|
+
print(" • /model openrouter:openai/gpt-4o-mini")
|
|
268
|
+
print(" /max-tokens <n|auto> Set context token budget")
|
|
269
|
+
print(" /max-output-tokens <n|auto> Set max output tokens per response")
|
|
270
|
+
print(" /thinking <mode> Set thinking/reasoning mode (best-effort)")
|
|
271
|
+
print(" • /thinking auto|on|off|low|medium|high")
|
|
272
|
+
print(" /show-reasoning <mode> Display reasoning separately (auto/on/off)")
|
|
273
|
+
print(" • /show-reasoning auto|on|off")
|
|
206
274
|
print(" /stream Toggle streaming mode on/off")
|
|
207
275
|
print(" /debug Toggle debug info (timing, detection)")
|
|
208
276
|
|
|
@@ -231,7 +299,7 @@ class SimpleCLI:
|
|
|
231
299
|
print(" • Search inside files: 'Find all TODO comments in Python files'")
|
|
232
300
|
print(" • Request file operations: 'Read the README.md file'")
|
|
233
301
|
print(" • Attach files: 'What's in this image? @photo.jpg'")
|
|
234
|
-
print(" • Save important conversations: '/save project_discussion --summary'")
|
|
302
|
+
print(" • Save important conversations: '/session save project_discussion --summary'")
|
|
235
303
|
print(" • Switch models for different tasks: '/model ollama:qwen3-coder:30b'")
|
|
236
304
|
print(" • Use /status to check token usage and model capabilities")
|
|
237
305
|
|
|
@@ -240,13 +308,17 @@ class SimpleCLI:
|
|
|
240
308
|
print("=" * 70 + "\n")
|
|
241
309
|
|
|
242
310
|
elif cmd == 'clear':
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
311
|
+
self.handle_clear()
|
|
312
|
+
|
|
313
|
+
elif cmd == 'cls':
|
|
314
|
+
self._clear_screen()
|
|
246
315
|
|
|
247
316
|
elif cmd == 'reset':
|
|
248
|
-
self.
|
|
249
|
-
|
|
317
|
+
if self.prompt_cache_mode == "kv":
|
|
318
|
+
self.handle_clear()
|
|
319
|
+
else:
|
|
320
|
+
self.session.clear_history(keep_system=True)
|
|
321
|
+
print("🧹 Chat history reset")
|
|
250
322
|
|
|
251
323
|
elif cmd == 'stream':
|
|
252
324
|
self.stream_mode = not self.stream_mode
|
|
@@ -260,6 +332,134 @@ class SimpleCLI:
|
|
|
260
332
|
elif cmd == 'status':
|
|
261
333
|
self.handle_status()
|
|
262
334
|
|
|
335
|
+
elif cmd.startswith('thinking'):
|
|
336
|
+
parts = cmd.split(maxsplit=1)
|
|
337
|
+
if len(parts) == 1:
|
|
338
|
+
current = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
|
|
339
|
+
print(f"🧠 thinking: {current}")
|
|
340
|
+
print("❓ Usage: /thinking <auto|on|off|low|medium|high>")
|
|
341
|
+
return True
|
|
342
|
+
|
|
343
|
+
raw = parts[1].strip().lower()
|
|
344
|
+
if raw in {"auto", "none", "null"}:
|
|
345
|
+
self.thinking = None
|
|
346
|
+
elif raw in {"on", "true", "1", "yes"}:
|
|
347
|
+
self.thinking = True
|
|
348
|
+
elif raw in {"off", "false", "0", "no"}:
|
|
349
|
+
self.thinking = False
|
|
350
|
+
elif raw in {"low", "medium", "high"}:
|
|
351
|
+
self.thinking = raw
|
|
352
|
+
else:
|
|
353
|
+
print("❓ Usage: /thinking <auto|on|off|low|medium|high>")
|
|
354
|
+
return True
|
|
355
|
+
|
|
356
|
+
current = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
|
|
357
|
+
print(f"✅ thinking set to: {current}")
|
|
358
|
+
return True
|
|
359
|
+
|
|
360
|
+
elif cmd.startswith('show-reasoning') or cmd.startswith('reasoning'):
|
|
361
|
+
parts = cmd.split(maxsplit=1)
|
|
362
|
+
if len(parts) == 1:
|
|
363
|
+
current = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
|
|
364
|
+
print(f"🧠 show-reasoning: {current}")
|
|
365
|
+
print("❓ Usage: /show-reasoning <auto|on|off>")
|
|
366
|
+
return True
|
|
367
|
+
|
|
368
|
+
raw = parts[1].strip().lower()
|
|
369
|
+
if raw in {"auto", "none", "null"}:
|
|
370
|
+
self.show_reasoning = None
|
|
371
|
+
elif raw in {"on", "true", "1", "yes"}:
|
|
372
|
+
self.show_reasoning = True
|
|
373
|
+
elif raw in {"off", "false", "0", "no"}:
|
|
374
|
+
self.show_reasoning = False
|
|
375
|
+
else:
|
|
376
|
+
print("❓ Usage: /show-reasoning <auto|on|off>")
|
|
377
|
+
return True
|
|
378
|
+
|
|
379
|
+
current = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
|
|
380
|
+
print(f"✅ show-reasoning set to: {current}")
|
|
381
|
+
return True
|
|
382
|
+
|
|
383
|
+
elif cmd.startswith('max-tokens'):
|
|
384
|
+
parts = cmd.split()
|
|
385
|
+
if len(parts) == 1:
|
|
386
|
+
print(f"💾 max_tokens (context budget): {self.max_tokens:,} ({'auto' if self.max_tokens_auto else 'manual'})")
|
|
387
|
+
print("❓ Usage: /max-tokens <n|auto>")
|
|
388
|
+
else:
|
|
389
|
+
raw_value = parts[1].strip().lower()
|
|
390
|
+
if raw_value in {"auto", "-1"}:
|
|
391
|
+
try:
|
|
392
|
+
from ..architectures.detection import get_model_capabilities
|
|
393
|
+
capabilities = get_model_capabilities(self.model_name)
|
|
394
|
+
detected = capabilities.get('max_tokens', 16384)
|
|
395
|
+
except Exception:
|
|
396
|
+
detected = 16384
|
|
397
|
+
self.max_tokens = int(detected)
|
|
398
|
+
self.max_tokens_auto = True
|
|
399
|
+
else:
|
|
400
|
+
try:
|
|
401
|
+
new_max = int(raw_value)
|
|
402
|
+
if new_max <= 0:
|
|
403
|
+
raise ValueError
|
|
404
|
+
self.max_tokens = new_max
|
|
405
|
+
self.max_tokens_auto = False
|
|
406
|
+
except ValueError:
|
|
407
|
+
print("❓ Usage: /max-tokens <n|auto> (n must be a positive integer)")
|
|
408
|
+
return True
|
|
409
|
+
|
|
410
|
+
# Apply to current provider (best-effort; mostly used for token budgeting/compaction).
|
|
411
|
+
try:
|
|
412
|
+
setattr(self.provider, "max_tokens", self.max_tokens)
|
|
413
|
+
except Exception:
|
|
414
|
+
pass
|
|
415
|
+
|
|
416
|
+
# Safety clamp: output should not exceed total budget.
|
|
417
|
+
if isinstance(self.max_output_tokens, int) and self.max_output_tokens > int(self.max_tokens):
|
|
418
|
+
self.max_output_tokens = int(self.max_tokens)
|
|
419
|
+
try:
|
|
420
|
+
setattr(self.provider, "max_output_tokens", self.max_output_tokens)
|
|
421
|
+
except Exception:
|
|
422
|
+
pass
|
|
423
|
+
|
|
424
|
+
print(f"✅ max_tokens set to {self.max_tokens:,}")
|
|
425
|
+
|
|
426
|
+
elif cmd.startswith('max-output-tokens'):
|
|
427
|
+
parts = cmd.split()
|
|
428
|
+
if len(parts) == 1:
|
|
429
|
+
print(f"✍️ max_output_tokens (per response): {self.max_output_tokens:,} ({'auto' if self.max_output_tokens_auto else 'manual'})")
|
|
430
|
+
print("❓ Usage: /max-output-tokens <n|auto>")
|
|
431
|
+
else:
|
|
432
|
+
raw_value = parts[1].strip().lower()
|
|
433
|
+
if raw_value in {"auto", "-1"}:
|
|
434
|
+
try:
|
|
435
|
+
from ..architectures.detection import get_model_capabilities
|
|
436
|
+
capabilities = get_model_capabilities(self.model_name)
|
|
437
|
+
detected = capabilities.get('max_output_tokens', getattr(self.provider, "max_output_tokens", 2048))
|
|
438
|
+
except Exception:
|
|
439
|
+
detected = getattr(self.provider, "max_output_tokens", 2048)
|
|
440
|
+
self.max_output_tokens = int(detected)
|
|
441
|
+
self.max_output_tokens_auto = True
|
|
442
|
+
else:
|
|
443
|
+
try:
|
|
444
|
+
new_max = int(raw_value)
|
|
445
|
+
if new_max <= 0:
|
|
446
|
+
raise ValueError
|
|
447
|
+
self.max_output_tokens = new_max
|
|
448
|
+
self.max_output_tokens_auto = False
|
|
449
|
+
except ValueError:
|
|
450
|
+
print("❓ Usage: /max-output-tokens <n|auto> (n must be a positive integer)")
|
|
451
|
+
return True
|
|
452
|
+
|
|
453
|
+
# Safety clamp: output should not exceed total budget.
|
|
454
|
+
if isinstance(self.max_tokens, int) and self.max_output_tokens > int(self.max_tokens):
|
|
455
|
+
self.max_output_tokens = int(self.max_tokens)
|
|
456
|
+
|
|
457
|
+
try:
|
|
458
|
+
setattr(self.provider, "max_output_tokens", self.max_output_tokens)
|
|
459
|
+
except Exception:
|
|
460
|
+
pass
|
|
461
|
+
print(f"✅ max_output_tokens set to {self.max_output_tokens:,}")
|
|
462
|
+
|
|
263
463
|
elif cmd.startswith('history'):
|
|
264
464
|
# Parse /history [n] command
|
|
265
465
|
parts = cmd.split()
|
|
@@ -282,13 +482,46 @@ class SimpleCLI:
|
|
|
282
482
|
self.model_name = model_spec
|
|
283
483
|
|
|
284
484
|
print(f"🔄 Switching to {self.provider_name}:{self.model_name}...")
|
|
485
|
+
# If token limits were auto-detected, re-detect them for the new model.
|
|
486
|
+
next_max_tokens = self.max_tokens
|
|
487
|
+
if self.max_tokens_auto:
|
|
488
|
+
try:
|
|
489
|
+
from ..architectures.detection import get_model_capabilities
|
|
490
|
+
capabilities = get_model_capabilities(self.model_name)
|
|
491
|
+
next_max_tokens = int(capabilities.get('max_tokens', 16384))
|
|
492
|
+
except Exception:
|
|
493
|
+
next_max_tokens = 16384
|
|
494
|
+
|
|
495
|
+
next_max_output_tokens = self.max_output_tokens
|
|
496
|
+
if self.max_output_tokens_auto:
|
|
497
|
+
try:
|
|
498
|
+
from ..architectures.detection import get_model_capabilities
|
|
499
|
+
capabilities = get_model_capabilities(self.model_name)
|
|
500
|
+
next_max_output_tokens = int(capabilities.get('max_output_tokens', self.max_output_tokens))
|
|
501
|
+
except Exception:
|
|
502
|
+
next_max_output_tokens = self.max_output_tokens
|
|
503
|
+
|
|
504
|
+
# Safety clamp: output should not exceed total budget.
|
|
505
|
+
if isinstance(next_max_tokens, int) and isinstance(next_max_output_tokens, int):
|
|
506
|
+
if next_max_output_tokens > next_max_tokens:
|
|
507
|
+
next_max_output_tokens = next_max_tokens
|
|
508
|
+
|
|
285
509
|
self.provider = create_llm(self.provider_name, model=self.model_name,
|
|
286
|
-
max_tokens=
|
|
510
|
+
max_tokens=next_max_tokens,
|
|
511
|
+
max_output_tokens=next_max_output_tokens,
|
|
512
|
+
**self.kwargs)
|
|
513
|
+
self.max_tokens = next_max_tokens
|
|
514
|
+
self.max_output_tokens = getattr(self.provider, "max_output_tokens", next_max_output_tokens)
|
|
287
515
|
self.session = BasicSession(
|
|
288
516
|
self.provider,
|
|
289
517
|
system_prompt="You are a helpful AI assistant with vision capabilities. When users provide images or media files, analyze and describe them directly. You also have access to file operation tools.",
|
|
290
518
|
tools=[list_files, read_file, write_file, execute_command, search_files]
|
|
291
519
|
)
|
|
520
|
+
# Reset caching state for the new provider+model.
|
|
521
|
+
self.prompt_cache_key = None
|
|
522
|
+
self.prompt_cache_file = None
|
|
523
|
+
self.prompt_cache_mode = "off"
|
|
524
|
+
self._init_prompt_caching(show_banner=False)
|
|
292
525
|
print("✅ Model switched")
|
|
293
526
|
except Exception as e:
|
|
294
527
|
print(f"❌ Failed to switch: {e}")
|
|
@@ -345,12 +578,87 @@ class SimpleCLI:
|
|
|
345
578
|
else:
|
|
346
579
|
self.handle_system_show()
|
|
347
580
|
|
|
581
|
+
elif cmd.startswith('session'):
|
|
582
|
+
# /session save|load|clear ...
|
|
583
|
+
parts = cmd.split()
|
|
584
|
+
if len(parts) < 2:
|
|
585
|
+
print("❓ Usage: /session <save|load|clear> ...")
|
|
586
|
+
print(" Examples:")
|
|
587
|
+
print(" /session save my_conversation")
|
|
588
|
+
print(" /session save analyzed_session --summary --assessment --facts")
|
|
589
|
+
print(" /session load my_conversation")
|
|
590
|
+
print(" /session clear")
|
|
591
|
+
return True
|
|
592
|
+
|
|
593
|
+
action = parts[1].strip().lower()
|
|
594
|
+
if action == "save":
|
|
595
|
+
if len(parts) < 3:
|
|
596
|
+
print("❓ Usage: /session save <name> [--summary] [--assessment] [--facts]")
|
|
597
|
+
return True
|
|
598
|
+
filename = parts[2]
|
|
599
|
+
options = {
|
|
600
|
+
'summary': '--summary' in parts[3:],
|
|
601
|
+
'assessment': '--assessment' in parts[3:],
|
|
602
|
+
'facts': '--facts' in parts[3:],
|
|
603
|
+
}
|
|
604
|
+
self.handle_save(filename, **options)
|
|
605
|
+
return True
|
|
606
|
+
|
|
607
|
+
if action == "load":
|
|
608
|
+
if len(parts) != 3:
|
|
609
|
+
print("❓ Usage: /session load <name>")
|
|
610
|
+
return True
|
|
611
|
+
self.handle_load(parts[2])
|
|
612
|
+
return True
|
|
613
|
+
|
|
614
|
+
if action == "clear":
|
|
615
|
+
self.handle_clear()
|
|
616
|
+
return True
|
|
617
|
+
|
|
618
|
+
print("❓ Usage: /session <save|load|clear> ...")
|
|
619
|
+
return True
|
|
620
|
+
|
|
621
|
+
elif cmd.startswith('cache'):
|
|
622
|
+
# /cache save|load|clear ...
|
|
623
|
+
parts = cmd.split()
|
|
624
|
+
if len(parts) < 2:
|
|
625
|
+
print("❓ Usage: /cache <save|load|clear> ...")
|
|
626
|
+
print(" Examples:")
|
|
627
|
+
print(" /cache save chat_cache")
|
|
628
|
+
print(" /cache load chat_cache")
|
|
629
|
+
print(" /cache clear")
|
|
630
|
+
return True
|
|
631
|
+
|
|
632
|
+
action = parts[1].strip().lower()
|
|
633
|
+
if action == "save":
|
|
634
|
+
if len(parts) < 3:
|
|
635
|
+
print("❓ Usage: /cache save <name> [--q8]")
|
|
636
|
+
return True
|
|
637
|
+
filename = parts[2]
|
|
638
|
+
self.handle_save_prompt_cache(filename, q8=("--q8" in parts[3:]))
|
|
639
|
+
return True
|
|
640
|
+
|
|
641
|
+
if action == "load":
|
|
642
|
+
if len(parts) != 3:
|
|
643
|
+
print("❓ Usage: /cache load <name>")
|
|
644
|
+
return True
|
|
645
|
+
self.handle_load_prompt_cache(parts[2])
|
|
646
|
+
return True
|
|
647
|
+
|
|
648
|
+
if action == "clear":
|
|
649
|
+
self.handle_cache_clear()
|
|
650
|
+
return True
|
|
651
|
+
|
|
652
|
+
print("❓ Usage: /cache <save|load|clear> ...")
|
|
653
|
+
return True
|
|
654
|
+
|
|
348
655
|
elif cmd.startswith('save'):
|
|
349
656
|
# Parse /save <file> [--summary] [--assessment] [--facts] command
|
|
350
657
|
parts = cmd.split()
|
|
351
658
|
if len(parts) < 2:
|
|
352
659
|
print("❓ Usage: /save <filename> [--summary] [--assessment] [--facts]")
|
|
353
|
-
print(" Example: /save my_conversation
|
|
660
|
+
print(" Example: /save my_conversation")
|
|
661
|
+
print(" Hint: use /cache save <name> for prompt caches")
|
|
354
662
|
print(" Example: /save analyzed_session --summary --assessment --facts")
|
|
355
663
|
else:
|
|
356
664
|
filename = parts[1]
|
|
@@ -366,7 +674,8 @@ class SimpleCLI:
|
|
|
366
674
|
parts = cmd.split()
|
|
367
675
|
if len(parts) != 2:
|
|
368
676
|
print("❓ Usage: /load <filename>")
|
|
369
|
-
print(" Example: /load my_conversation
|
|
677
|
+
print(" Example: /load my_conversation")
|
|
678
|
+
print(" Hint: use /cache load <name> for prompt caches")
|
|
370
679
|
else:
|
|
371
680
|
filename = parts[1]
|
|
372
681
|
self.handle_load(filename)
|
|
@@ -390,6 +699,423 @@ class SimpleCLI:
|
|
|
390
699
|
|
|
391
700
|
return True
|
|
392
701
|
|
|
702
|
+
def _clear_screen(self) -> None:
|
|
703
|
+
os.system('cls' if os.name == 'nt' else 'clear')
|
|
704
|
+
|
|
705
|
+
def _print_error(self, msg: str) -> None:
|
|
706
|
+
red = "\033[31m"
|
|
707
|
+
reset = "\033[0m"
|
|
708
|
+
print(f"{red}{msg}{reset}")
|
|
709
|
+
|
|
710
|
+
def _print_warn(self, msg: str) -> None:
|
|
711
|
+
yellow = "\033[33m"
|
|
712
|
+
reset = "\033[0m"
|
|
713
|
+
print(f"{yellow}{msg}{reset}")
|
|
714
|
+
|
|
715
|
+
def _force_extension(self, filename: str, ext: str) -> str:
|
|
716
|
+
"""Ensure `filename` ends with `ext` by replacing any existing suffix (best-effort)."""
|
|
717
|
+
ext = str(ext or "").strip()
|
|
718
|
+
if not ext:
|
|
719
|
+
return filename
|
|
720
|
+
if not ext.startswith("."):
|
|
721
|
+
ext = f".{ext}"
|
|
722
|
+
try:
|
|
723
|
+
p = Path(filename)
|
|
724
|
+
except Exception:
|
|
725
|
+
return f"{filename}{ext}"
|
|
726
|
+
if p.suffix:
|
|
727
|
+
return str(p.with_suffix(ext))
|
|
728
|
+
return f"{p}{ext}"
|
|
729
|
+
|
|
730
|
+
def _resolve_session_path(self, filename: str) -> Optional[str]:
|
|
731
|
+
"""Resolve a session file path (prefers exact match, then `.json`)."""
|
|
732
|
+
if not isinstance(filename, str) or not filename.strip():
|
|
733
|
+
return None
|
|
734
|
+
raw = filename.strip()
|
|
735
|
+
candidates = [raw]
|
|
736
|
+
forced = self._force_extension(raw, ".json")
|
|
737
|
+
if forced != raw:
|
|
738
|
+
candidates.append(forced)
|
|
739
|
+
for cand in candidates:
|
|
740
|
+
if os.path.exists(cand):
|
|
741
|
+
return cand
|
|
742
|
+
return None
|
|
743
|
+
|
|
744
|
+
def _resolve_cache_path(self, filename: str) -> Optional[str]:
|
|
745
|
+
"""Resolve a cache file path (prefers exact match, then `.safetensors` / `.safetensor`)."""
|
|
746
|
+
if not isinstance(filename, str) or not filename.strip():
|
|
747
|
+
return None
|
|
748
|
+
raw = filename.strip()
|
|
749
|
+
candidates = [raw]
|
|
750
|
+
forced = self._force_extension(raw, ".safetensors")
|
|
751
|
+
if forced != raw:
|
|
752
|
+
candidates.append(forced)
|
|
753
|
+
forced_alt = self._force_extension(raw, ".safetensor")
|
|
754
|
+
if forced_alt not in candidates:
|
|
755
|
+
candidates.append(forced_alt)
|
|
756
|
+
for cand in candidates:
|
|
757
|
+
if os.path.exists(cand):
|
|
758
|
+
return cand
|
|
759
|
+
return None
|
|
760
|
+
|
|
761
|
+
def _kv_cache_token_count(self, key: str) -> Optional[int]:
|
|
762
|
+
"""Best-effort token count for the active KV cache key (MLX)."""
|
|
763
|
+
if not isinstance(key, str) or not key.strip():
|
|
764
|
+
return None
|
|
765
|
+
try:
|
|
766
|
+
cache_obj = getattr(self.provider, "_prompt_cache_store").get(key.strip())
|
|
767
|
+
except Exception:
|
|
768
|
+
cache_obj = None
|
|
769
|
+
if cache_obj is None:
|
|
770
|
+
return None
|
|
771
|
+
try:
|
|
772
|
+
tok = getattr(self.provider, "_prompt_cache_backend_token_count")(cache_obj)
|
|
773
|
+
return int(tok) if isinstance(tok, int) else None
|
|
774
|
+
except Exception:
|
|
775
|
+
return None
|
|
776
|
+
|
|
777
|
+
def _kv_refresh_tools_if_needed(self, *, reason: str, force: bool = False) -> bool:
|
|
778
|
+
"""Re-inject tool specs into the active KV cache when recency or origin requires it."""
|
|
779
|
+
if self.prompt_cache_mode != "kv":
|
|
780
|
+
return False
|
|
781
|
+
if not self._is_mlx_provider():
|
|
782
|
+
return False
|
|
783
|
+
if not self._supports_prompt_cache():
|
|
784
|
+
return False
|
|
785
|
+
if not getattr(self.session, "tools", None):
|
|
786
|
+
return False
|
|
787
|
+
|
|
788
|
+
key = self.prompt_cache_key
|
|
789
|
+
if not isinstance(key, str) or not key.strip():
|
|
790
|
+
return False
|
|
791
|
+
|
|
792
|
+
# Long-context models can “forget” early tool specs; re-inject near the end when the cache is very large.
|
|
793
|
+
threshold_default = 50_000
|
|
794
|
+
try:
|
|
795
|
+
threshold = int(os.getenv("ABSTRACTCORE_CLI_KV_REFRESH_TOOLS_AT", str(threshold_default)))
|
|
796
|
+
except Exception:
|
|
797
|
+
threshold = threshold_default
|
|
798
|
+
if threshold < 0:
|
|
799
|
+
threshold = threshold_default
|
|
800
|
+
|
|
801
|
+
tok = self._kv_cache_token_count(key)
|
|
802
|
+
should = bool(force) or (isinstance(tok, int) and tok >= threshold)
|
|
803
|
+
if not should:
|
|
804
|
+
return False
|
|
805
|
+
|
|
806
|
+
try:
|
|
807
|
+
getattr(self.provider, "prompt_cache_update")(
|
|
808
|
+
key,
|
|
809
|
+
system_prompt=None, # tools-only system message for recency
|
|
810
|
+
tools=self.session.tools,
|
|
811
|
+
add_generation_prompt=False,
|
|
812
|
+
)
|
|
813
|
+
except Exception as e:
|
|
814
|
+
self._print_warn(f"⚠️ Could not refresh tools into KV cache ({reason}): {e}")
|
|
815
|
+
return False
|
|
816
|
+
|
|
817
|
+
if not self.single_prompt_mode:
|
|
818
|
+
extra = f" (~{tok:,} tokens)" if isinstance(tok, int) and tok > 0 else ""
|
|
819
|
+
print(f"🧰 Tools refreshed into KV cache ({reason}){extra}")
|
|
820
|
+
return True
|
|
821
|
+
|
|
822
|
+
def _get_country_code(self) -> str:
|
|
823
|
+
val = os.getenv("ABSTRACTCORE_CLI_COUNTRY")
|
|
824
|
+
if isinstance(val, str) and val.strip():
|
|
825
|
+
cc = val.strip().upper()
|
|
826
|
+
return cc if len(cc) == 2 else cc[:2]
|
|
827
|
+
|
|
828
|
+
# Best-effort locale fallback (e.g. "en_US" -> "US")
|
|
829
|
+
try:
|
|
830
|
+
loc = locale.getlocale()[0] or ""
|
|
831
|
+
except Exception:
|
|
832
|
+
loc = ""
|
|
833
|
+
if isinstance(loc, str) and "_" in loc:
|
|
834
|
+
cc = loc.split("_", 1)[1].strip().upper()
|
|
835
|
+
if cc:
|
|
836
|
+
return cc[:2]
|
|
837
|
+
|
|
838
|
+
return "FR"
|
|
839
|
+
|
|
840
|
+
def _timestamp_user_message(self, text: str) -> str:
|
|
841
|
+
ts = datetime.now().strftime("%Y/%m/%d %H:%M")
|
|
842
|
+
return f"[{ts} {self.country_code}] {text}"
|
|
843
|
+
|
|
844
|
+
def _supports_prompt_cache(self) -> bool:
|
|
845
|
+
try:
|
|
846
|
+
fn = getattr(self.provider, "supports_prompt_cache", None)
|
|
847
|
+
return bool(fn and fn())
|
|
848
|
+
except Exception:
|
|
849
|
+
return False
|
|
850
|
+
|
|
851
|
+
def _is_mlx_provider(self) -> bool:
|
|
852
|
+
return str(self.provider_name or "").strip().lower() == "mlx"
|
|
853
|
+
|
|
854
|
+
def _analysis_provider(self) -> Any:
|
|
855
|
+
"""Provider to use for internal CLI analytics (never mutates KV prompt cache)."""
|
|
856
|
+
if self.prompt_cache_mode != "kv":
|
|
857
|
+
return self.provider
|
|
858
|
+
return _NoPromptCacheProvider(self.provider)
|
|
859
|
+
|
|
860
|
+
def _init_prompt_caching(self, *, show_banner: bool) -> None:
|
|
861
|
+
if not self._supports_prompt_cache():
|
|
862
|
+
self.prompt_cache_mode = "off"
|
|
863
|
+
return
|
|
864
|
+
|
|
865
|
+
# Default policy:
|
|
866
|
+
# - MLX: local KV cache (append-only) with explicit prefill (system+tools).
|
|
867
|
+
# - Other providers: key-only hint (pass-through / best-effort).
|
|
868
|
+
if self._is_mlx_provider():
|
|
869
|
+
self.prompt_cache_mode = "kv"
|
|
870
|
+
else:
|
|
871
|
+
self.prompt_cache_mode = "key"
|
|
872
|
+
|
|
873
|
+
self.prompt_cache_key = f"cli:{uuid.uuid4().hex[:12]}"
|
|
874
|
+
try:
|
|
875
|
+
ok = bool(getattr(self.provider, "prompt_cache_set")(self.prompt_cache_key, make_default=True))
|
|
876
|
+
except Exception:
|
|
877
|
+
ok = False
|
|
878
|
+
|
|
879
|
+
if not ok:
|
|
880
|
+
self.prompt_cache_mode = "off"
|
|
881
|
+
self.prompt_cache_key = None
|
|
882
|
+
return
|
|
883
|
+
|
|
884
|
+
if self.prompt_cache_mode == "kv":
|
|
885
|
+
# Prefill stable modules once so each turn can be appended safely.
|
|
886
|
+
try:
|
|
887
|
+
getattr(self.provider, "prompt_cache_update")(
|
|
888
|
+
self.prompt_cache_key,
|
|
889
|
+
system_prompt=self.session.system_prompt,
|
|
890
|
+
tools=self.session.tools,
|
|
891
|
+
add_generation_prompt=False,
|
|
892
|
+
)
|
|
893
|
+
except Exception as e:
|
|
894
|
+
self._print_warn(f"⚠️ Prompt cache prefill failed; falling back to key-only mode: {e}")
|
|
895
|
+
self.prompt_cache_mode = "key"
|
|
896
|
+
|
|
897
|
+
if show_banner:
|
|
898
|
+
if self.prompt_cache_mode == "kv":
|
|
899
|
+
print(f"🧠 Prompt caching: ON (KV local) key={self.prompt_cache_key}")
|
|
900
|
+
elif self.prompt_cache_mode == "key":
|
|
901
|
+
print(f"🧠 Prompt caching: ON (key hint) key={self.prompt_cache_key}")
|
|
902
|
+
|
|
903
|
+
def handle_clear(self) -> None:
|
|
904
|
+
"""Clear prompt cache and context (best-effort)."""
|
|
905
|
+
# Clear session transcript (keep system prompt for user visibility).
|
|
906
|
+
self.session.clear_history(keep_system=True)
|
|
907
|
+
|
|
908
|
+
if not self._supports_prompt_cache():
|
|
909
|
+
print("🧹 Context cleared (prompt caching unsupported)")
|
|
910
|
+
return
|
|
911
|
+
|
|
912
|
+
# Clear provider-side in-process caches (best-effort).
|
|
913
|
+
try:
|
|
914
|
+
getattr(self.provider, "prompt_cache_clear")(None)
|
|
915
|
+
except Exception:
|
|
916
|
+
pass
|
|
917
|
+
|
|
918
|
+
# Re-init caching for this run.
|
|
919
|
+
self.prompt_cache_key = None
|
|
920
|
+
self.prompt_cache_file = None
|
|
921
|
+
self._init_prompt_caching(show_banner=False)
|
|
922
|
+
|
|
923
|
+
if self.prompt_cache_mode == "off":
|
|
924
|
+
print("🧹 Context cleared (prompt caching disabled)")
|
|
925
|
+
else:
|
|
926
|
+
print("🧹 Context + prompt cache cleared")
|
|
927
|
+
|
|
928
|
+
def handle_cache_clear(self) -> None:
|
|
929
|
+
"""Clear prompt cache only (best-effort)."""
|
|
930
|
+
if not self._supports_prompt_cache():
|
|
931
|
+
print("🧹 Prompt cache cleared (prompt caching unsupported)")
|
|
932
|
+
return
|
|
933
|
+
|
|
934
|
+
# In KV mode the cache is the source-of-truth for model context; clearing it without clearing
|
|
935
|
+
# or resending history would desync the model and the transcript. Rebuild from transcript.
|
|
936
|
+
if self.prompt_cache_mode == "kv":
|
|
937
|
+
self._print_warn("⚠️ KV cache cleared; rebuilding from current session transcript")
|
|
938
|
+
try:
|
|
939
|
+
self._rebuild_kv_cache_from_session()
|
|
940
|
+
return
|
|
941
|
+
except Exception as e:
|
|
942
|
+
self._print_error(f"❌ KV cache rebuild failed: {e}")
|
|
943
|
+
self._print_warn("⚠️ Falling back to session-managed mode (no KV)")
|
|
944
|
+
self.prompt_cache_mode = "key"
|
|
945
|
+
|
|
946
|
+
# Key-only / remote mode: clear provider-side caches (best-effort) and rotate key.
|
|
947
|
+
try:
|
|
948
|
+
getattr(self.provider, "prompt_cache_clear")(None)
|
|
949
|
+
except Exception:
|
|
950
|
+
pass
|
|
951
|
+
|
|
952
|
+
self.prompt_cache_key = None
|
|
953
|
+
self.prompt_cache_file = None
|
|
954
|
+
self._init_prompt_caching(show_banner=False)
|
|
955
|
+
|
|
956
|
+
if self.prompt_cache_mode == "off":
|
|
957
|
+
print("🧹 Prompt cache cleared (prompt caching disabled)")
|
|
958
|
+
else:
|
|
959
|
+
print("🧹 Prompt cache cleared")
|
|
960
|
+
|
|
961
|
+
def handle_save_prompt_cache(self, filename: str, *, q8: bool = False) -> None:
|
|
962
|
+
"""Save MLX prompt cache to disk (writes a `.safetensors` file; model-locked)."""
|
|
963
|
+
if not self._is_mlx_provider():
|
|
964
|
+
self._print_error("❌ KV cache save is only supported for provider 'mlx'")
|
|
965
|
+
return
|
|
966
|
+
if not self._supports_prompt_cache():
|
|
967
|
+
self._print_error("❌ This provider does not support prompt caching")
|
|
968
|
+
return
|
|
969
|
+
filename = self._force_extension(filename, ".safetensors")
|
|
970
|
+
|
|
971
|
+
key = self.prompt_cache_key
|
|
972
|
+
if not isinstance(key, str) or not key.strip():
|
|
973
|
+
self._print_error("❌ No active prompt cache key; start chatting first or /clear to re-init caching")
|
|
974
|
+
return
|
|
975
|
+
|
|
976
|
+
try:
|
|
977
|
+
cache_obj = getattr(self.provider, "_prompt_cache_store").get(key)
|
|
978
|
+
except Exception:
|
|
979
|
+
cache_obj = None
|
|
980
|
+
|
|
981
|
+
if cache_obj is None:
|
|
982
|
+
self._print_error("❌ Prompt cache is empty; nothing to save yet")
|
|
983
|
+
return
|
|
984
|
+
|
|
985
|
+
try:
|
|
986
|
+
from mlx_lm.models.cache import save_prompt_cache
|
|
987
|
+
except Exception:
|
|
988
|
+
self._print_error("❌ MLX cache saving requires mlx-lm (install: `pip install \"abstractcore[mlx]\"`)")
|
|
989
|
+
return
|
|
990
|
+
|
|
991
|
+
meta: Dict[str, str] = {
|
|
992
|
+
"format": "abstractcore-cli-prompt-cache/v1",
|
|
993
|
+
"provider": str(self.provider_name),
|
|
994
|
+
"model": str(getattr(self.provider, "model", self.model_name)),
|
|
995
|
+
"saved_at": datetime.now().isoformat(),
|
|
996
|
+
}
|
|
997
|
+
try:
|
|
998
|
+
tok = getattr(self.provider, "_prompt_cache_backend_token_count")(cache_obj)
|
|
999
|
+
if isinstance(tok, int) and tok >= 0:
|
|
1000
|
+
meta["token_count"] = str(tok)
|
|
1001
|
+
except Exception:
|
|
1002
|
+
pass
|
|
1003
|
+
|
|
1004
|
+
cache_to_save = cache_obj
|
|
1005
|
+
if q8:
|
|
1006
|
+
try:
|
|
1007
|
+
cache_to_save = [layer.to_quantized(group_size=64, bits=8) for layer in cache_obj]
|
|
1008
|
+
meta["quantized"] = "q8"
|
|
1009
|
+
except Exception as e:
|
|
1010
|
+
self._print_warn(f"⚠️ q8 quantization failed; saving full-precision cache: {e}")
|
|
1011
|
+
|
|
1012
|
+
try:
|
|
1013
|
+
save_prompt_cache(filename, cache_to_save, metadata=meta)
|
|
1014
|
+
self.prompt_cache_file = filename
|
|
1015
|
+
extra = ""
|
|
1016
|
+
if "token_count" in meta:
|
|
1017
|
+
extra = f" ({meta['token_count']} tokens)"
|
|
1018
|
+
print(f"💾 Cache saved to {filename}{extra}")
|
|
1019
|
+
except Exception as e:
|
|
1020
|
+
self._print_error(f"❌ Failed to save prompt cache: {e}")
|
|
1021
|
+
|
|
1022
|
+
def handle_load_prompt_cache(self, filename: str) -> None:
|
|
1023
|
+
"""Load MLX prompt cache from disk (reads a `.safetensors` file; model-locked)."""
|
|
1024
|
+
if not self._is_mlx_provider():
|
|
1025
|
+
self._print_error("❌ KV cache load is only supported for provider 'mlx'")
|
|
1026
|
+
return
|
|
1027
|
+
if not self._supports_prompt_cache():
|
|
1028
|
+
self._print_error("❌ This provider does not support prompt caching")
|
|
1029
|
+
return
|
|
1030
|
+
resolved = self._resolve_cache_path(filename)
|
|
1031
|
+
if not resolved:
|
|
1032
|
+
self._print_error(f"❌ File not found: {self._force_extension(filename, '.safetensors')}")
|
|
1033
|
+
return
|
|
1034
|
+
|
|
1035
|
+
try:
|
|
1036
|
+
from mlx_lm.models.cache import load_prompt_cache
|
|
1037
|
+
except Exception:
|
|
1038
|
+
self._print_error("❌ MLX cache loading requires mlx-lm (install: `pip install \"abstractcore[mlx]\"`)")
|
|
1039
|
+
return
|
|
1040
|
+
|
|
1041
|
+
try:
|
|
1042
|
+
loaded_cache, meta = load_prompt_cache(resolved, return_metadata=True)
|
|
1043
|
+
except Exception as e:
|
|
1044
|
+
self._print_error(f"❌ Failed to load prompt cache: {e}")
|
|
1045
|
+
return
|
|
1046
|
+
|
|
1047
|
+
required_model = None
|
|
1048
|
+
if isinstance(meta, dict):
|
|
1049
|
+
required_model = meta.get("model") or meta.get("model_id")
|
|
1050
|
+
current_model = str(getattr(self.provider, "model", self.model_name))
|
|
1051
|
+
|
|
1052
|
+
if isinstance(required_model, str) and required_model.strip() and required_model.strip() != current_model:
|
|
1053
|
+
self._print_error(
|
|
1054
|
+
"❌ Prompt cache model mismatch:\n"
|
|
1055
|
+
f" cache expects: {required_model}\n"
|
|
1056
|
+
f" current model: {current_model}\n"
|
|
1057
|
+
f" hint: run `/model mlx:{required_model}` then `/cache load {self._force_extension(filename, '.safetensors')}`"
|
|
1058
|
+
)
|
|
1059
|
+
return
|
|
1060
|
+
if not isinstance(required_model, str) or not required_model.strip():
|
|
1061
|
+
# Best-effort structural check: layer count mismatch is a strong signal of wrong model.
|
|
1062
|
+
try:
|
|
1063
|
+
expected = getattr(self.provider, "_prompt_cache_backend_create")()
|
|
1064
|
+
if isinstance(expected, (list, tuple)) and isinstance(loaded_cache, (list, tuple)):
|
|
1065
|
+
if len(expected) != len(loaded_cache):
|
|
1066
|
+
self._print_error(
|
|
1067
|
+
"❌ Prompt cache appears incompatible with the current model (layer count mismatch).\n"
|
|
1068
|
+
f" cache layers: {len(loaded_cache)}\n"
|
|
1069
|
+
f" model layers: {len(expected)}\n"
|
|
1070
|
+
f" hint: regenerate the cache with this model, or switch model and retry"
|
|
1071
|
+
)
|
|
1072
|
+
return
|
|
1073
|
+
except Exception:
|
|
1074
|
+
pass
|
|
1075
|
+
self._print_warn("⚠️ Cache metadata has no model id; cannot fully verify compatibility (proceeding best-effort)")
|
|
1076
|
+
|
|
1077
|
+
# Clear existing caches and install the loaded cache under a fresh key.
|
|
1078
|
+
try:
|
|
1079
|
+
getattr(self.provider, "prompt_cache_clear")(None)
|
|
1080
|
+
except Exception:
|
|
1081
|
+
pass
|
|
1082
|
+
|
|
1083
|
+
new_key = f"cli:{uuid.uuid4().hex[:12]}"
|
|
1084
|
+
try:
|
|
1085
|
+
getattr(self.provider, "prompt_cache_set")(new_key, make_default=True)
|
|
1086
|
+
except Exception:
|
|
1087
|
+
pass
|
|
1088
|
+
|
|
1089
|
+
try:
|
|
1090
|
+
getattr(self.provider, "_prompt_cache_store").set(
|
|
1091
|
+
new_key,
|
|
1092
|
+
loaded_cache,
|
|
1093
|
+
meta={"backend": "mlx", "loaded_from": resolved, **(meta if isinstance(meta, dict) else {})},
|
|
1094
|
+
)
|
|
1095
|
+
except Exception as e:
|
|
1096
|
+
self._print_error(f"❌ Failed to install loaded cache into provider store: {e}")
|
|
1097
|
+
return
|
|
1098
|
+
|
|
1099
|
+
self.prompt_cache_mode = "kv"
|
|
1100
|
+
self.prompt_cache_key = new_key
|
|
1101
|
+
self.prompt_cache_file = resolved
|
|
1102
|
+
|
|
1103
|
+
# Reset transcript; the cache becomes the source of truth for context.
|
|
1104
|
+
self.session.clear_history(keep_system=False)
|
|
1105
|
+
token_note = ""
|
|
1106
|
+
if isinstance(meta, dict) and isinstance(meta.get("token_count"), str) and meta.get("token_count"):
|
|
1107
|
+
token_note = f" ({meta.get('token_count')} tokens)"
|
|
1108
|
+
print(f"📂 Cache loaded from {resolved}{token_note} (key={new_key})")
|
|
1109
|
+
|
|
1110
|
+
cache_format = meta.get("format") if isinstance(meta, dict) else None
|
|
1111
|
+
force_refresh = cache_format != "abstractcore-cli-prompt-cache/v1"
|
|
1112
|
+
if force_refresh and not self.single_prompt_mode:
|
|
1113
|
+
self._print_warn(
|
|
1114
|
+
"⚠️ Loaded cache has no AbstractCore CLI metadata; it may not include tool specs.\n"
|
|
1115
|
+
" Injecting current CLI tool definitions into the KV cache for recency."
|
|
1116
|
+
)
|
|
1117
|
+
self._kv_refresh_tools_if_needed(reason="cache load", force=force_refresh)
|
|
1118
|
+
|
|
393
1119
|
def handle_compact(self, focus: Optional[str] = None):
|
|
394
1120
|
"""Handle /compact [focus] command - compact chat history with optional focus"""
|
|
395
1121
|
messages = self.session.get_messages()
|
|
@@ -419,10 +1145,17 @@ class SimpleCLI:
|
|
|
419
1145
|
start_time = time.time()
|
|
420
1146
|
|
|
421
1147
|
# Perform in-place compaction with optional focus
|
|
422
|
-
self.session.
|
|
423
|
-
preserve_recent=4, # Keep last
|
|
424
|
-
focus=focus or "key information and ongoing context"
|
|
1148
|
+
compacted = self.session.compact(
|
|
1149
|
+
preserve_recent=4, # Keep last 4 messages (2 exchanges)
|
|
1150
|
+
focus=focus or "key information and ongoing context",
|
|
1151
|
+
compact_provider=compact_provider,
|
|
1152
|
+
reason="user_requested",
|
|
425
1153
|
)
|
|
1154
|
+
# Replace current session with compacted version (in-place).
|
|
1155
|
+
try:
|
|
1156
|
+
self.session._replace_with_compacted(compacted)
|
|
1157
|
+
except Exception:
|
|
1158
|
+
self.session = compacted
|
|
426
1159
|
|
|
427
1160
|
duration = time.time() - start_time
|
|
428
1161
|
|
|
@@ -439,10 +1172,10 @@ class SimpleCLI:
|
|
|
439
1172
|
else:
|
|
440
1173
|
print(f" {i+1}. ⚙️ System prompt")
|
|
441
1174
|
elif msg.role == 'user':
|
|
442
|
-
preview =
|
|
1175
|
+
preview = preview_text(msg.content, max_chars=50)
|
|
443
1176
|
print(f" {i+1}. 👤 {preview}")
|
|
444
1177
|
elif msg.role == 'assistant':
|
|
445
|
-
preview =
|
|
1178
|
+
preview = preview_text(msg.content, max_chars=50)
|
|
446
1179
|
print(f" {i+1}. 🤖 {preview}")
|
|
447
1180
|
|
|
448
1181
|
print(" 💡 Note: Token count may increase initially due to detailed summary")
|
|
@@ -463,7 +1196,7 @@ class SimpleCLI:
|
|
|
463
1196
|
print("🔍 Extracting facts from conversation history...")
|
|
464
1197
|
|
|
465
1198
|
# Create fact extractor using current provider for consistency
|
|
466
|
-
extractor = BasicExtractor(self.
|
|
1199
|
+
extractor = BasicExtractor(self._analysis_provider())
|
|
467
1200
|
|
|
468
1201
|
# Format conversation history as text
|
|
469
1202
|
conversation_text = self._format_conversation_for_extraction(messages)
|
|
@@ -539,7 +1272,7 @@ class SimpleCLI:
|
|
|
539
1272
|
print("⚖️ Evaluating conversation quality...")
|
|
540
1273
|
|
|
541
1274
|
# Create judge using current provider for consistency
|
|
542
|
-
judge = BasicJudge(self.
|
|
1275
|
+
judge = BasicJudge(self._analysis_provider())
|
|
543
1276
|
|
|
544
1277
|
# Format conversation history as text
|
|
545
1278
|
conversation_text = self._format_conversation_for_extraction(messages)
|
|
@@ -653,7 +1386,7 @@ class SimpleCLI:
|
|
|
653
1386
|
print("🎯 Analyzing conversation intents for all participants...")
|
|
654
1387
|
|
|
655
1388
|
# Create intent analyzer using current provider for consistency
|
|
656
|
-
analyzer = BasicIntentAnalyzer(self.
|
|
1389
|
+
analyzer = BasicIntentAnalyzer(self._analysis_provider())
|
|
657
1390
|
|
|
658
1391
|
# Convert session messages to the format expected by intent analyzer
|
|
659
1392
|
conversation_messages = [msg for msg in messages if msg.role != 'system']
|
|
@@ -717,7 +1450,7 @@ class SimpleCLI:
|
|
|
717
1450
|
# Truncate long response approaches for readability
|
|
718
1451
|
response_approach = analysis.suggested_response_approach
|
|
719
1452
|
if len(response_approach) > 200:
|
|
720
|
-
response_approach = response_approach
|
|
1453
|
+
response_approach = preview_text(response_approach, max_chars=200)
|
|
721
1454
|
print(f" {response_approach}")
|
|
722
1455
|
|
|
723
1456
|
# Analysis metadata
|
|
@@ -861,18 +1594,26 @@ class SimpleCLI:
|
|
|
861
1594
|
break
|
|
862
1595
|
else:
|
|
863
1596
|
# No existing system message, add one at the beginning
|
|
864
|
-
|
|
1597
|
+
created = self.session.add_message('system', new_prompt)
|
|
1598
|
+
# add_message appends; move the created system message to the front for correct ordering.
|
|
1599
|
+
try:
|
|
1600
|
+
self.session.messages.remove(created)
|
|
1601
|
+
except Exception:
|
|
1602
|
+
pass
|
|
1603
|
+
self.session.messages.insert(0, created)
|
|
865
1604
|
|
|
866
1605
|
print("✅ System prompt updated!")
|
|
867
1606
|
print(f"📝 Old: {old_prompt[:100]}{'...' if len(old_prompt) > 100 else ''}")
|
|
868
1607
|
print(f"📝 New: {new_prompt[:100]}{'...' if len(new_prompt) > 100 else ''}")
|
|
869
1608
|
|
|
1609
|
+
if self.prompt_cache_mode == "kv":
|
|
1610
|
+
self._print_warn("⚠️ KV prompt cache invalidated by system prompt change; clearing cache and context")
|
|
1611
|
+
self.handle_clear()
|
|
1612
|
+
|
|
870
1613
|
def handle_save(self, filename: str, summary: bool = False, assessment: bool = False, facts: bool = False):
|
|
871
1614
|
"""Handle /save <file> command - save current session to file with optional analytics"""
|
|
872
1615
|
try:
|
|
873
|
-
|
|
874
|
-
if not filename.endswith('.json'):
|
|
875
|
-
filename = f"{filename}.json"
|
|
1616
|
+
filename = self._force_extension(filename, ".json")
|
|
876
1617
|
|
|
877
1618
|
print(f"💾 Saving session to {filename}...")
|
|
878
1619
|
|
|
@@ -882,11 +1623,12 @@ class SimpleCLI:
|
|
|
882
1623
|
|
|
883
1624
|
# Generate optional analytics if requested
|
|
884
1625
|
analytics_generated = []
|
|
1626
|
+
analysis_provider = self._analysis_provider()
|
|
885
1627
|
|
|
886
1628
|
if summary:
|
|
887
1629
|
print(" 🔄 Generating summary...")
|
|
888
1630
|
try:
|
|
889
|
-
self.session.generate_summary(focus="key discussion points")
|
|
1631
|
+
self.session.generate_summary(focus="key discussion points", compact_provider=analysis_provider)
|
|
890
1632
|
analytics_generated.append("summary")
|
|
891
1633
|
print(" ✅ Summary generated")
|
|
892
1634
|
except Exception as e:
|
|
@@ -894,20 +1636,38 @@ class SimpleCLI:
|
|
|
894
1636
|
|
|
895
1637
|
if assessment:
|
|
896
1638
|
print(" 🔄 Generating assessment...")
|
|
1639
|
+
original_provider = None
|
|
897
1640
|
try:
|
|
1641
|
+
original_provider = self.session.provider
|
|
1642
|
+
self.session.provider = analysis_provider
|
|
898
1643
|
self.session.generate_assessment()
|
|
1644
|
+
self.session.provider = original_provider
|
|
899
1645
|
analytics_generated.append("assessment")
|
|
900
1646
|
print(" ✅ Assessment generated")
|
|
901
1647
|
except Exception as e:
|
|
1648
|
+
try:
|
|
1649
|
+
if original_provider is not None:
|
|
1650
|
+
self.session.provider = original_provider
|
|
1651
|
+
except Exception:
|
|
1652
|
+
pass
|
|
902
1653
|
print(f" ⚠️ Assessment generation failed: {e}")
|
|
903
1654
|
|
|
904
1655
|
if facts:
|
|
905
1656
|
print(" 🔄 Extracting facts...")
|
|
1657
|
+
original_provider = None
|
|
906
1658
|
try:
|
|
1659
|
+
original_provider = self.session.provider
|
|
1660
|
+
self.session.provider = analysis_provider
|
|
907
1661
|
self.session.extract_facts()
|
|
1662
|
+
self.session.provider = original_provider
|
|
908
1663
|
analytics_generated.append("facts")
|
|
909
1664
|
print(" ✅ Facts extracted")
|
|
910
1665
|
except Exception as e:
|
|
1666
|
+
try:
|
|
1667
|
+
if original_provider is not None:
|
|
1668
|
+
self.session.provider = original_provider
|
|
1669
|
+
except Exception:
|
|
1670
|
+
pass
|
|
911
1671
|
print(f" ⚠️ Fact extraction failed: {e}")
|
|
912
1672
|
|
|
913
1673
|
# Save using enhanced serialization
|
|
@@ -935,17 +1695,15 @@ class SimpleCLI:
|
|
|
935
1695
|
def handle_load(self, filename: str):
|
|
936
1696
|
"""Handle /load <file> command - load session from file"""
|
|
937
1697
|
try:
|
|
938
|
-
|
|
939
|
-
if not filename.endswith('.json'):
|
|
940
|
-
filename = f"{filename}.json"
|
|
1698
|
+
resolved = self._resolve_session_path(filename) or self._force_extension(filename, ".json")
|
|
941
1699
|
|
|
942
1700
|
# Check if file exists
|
|
943
1701
|
import os
|
|
944
|
-
if not os.path.exists(
|
|
945
|
-
print(f"❌ File not found: {
|
|
1702
|
+
if not os.path.exists(resolved):
|
|
1703
|
+
print(f"❌ File not found: {resolved}")
|
|
946
1704
|
return
|
|
947
1705
|
|
|
948
|
-
print(f"📂 Loading session from {
|
|
1706
|
+
print(f"📂 Loading session from {resolved}...")
|
|
949
1707
|
|
|
950
1708
|
# Store current session info for comparison
|
|
951
1709
|
old_messages = len(self.session.get_messages())
|
|
@@ -955,17 +1713,27 @@ class SimpleCLI:
|
|
|
955
1713
|
from ..tools.common_tools import list_files, read_file, write_file, execute_command, search_files
|
|
956
1714
|
tools = [list_files, read_file, write_file, execute_command, search_files]
|
|
957
1715
|
|
|
958
|
-
loaded_session = BasicSession.load(
|
|
1716
|
+
loaded_session = BasicSession.load(resolved, provider=self.provider, tools=tools)
|
|
959
1717
|
|
|
960
1718
|
# Replace current session
|
|
961
1719
|
self.session = loaded_session
|
|
962
|
-
|
|
1720
|
+
|
|
1721
|
+
# If we're in local KV cache mode (MLX), rebuild the cache from the loaded transcript so
|
|
1722
|
+
# the model context matches what the user sees.
|
|
1723
|
+
if self._is_mlx_provider() and self._supports_prompt_cache():
|
|
1724
|
+
try:
|
|
1725
|
+
self.prompt_cache_mode = "kv"
|
|
1726
|
+
self._rebuild_kv_cache_from_session()
|
|
1727
|
+
except Exception as e:
|
|
1728
|
+
self._print_warn(f"⚠️ KV cache rebuild from session failed; continuing without KV mode: {e}")
|
|
1729
|
+
self.prompt_cache_mode = "key"
|
|
1730
|
+
|
|
963
1731
|
# Get new session info
|
|
964
1732
|
new_messages = len(self.session.get_messages())
|
|
965
1733
|
new_tokens = self.session.get_token_estimate()
|
|
966
1734
|
|
|
967
1735
|
print(f"✅ Session loaded successfully!")
|
|
968
|
-
print(f" 📁 File: {
|
|
1736
|
+
print(f" 📁 File: {resolved}")
|
|
969
1737
|
print(f" 📝 Messages: {old_messages} → {new_messages}")
|
|
970
1738
|
print(f" 🔢 Tokens: ~{old_tokens:,} → ~{new_tokens:,}")
|
|
971
1739
|
print(f" 🤖 Provider: {self.provider_name}:{self.model_name} (current)")
|
|
@@ -994,6 +1762,69 @@ class SimpleCLI:
             import traceback
             traceback.print_exc()

+    def _rebuild_kv_cache_from_session(self) -> None:
+        """Best-effort rebuild of the local KV prompt cache from the current session transcript."""
+        if not self._is_mlx_provider():
+            return
+        if not self._supports_prompt_cache():
+            return
+
+        # Fresh cache key for the rebuilt state.
+        try:
+            getattr(self.provider, "prompt_cache_clear")(None)
+        except Exception:
+            pass
+
+        key = f"cli:{uuid.uuid4().hex[:12]}"
+        ok = False
+        try:
+            ok = bool(getattr(self.provider, "prompt_cache_set")(key, make_default=True))
+        except Exception:
+            ok = False
+
+        if not ok:
+            self.prompt_cache_mode = "off"
+            self.prompt_cache_key = None
+            raise RuntimeError("provider failed to create a prompt cache")
+
+        # Prefill stable modules.
+        try:
+            getattr(self.provider, "prompt_cache_update")(
+                key,
+                system_prompt=self.session.system_prompt,
+                tools=self.session.tools,
+                add_generation_prompt=False,
+            )
+        except Exception as e:
+            raise RuntimeError(f"failed to prefill system/tools: {e}") from e
+
+        # Append any additional transcript messages (excluding the main system prompt we just prefixed).
+        messages_to_append: List[Dict[str, Any]] = []
+        for msg in self.session.get_messages():
+            role = getattr(msg, "role", None)
+            content = getattr(msg, "content", None)
+            if role == "system":
+                if isinstance(self.session.system_prompt, str) and content == self.session.system_prompt and not str(content).startswith("[CONVERSATION HISTORY]"):
+                    continue
+            if role and content is not None:
+                messages_to_append.append({"role": role, "content": content})
+
+        if messages_to_append:
+            try:
+                getattr(self.provider, "prompt_cache_update")(
+                    key,
+                    messages=messages_to_append,
+                    add_generation_prompt=False,
+                )
+            except Exception as e:
+                raise RuntimeError(f"failed to append transcript messages: {e}") from e
+
+        self.prompt_cache_key = key
+        self.prompt_cache_file = None
+        self.prompt_cache_mode = "kv"
+        print(f"🧠 KV prompt cache rebuilt from session (key={key}, messages={len(messages_to_append)})")
+        self._kv_refresh_tools_if_needed(reason="session rebuild", force=False)
+
     def handle_tooltag_test(self, opening_tag: str, closing_tag: str):
         """Handle /tooltag command - demonstrate tool call format handling"""
         print(f"🏷️ Tool call format testing: {opening_tag}...{closing_tag}")
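For readers tracing the new `_rebuild_kv_cache_from_session` method above, the essential provider interaction is a clear → create → prefill → append sequence. A condensed sketch of that flow; the `prompt_cache_clear` / `prompt_cache_set` / `prompt_cache_update` names and keyword arguments are taken from the diff, while the free-standing function, the `provider` object and the transcript shape are illustrative assumptions:

```python
import uuid
from typing import Any, Dict, List

def rebuild_kv_cache(provider: Any, system_prompt: str, tools: list,
                     transcript: List[Dict[str, Any]]) -> str:
    """Sketch of the append-only KV rebuild the CLI performs after loading a session."""
    # 1. Drop any existing cache entries so the rebuilt state starts clean.
    try:
        provider.prompt_cache_clear(None)
    except Exception:
        pass

    # 2. Create a fresh cache entry and make it the default for generation.
    key = f"cli:{uuid.uuid4().hex[:12]}"
    if not provider.prompt_cache_set(key, make_default=True):
        raise RuntimeError("provider failed to create a prompt cache")

    # 3. Prefill the stable prefix (system prompt + tool schemas) without a generation prompt.
    provider.prompt_cache_update(key, system_prompt=system_prompt, tools=tools,
                                 add_generation_prompt=False)

    # 4. Append the remaining transcript; the system prompt already lives in the prefix.
    messages = [m for m in transcript if m.get("role") != "system"]
    if messages:
        provider.prompt_cache_update(key, messages=messages, add_generation_prompt=False)
    return key
```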
@@ -1010,6 +1841,29 @@ class SimpleCLI:
         print(f"🔧 Provider: {self.provider_name}")
         print(f"🤖 Model: {self.model_name}")
         print(f"🌊 Streaming: {'Enabled' if self.stream_mode else 'Disabled'}")
+        thinking_label = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
+        print(f"🧠 Thinking: {thinking_label}")
+        show_reasoning_label = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
+        print(f"🧠 Show reasoning: {show_reasoning_label}")
+        if self.prompt_cache_mode != "off":
+            cache_details = f"mode={self.prompt_cache_mode}"
+            if self.prompt_cache_key:
+                cache_details += f" key={self.prompt_cache_key}"
+            if self.prompt_cache_file:
+                cache_details += f" file={self.prompt_cache_file}"
+            print(f"🧠 Prompt caching: {cache_details}")
+            try:
+                if hasattr(self.provider, "get_prompt_cache_stats"):
+                    stats = self.provider.get_prompt_cache_stats()
+                    if isinstance(stats, dict):
+                        entries = stats.get("entries")
+                        max_entries = stats.get("max_entries")
+                        if entries is not None and max_entries is not None:
+                            print(f" Cache store: {entries}/{max_entries} entries")
+            except Exception:
+                pass
+        else:
+            print("🧠 Prompt caching: off")

         # Debug status - show both CLI and system logging
         print(f"🐛 CLI Debug: {'Enabled' if self.debug_mode else 'Disabled'}")
@@ -1035,7 +1889,8 @@ class SimpleCLI:

         # Token usage
         current_tokens = self.session.get_token_estimate()
-        print(f"💾
+        print(f"💾 Context Usage: {current_tokens:,} / {self.max_tokens:,} tokens ({(current_tokens/self.max_tokens*100):.1f}%)")
+        print(f"✍️ Max Output Tokens: {self.max_output_tokens:,}")

         # Model capabilities
         try:
@@ -1050,6 +1905,11 @@ class SimpleCLI:
             print(f" Vision Support: {'Yes' if capabilities.get('vision_support', False) else 'No'}")
             print(f" Audio Support: {'Yes' if capabilities.get('audio_support', False) else 'No'}")
             print(f" Thinking Support: {'Yes' if capabilities.get('thinking_support', False) else 'No'}")
+            reasoning_levels = capabilities.get("reasoning_levels")
+            if isinstance(reasoning_levels, list) and reasoning_levels:
+                levels_str = ", ".join([str(x) for x in reasoning_levels if isinstance(x, str) and x.strip()])
+                if levels_str:
+                    print(f" Reasoning Levels: {levels_str}")

             # Show aliases if any
             aliases = capabilities.get('aliases', [])
@@ -1129,23 +1989,37 @@ class SimpleCLI:
             if not clean_input and media_files:
                 clean_input = "Please analyze the attached file(s)."

+            clean_input = self._timestamp_user_message(clean_input)
+
             if self.debug_mode:
                 print(f"🔍 Sending to {self.provider_name}:{self.model_name}")
                 if media_files:
                     print(f"🔍 Media files: {media_files}")

-
-
-
-
-
-
+            if self.prompt_cache_mode == "kv":
+                response = self._generate_response_kv(
+                    clean_input,
+                    media=media_files if media_files else None,
+                )
+            else:
+                # Generate response with media support (session-managed history)
+                gen_kwargs: Dict[str, Any] = {
+                    "stream": self.stream_mode,
+                    "media": media_files if media_files else None,
+                    "max_output_tokens": self.max_output_tokens,
+                }
+                if self.thinking is not None:
+                    gen_kwargs["thinking"] = self.thinking
+                response = self.session.generate(clean_input, **gen_kwargs)

             if self.stream_mode:
-
+                show_reasoning = self._should_show_reasoning() and not self.single_prompt_mode
+                buffer_for_reasoning_first = self._should_buffer_stream_for_reasoning_first()
+                if not self.single_prompt_mode and not buffer_for_reasoning_first:
                     print("🤖 Assistant: ", end="", flush=True)
                 full_content = ""
                 display_buffer = "" # Buffer for cleaned display content
+                reasoning_parts: List[str] = []

                 for chunk in response:
                     if hasattr(chunk, 'content') and chunk.content:
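The dispatch above keeps two generation paths: in KV mode only the new prompt is sent (the cache already holds the conversation context), while the default path lets the session assemble history and receives a kwargs dictionary. A small sketch of how that dictionary is built, mirroring the diff; the helper function itself is illustrative and not part of the package:

```python
from typing import Any, Dict, Optional

def build_generation_kwargs(stream: bool, media: Optional[list],
                            max_output_tokens: int,
                            thinking: Optional[object] = None) -> Dict[str, Any]:
    """Assemble the kwargs the CLI hands to session.generate() in the non-KV path."""
    kwargs: Dict[str, Any] = {
        "stream": stream,
        "media": media if media else None,
        "max_output_tokens": max_output_tokens,
    }
    # 'thinking' is only forwarded when the user set it explicitly,
    # so provider/model defaults stay in charge otherwise.
    if thinking is not None:
        kwargs["thinking"] = thinking
    return kwargs

# e.g. response = session.generate(clean_input, **build_generation_kwargs(True, None, 1024))
```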
@@ -1170,17 +2044,34 @@ class SimpleCLI:
                             '```tool_code'
                         ])

-
-
+                        # If we want reasoning-first display, buffer output (no live streaming).
+                        if buffer_for_reasoning_first:
                             display_buffer += chunk_text
                         else:
-
-
+                            if not has_tool_marker:
+                                print(chunk_text, end="", flush=True)
+                                display_buffer += chunk_text
+                            else:
+                                # Buffer the chunk, we'll process after streaming
+                                display_buffer += chunk_text
+
+                    # Best-effort: capture streamed reasoning metadata (OpenAI-compatible deltas, etc.).
+                    if hasattr(chunk, "metadata") and isinstance(getattr(chunk, "metadata"), dict):
+                        r = chunk.metadata.get("reasoning")
+                        if isinstance(r, str) and r.strip():
+                            reasoning_parts.append(r.strip())

-
+                if not buffer_for_reasoning_first:
+                    print() # New line after streaming

                 # Parse and execute tool calls from full content
                 clean_content, tool_calls = self._parse_and_strip_tool_calls(full_content)
+                if self.prompt_cache_mode == "kv":
+                    # Maintain transcript for UX; model context lives in KV cache.
+                    try:
+                        self.session.add_message("assistant", clean_content.strip() or full_content)
+                    except Exception:
+                        pass

                 # If we buffered tool call content, we should have shown clean content
                 # For now, if there's significant difference, show the clean version
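The streaming loop above prints or buffers display text and, separately, collects any `reasoning` strings carried in chunk metadata. A compact sketch of that consumption pattern over a generic chunk iterator; the chunk attributes (`.content`, `.metadata["reasoning"]`) follow the diff, while the standalone function is illustrative:

```python
from typing import Any, Iterable, List, Tuple

def consume_stream(chunks: Iterable[Any]) -> Tuple[str, str]:
    """Return (full_text, combined_reasoning) from a stream of response chunks."""
    text_parts: List[str] = []
    reasoning_parts: List[str] = []
    for chunk in chunks:
        content = getattr(chunk, "content", None)
        if content:
            text_parts.append(content)
        # Reasoning may arrive out-of-band in chunk metadata (e.g. OpenAI-compatible deltas).
        meta = getattr(chunk, "metadata", None)
        if isinstance(meta, dict):
            reasoning = meta.get("reasoning")
            if isinstance(reasoning, str) and reasoning.strip():
                reasoning_parts.append(reasoning.strip())
    return "".join(text_parts), "\n\n".join(reasoning_parts)
```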
@@ -1189,12 +2080,38 @@ class SimpleCLI:
                     # This happens when tool calls appear mid-stream
                     if self.debug_mode:
                         print(f"\n🔍 Cleaned content differs from streamed content")
-
+
+                combined = "\n\n".join(reasoning_parts).strip() if reasoning_parts else ""
+                if show_reasoning and combined:
+                    self._print_reasoning_block(combined)
+
+                # Reasoning-first UX: show the final answer after reasoning (buffered).
+                if buffer_for_reasoning_first:
+                    if clean_content.strip():
+                        print(f"🤖 Assistant: {clean_content}")
+                    elif tool_calls and not self.single_prompt_mode:
+                        print("🤖 Assistant: ", end="")
+                    elif self.single_prompt_mode:
+                        print(clean_content or full_content)
+                    else:
+                        print(f"🤖 Assistant: {clean_content or full_content}")
+
                 self._execute_tool_calls(tool_calls)
             else:
                 # Non-streaming: parse content, display clean version, execute tools
                 clean_content, tool_calls = self._parse_and_strip_tool_calls(response.content)
+                if self.prompt_cache_mode == "kv":
+                    try:
+                        self.session.add_message("assistant", clean_content.strip() or response.content)
+                    except Exception:
+                        pass

+                meta = getattr(response, "metadata", None)
+                if self._should_show_reasoning() and not self.single_prompt_mode and isinstance(meta, dict):
+                    r = meta.get("reasoning")
+                    if isinstance(r, str) and r.strip():
+                        self._print_reasoning_block(r.strip())
+
                 # Display only the clean content (without tool call syntax)
                 if clean_content.strip():
                     if self.single_prompt_mode:
@@ -1204,14 +2121,14 @@ class SimpleCLI:
                 elif tool_calls:
                     # Only tool calls, no text response
                     if not self.single_prompt_mode:
-                        print(
+                        print("🤖 Assistant: ", end="")
                 else:
                     # Empty response
                     if self.single_prompt_mode:
                         print(response.content)
                     else:
                         print(f"🤖 Assistant: {response.content}")
-
+
                 # Execute tool calls
                 self._execute_tool_calls(tool_calls)

@@ -1227,6 +2144,96 @@ class SimpleCLI:
             import traceback
             traceback.print_exc()

+    def _should_show_reasoning(self) -> bool:
+        """Decide whether to display reasoning in the CLI output."""
+        if self.show_reasoning is not None:
+            return bool(self.show_reasoning)
+        # Auto: show when present unless explicitly disabled.
+        if self.thinking is False:
+            return False
+        return True
+
+    def _should_buffer_stream_for_reasoning_first(self) -> bool:
+        """Decide whether to buffer streaming output to show reasoning before the answer."""
+        if self.single_prompt_mode:
+            return False
+        if not self._should_show_reasoning():
+            return False
+
+        # If the user explicitly enabled reasoning display or requested thinking, honor reasoning-first UX.
+        if self.show_reasoning is True:
+            return True
+        if self.thinking is not None and self.thinking is not False:
+            return True
+
+        # Auto mode: only buffer when the model is expected to emit a separate reasoning channel.
+        try:
+            from ..architectures.detection import detect_architecture, get_architecture_format, get_model_capabilities
+
+            caps = get_model_capabilities(self.model_name)
+            arch = detect_architecture(self.model_name)
+            arch_fmt = get_architecture_format(arch)
+        except Exception:
+            caps = {}
+            arch_fmt = {}
+
+        resp_fmt = str((caps or {}).get("response_format") or "").strip().lower()
+        if resp_fmt == "harmony":
+            return True
+
+        for src in (caps, arch_fmt):
+            if isinstance(src, dict):
+                f = src.get("thinking_output_field")
+                if isinstance(f, str) and f.strip():
+                    return True
+
+        return False
+
+    def _print_reasoning_block(self, reasoning: str) -> None:
+        """Print reasoning in a visually distinct style (best-effort)."""
+        import sys
+
+        text = reasoning.strip()
+        if not text:
+            return
+
+        print("🧠 Reasoning:")
+        if sys.stdout.isatty():
+            # Grey + italic (best-effort; not all terminals support italics).
+            print(f"\x1b[90m\x1b[3m{text}\x1b[0m")
+        else:
+            print(text)
+
+    def _generate_response_kv(self, prompt: str, *, media: Optional[list] = None):
+        """Generate response using append-only KV cache mode (local providers only)."""
+        # Maintain a local transcript for UX, but do not send it to the model; the KV cache is source-of-truth.
+        try:
+            self.session.add_message("user", prompt)
+        except Exception:
+            pass
+
+        gen_kwargs: Dict[str, Any] = {
+            "prompt": prompt,
+            "messages": None,
+            "system_prompt": None,
+            "tools": None, # tools were prefixed into the cache during prefill
+            "media": media,
+            "stream": bool(self.stream_mode),
+            "max_output_tokens": self.max_output_tokens,
+        }
+        if self.thinking is not None:
+            gen_kwargs["thinking"] = self.thinking
+        # Preserve session-level generation parameters for consistency.
+        try:
+            if getattr(self.session, "temperature", None) is not None:
+                gen_kwargs["temperature"] = self.session.temperature
+            if isinstance(getattr(self.session, "seed", None), int) and self.session.seed >= 0:
+                gen_kwargs["seed"] = self.session.seed
+        except Exception:
+            pass
+
+        return self.provider.generate(**gen_kwargs)
+
     def _parse_and_strip_tool_calls(self, content: str):
         """
         Parse tool calls from content and return (clean_content, tool_calls).
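`_should_buffer_stream_for_reasoning_first` above combines explicit user settings with model-capability lookups. A simplified, standalone sketch of the same decision; the `response_format == "harmony"` and `thinking_output_field` checks come from the diff, while treating the capability dictionaries as plain inputs is a simplification for illustration:

```python
from typing import Any, Dict, Optional

def should_buffer_for_reasoning_first(show_reasoning: Optional[bool], thinking: Optional[object],
                                      caps: Dict[str, Any], arch_fmt: Dict[str, Any],
                                      single_prompt: bool = False) -> bool:
    """Buffer streamed output so reasoning can be printed before the final answer."""
    if single_prompt:
        return False
    # Mirrors _should_show_reasoning(): an explicit setting wins; auto shows unless thinking is off.
    shows = show_reasoning if show_reasoning is not None else (thinking is not False)
    if not shows:
        return False
    # Explicit user intent (reasoning display or a thinking request) means reasoning-first UX.
    if show_reasoning is True or (thinking is not None and thinking is not False):
        return True
    # Auto mode: buffer only when the model exposes a separate reasoning channel.
    if str(caps.get("response_format") or "").strip().lower() == "harmony":
        return True
    for src in (caps, arch_fmt):
        field = src.get("thinking_output_field") if isinstance(src, dict) else None
        if isinstance(field, str) and field.strip():
            return True
    return False
```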
@@ -1337,7 +2344,7 @@ class SimpleCLI:
            if not self.single_prompt_mode:
                args_str = str(tool_args) if tool_args else "{}"
                if len(args_str) > 100:
-                    args_str = args_str
+                    args_str = preview_text(args_str, max_chars=100)
                print(f"**{tool_name}({args_str})**")

            # Execute the tool
@@ -1435,14 +2442,18 @@ def main():
         epilog="""
 Examples:
   python -m abstractcore.utils.cli --provider ollama --model qwen3-coder:30b
-  python -m abstractcore.utils.cli --provider openai --model gpt-
-  python -m abstractcore.utils.cli --provider anthropic --model claude-
+  python -m abstractcore.utils.cli --provider openai --model gpt-5-mini --stream
+  python -m abstractcore.utils.cli --provider anthropic --model claude-haiku-4-5
+  python -m abstractcore.utils.cli --provider lmstudio --model qwen/qwen3-4b-2507 --base-url http://localhost:1234/v1
+  python -m abstractcore.utils.cli --provider openrouter --model openai/gpt-4o-mini
   python -m abstractcore.utils.cli --prompt "What is Python?" # Uses configured defaults

 Key Commands:
   /help                    Show comprehensive command guide
-  /save <
-  /load <
+  /session save <name> [--summary --assessment --facts]  Save session JSON (writes .json)
+  /session load <name>     Load saved session JSON (reads .json)
+  /cache save <name>       Save MLX prompt/KV cache (writes .safetensors)
+  /cache load <name>       Load MLX prompt/KV cache (reads .safetensors)
   /status                  Show system status and capabilities
   /history [n]             Show conversation history
   /model <provider:model>  Switch LLM provider/model
@@ -1471,18 +2482,19 @@ build custom solutions using the AbstractCore framework directly.

     # Optional arguments (no longer required - will use configured defaults)
     parser.add_argument('--provider',
-                        choices=['openai', 'anthropic', 'ollama', 'huggingface', 'mlx', 'lmstudio'],
+                        choices=['openai', 'anthropic', 'openrouter', 'openai-compatible', 'vllm', 'ollama', 'huggingface', 'mlx', 'lmstudio'],
                         help='LLM provider to use (optional - uses configured default)')
     parser.add_argument('--model', help='Model name to use (optional - uses configured default)')

     # Optional arguments
     parser.add_argument('--stream', action='store_true', help='Enable streaming mode')
     parser.add_argument('--debug', action='store_true', help='Enable debug mode')
-    parser.add_argument('--max-tokens', type=int, default=None, help='Maximum tokens (default: auto-detect from model capabilities)')
+    parser.add_argument('--max-tokens', type=int, default=None, help='Maximum total context tokens (default: auto-detect from model capabilities)')
+    parser.add_argument('--max-output-tokens', type=int, default=None, help='Maximum output tokens per response (default: provider/model default)')
     parser.add_argument('--prompt', help='Execute single prompt and exit')

     # Provider-specific
-    parser.add_argument('--base-url', help='Base URL (
+    parser.add_argument('--base-url', help='Base URL override (OpenAI-compatible /v1 servers, proxies, Ollama)')
     parser.add_argument('--api-key', help='API key')
     parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)')

@@ -1554,6 +2566,7 @@ build custom solutions using the AbstractCore framework directly.
         model=model,
         stream=stream_mode,
         max_tokens=args.max_tokens,
+        max_output_tokens=args.max_output_tokens,
         debug=args.debug,
         show_banner=not args.prompt, # Hide banner in single-prompt mode
         **kwargs
@@ -1567,4 +2580,4 @@ build custom solutions using the AbstractCore framework directly.


 if __name__ == "__main__":
-    main()
+    main()