abstractcore 2.9.1__py3-none-any.whl → 2.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. abstractcore/__init__.py +7 -27
  2. abstractcore/apps/deepsearch.py +9 -4
  3. abstractcore/apps/extractor.py +33 -100
  4. abstractcore/apps/intent.py +19 -0
  5. abstractcore/apps/judge.py +20 -1
  6. abstractcore/apps/summarizer.py +20 -1
  7. abstractcore/architectures/detection.py +34 -1
  8. abstractcore/architectures/response_postprocessing.py +313 -0
  9. abstractcore/assets/architecture_formats.json +38 -8
  10. abstractcore/assets/model_capabilities.json +882 -160
  11. abstractcore/compression/__init__.py +1 -2
  12. abstractcore/compression/glyph_processor.py +6 -4
  13. abstractcore/config/main.py +52 -20
  14. abstractcore/config/manager.py +390 -12
  15. abstractcore/config/vision_config.py +5 -5
  16. abstractcore/core/interface.py +151 -3
  17. abstractcore/core/session.py +16 -10
  18. abstractcore/download.py +1 -1
  19. abstractcore/embeddings/manager.py +20 -6
  20. abstractcore/endpoint/__init__.py +2 -0
  21. abstractcore/endpoint/app.py +458 -0
  22. abstractcore/mcp/client.py +3 -1
  23. abstractcore/media/__init__.py +52 -17
  24. abstractcore/media/auto_handler.py +42 -22
  25. abstractcore/media/base.py +44 -1
  26. abstractcore/media/capabilities.py +12 -33
  27. abstractcore/media/enrichment.py +105 -0
  28. abstractcore/media/handlers/anthropic_handler.py +19 -28
  29. abstractcore/media/handlers/local_handler.py +124 -70
  30. abstractcore/media/handlers/openai_handler.py +19 -31
  31. abstractcore/media/processors/__init__.py +4 -2
  32. abstractcore/media/processors/audio_processor.py +57 -0
  33. abstractcore/media/processors/office_processor.py +8 -3
  34. abstractcore/media/processors/pdf_processor.py +46 -3
  35. abstractcore/media/processors/text_processor.py +22 -24
  36. abstractcore/media/processors/video_processor.py +58 -0
  37. abstractcore/media/types.py +97 -4
  38. abstractcore/media/utils/image_scaler.py +20 -2
  39. abstractcore/media/utils/video_frames.py +219 -0
  40. abstractcore/media/vision_fallback.py +136 -22
  41. abstractcore/processing/__init__.py +32 -3
  42. abstractcore/processing/basic_deepsearch.py +15 -10
  43. abstractcore/processing/basic_intent.py +3 -2
  44. abstractcore/processing/basic_judge.py +3 -2
  45. abstractcore/processing/basic_summarizer.py +1 -1
  46. abstractcore/providers/__init__.py +3 -1
  47. abstractcore/providers/anthropic_provider.py +95 -8
  48. abstractcore/providers/base.py +1516 -81
  49. abstractcore/providers/huggingface_provider.py +546 -69
  50. abstractcore/providers/lmstudio_provider.py +30 -916
  51. abstractcore/providers/mlx_provider.py +382 -35
  52. abstractcore/providers/model_capabilities.py +5 -1
  53. abstractcore/providers/ollama_provider.py +99 -15
  54. abstractcore/providers/openai_compatible_provider.py +406 -180
  55. abstractcore/providers/openai_provider.py +188 -44
  56. abstractcore/providers/openrouter_provider.py +76 -0
  57. abstractcore/providers/registry.py +61 -5
  58. abstractcore/providers/streaming.py +138 -33
  59. abstractcore/providers/vllm_provider.py +92 -817
  60. abstractcore/server/app.py +478 -28
  61. abstractcore/server/audio_endpoints.py +139 -0
  62. abstractcore/server/vision_endpoints.py +1319 -0
  63. abstractcore/structured/handler.py +316 -41
  64. abstractcore/tools/common_tools.py +5501 -2012
  65. abstractcore/tools/comms_tools.py +1641 -0
  66. abstractcore/tools/core.py +37 -7
  67. abstractcore/tools/handler.py +4 -9
  68. abstractcore/tools/parser.py +49 -2
  69. abstractcore/tools/tag_rewriter.py +2 -1
  70. abstractcore/tools/telegram_tdlib.py +407 -0
  71. abstractcore/tools/telegram_tools.py +261 -0
  72. abstractcore/utils/cli.py +1085 -72
  73. abstractcore/utils/structured_logging.py +29 -8
  74. abstractcore/utils/token_utils.py +2 -0
  75. abstractcore/utils/truncation.py +29 -0
  76. abstractcore/utils/version.py +3 -4
  77. abstractcore/utils/vlm_token_calculator.py +12 -2
  78. abstractcore-2.11.4.dist-info/METADATA +562 -0
  79. abstractcore-2.11.4.dist-info/RECORD +133 -0
  80. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/WHEEL +1 -1
  81. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/entry_points.txt +1 -0
  82. abstractcore-2.9.1.dist-info/METADATA +0 -1190
  83. abstractcore-2.9.1.dist-info/RECORD +0 -119
  84. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/licenses/LICENSE +0 -0
  85. {abstractcore-2.9.1.dist-info → abstractcore-2.11.4.dist-info}/top_level.txt +0 -0
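
The bulk of these changes lands in abstractcore/utils/cli.py, diffed below. For orientation only, here is a minimal sketch of the provider/session calls that the new CLI code exercises; the call names and keyword arguments mirror the diff, but the top-level import path, the model choice, and the token values are assumptions rather than a verified public API.

    # Sketch only: mirrors calls visible in the cli.py diff below; not a verified example.
    from abstractcore import create_llm, BasicSession          # import path assumed
    from abstractcore.tools.common_tools import list_files, read_file

    provider = create_llm(
        "ollama",                   # provider/model pair taken from the Usage examples below
        model="qwen3-coder:30b",
        max_tokens=16384,           # context budget; the CLI auto-detects this when omitted
        max_output_tokens=2048,     # per-response output cap (added to the CLI in this version range)
    )
    session = BasicSession(
        provider,
        system_prompt="You are a helpful AI assistant.",
        tools=[list_files, read_file],
    )
    # stream / media / thinking / max_output_tokens are forwarded per call by the new CLI code;
    # thinking support is best-effort and provider/model dependent, as the diff's comments note.
    response = session.generate("What is Python?", stream=False, max_output_tokens=512)
    print(getattr(response, "content", response))
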
abstractcore/utils/cli.py CHANGED
@@ -16,14 +16,23 @@ AbstractCore framework directly.
16
16
 
17
17
  Usage:
18
18
  python -m abstractcore.utils.cli --provider ollama --model qwen3-coder:30b
19
- python -m abstractcore.utils.cli --provider openai --model gpt-4o-mini --stream
20
- python -m abstractcore.utils.cli --provider anthropic --model claude-3-5-haiku-20241022 --prompt "What is Python?"
19
+ python -m abstractcore.utils.cli --provider openai --model gpt-5-mini --stream
20
+ python -m abstractcore.utils.cli --provider anthropic --model claude-haiku-4-5 --prompt "What is Python?"
21
+ python -m abstractcore.utils.cli --provider lmstudio --model qwen/qwen3-4b-2507 --base-url http://localhost:1234/v1
22
+ python -m abstractcore.utils.cli --provider openrouter --model openai/gpt-4o-mini
21
23
  """
22
24
 
23
25
  import argparse
26
+ import os
24
27
  import sys
25
28
  import time
26
- from typing import Optional
29
+ import uuid
30
+ import locale
31
+ from datetime import datetime
32
+ from pathlib import Path
33
+ from typing import Optional, Any, Dict, Iterator, List, Union
34
+
35
+ from .truncation import preview_text
27
36
 
28
37
  # Enable command history and arrow key navigation
29
38
  try:
@@ -42,11 +51,30 @@ from ..tools.common_tools import list_files, read_file, write_file, execute_comm
42
51
  from ..processing import BasicExtractor, BasicJudge, BasicIntentAnalyzer
43
52
 
44
53
 
54
+ class _NoPromptCacheProvider:
55
+ """Proxy that forces `prompt_cache_key=None` for every call (to avoid polluting KV caches)."""
56
+
57
+ def __init__(self, provider: Any):
58
+ self._provider = provider
59
+
60
+ def generate(self, *args: Any, **kwargs: Any):
61
+ kwargs["prompt_cache_key"] = None
62
+ return self._provider.generate(*args, **kwargs)
63
+
64
+ async def agenerate(self, *args: Any, **kwargs: Any):
65
+ kwargs["prompt_cache_key"] = None
66
+ return await self._provider.agenerate(*args, **kwargs)
67
+
68
+ def __getattr__(self, name: str) -> Any:
69
+ return getattr(self._provider, name)
70
+
71
+
45
72
  class SimpleCLI:
46
73
  """Simplified CLI REPL for AbstractCore"""
47
74
 
48
75
  def __init__(self, provider: str, model: str, stream: bool = False,
49
- max_tokens: int = None, debug: bool = False, show_banner: bool = True, **kwargs):
76
+ max_tokens: int = None, max_output_tokens: int = None,
77
+ debug: bool = False, show_banner: bool = True, **kwargs):
50
78
  self.provider_name = provider
51
79
  self.model_name = model
52
80
  self.stream_mode = stream
@@ -55,6 +83,7 @@ class SimpleCLI:
55
83
  self.kwargs = kwargs
56
84
 
57
85
  # Auto-detect max_tokens from model capabilities if not specified
86
+ self.max_tokens_auto = max_tokens is None
58
87
  if max_tokens is None:
59
88
  try:
60
89
  from ..architectures.detection import get_model_capabilities
@@ -68,18 +97,41 @@ class SimpleCLI:
68
97
  print(f"⚠️ Failed to auto-detect max_tokens, using fallback: {max_tokens} ({e})")
69
98
 
70
99
  self.max_tokens = max_tokens
100
+ self.max_output_tokens_auto = max_output_tokens is None
101
+ # Unified thinking/reasoning control (best-effort, provider/model dependent).
102
+ # - None: auto (provider/model default)
103
+ # - bool: on/off
104
+ # - str: "low"|"medium"|"high" when supported
105
+ self.thinking: Optional[Union[bool, str]] = None
106
+ # Whether to display model-supplied reasoning/thinking separately.
107
+ # - None: auto (show when thinking != off)
108
+ # - bool: force on/off
109
+ self.show_reasoning: Optional[bool] = None
71
110
 
72
111
  # Initialize command history with persistent storage
73
112
  self._setup_command_history()
74
113
 
75
114
  # Initialize provider and session with tools
76
- self.provider = create_llm(provider, model=model, max_tokens=max_tokens, **kwargs)
115
+ provider_kwargs = dict(kwargs)
116
+ provider_kwargs["max_tokens"] = max_tokens
117
+ if max_output_tokens is not None:
118
+ provider_kwargs["max_output_tokens"] = max_output_tokens
119
+ self.provider = create_llm(provider, model=model, **provider_kwargs)
120
+ # Store the effective max_output_tokens (provider may auto-select based on model capabilities).
121
+ self.max_output_tokens = getattr(self.provider, "max_output_tokens", max_output_tokens or 2048)
77
122
  self.session = BasicSession(
78
123
  self.provider,
79
124
  system_prompt="You are a helpful AI assistant with vision capabilities. When users provide images or media files, analyze and describe them directly. You also have access to file operation tools.",
80
125
  tools=[list_files, read_file, write_file, execute_command, search_files]
81
126
  )
82
127
 
128
+ # Prompt caching (best-effort; provider-dependent).
129
+ self.country_code = self._get_country_code()
130
+ self.prompt_cache_mode = "off" # off | key | kv
131
+ self.prompt_cache_key: Optional[str] = None
132
+ self.prompt_cache_file: Optional[str] = None
133
+ self._init_prompt_caching(show_banner=show_banner)
134
+
83
135
  # Only show banner in interactive mode
84
136
  if show_banner:
85
137
  print("=" * 70)
@@ -89,7 +141,7 @@ class SimpleCLI:
89
141
  print(f"📝 Model: {model}")
90
142
  print(f"🌊 Streaming: {'ON' if stream else 'OFF'} | 🐛 Debug: {'ON' if debug else 'OFF'}")
91
143
  print()
92
- print("💬 Quick Commands: /help /save /load /status /history /quit")
144
+ print("💬 Quick Commands: /help /session /cache /status /history /quit")
93
145
  print("🛠️ Available Tools: list_files, search_files, read_file, write_file, execute_command")
94
146
  print()
95
147
  print("💡 Type '/help' for comprehensive command guide")
@@ -158,7 +210,8 @@ class SimpleCLI:
158
210
  print("─" * 50)
159
211
  print(" /help Show this comprehensive help")
160
212
  print(" /quit Exit the CLI")
161
- print(" /clear Clear the screen (like unix terminal)")
213
+ print(" /clear Clear prompt cache + context (like mlx-chat)")
214
+ print(" /cls Clear the screen (like unix terminal)")
162
215
  print(" /reset Reset conversation history")
163
216
  print(" /status Show system status and capabilities")
164
217
 
@@ -175,17 +228,25 @@ class SimpleCLI:
175
228
  print(" • /system - Show current prompt")
176
229
  print(" • /system <text> - Set new prompt")
177
230
 
178
- print("\n💾 SESSION PERSISTENCE")
231
+ print("\n💾 SESSION & CACHE")
179
232
  print("─" * 50)
180
- print(" /save <file> [options] Save session with optional analytics")
181
- print(" • /save chat.json")
182
- print(" • /save analyzed --summary --assessment --facts")
233
+ print(" /session save <name> [options] Save session to <name>.json with optional analytics")
234
+ print(" • /session save chat")
235
+ print(" • /session save analyzed --summary --assessment --facts")
183
236
  print(" Options:")
184
237
  print(" --summary Generate conversation summary")
185
238
  print(" --assessment Evaluate conversation quality")
186
239
  print(" --facts Extract knowledge as facts")
187
- print(" /load <file> Load saved session (replaces current)")
188
- print(" • /load chat.json")
240
+ print(" /session load <name> Load session from <name>.json (replaces current)")
241
+ print(" • /session load chat")
242
+ print(" /session clear Clear session + cache (same as /clear)")
243
+ print(" /save /load Aliases for /session save|load (sessions only)")
244
+ print(" /cache save <name> Save prompt/KV cache to <name>.safetensors (MLX only, model-locked)")
245
+ print(" • /cache save chat_cache")
246
+ print(" --q8 Quantize cache before saving (smaller, lossy)")
247
+ print(" /cache load <name> Load prompt/KV cache from <name>.safetensors (MLX only, model-locked)")
248
+ print(" • /cache load chat_cache")
249
+ print(" /cache clear Clear prompt cache only (KV mode rebuilds from transcript)")
189
250
 
190
251
  print("\n📊 ANALYTICS & INSIGHTS")
191
252
  print("─" * 50)
@@ -201,8 +262,15 @@ class SimpleCLI:
201
262
  print("\n⚙️ CONFIGURATION")
202
263
  print("─" * 50)
203
264
  print(" /model <provider:model> Switch LLM provider/model")
204
- print(" • /model openai:gpt-4o-mini")
205
- print(" • /model anthropic:claude-3-5-haiku")
265
+ print(" • /model openai:gpt-5-mini")
266
+ print(" • /model anthropic:claude-haiku-4-5")
267
+ print(" • /model openrouter:openai/gpt-4o-mini")
268
+ print(" /max-tokens <n|auto> Set context token budget")
269
+ print(" /max-output-tokens <n|auto> Set max output tokens per response")
270
+ print(" /thinking <mode> Set thinking/reasoning mode (best-effort)")
271
+ print(" • /thinking auto|on|off|low|medium|high")
272
+ print(" /show-reasoning <mode> Display reasoning separately (auto/on/off)")
273
+ print(" • /show-reasoning auto|on|off")
206
274
  print(" /stream Toggle streaming mode on/off")
207
275
  print(" /debug Toggle debug info (timing, detection)")
208
276
 
@@ -231,7 +299,7 @@ class SimpleCLI:
231
299
  print(" • Search inside files: 'Find all TODO comments in Python files'")
232
300
  print(" • Request file operations: 'Read the README.md file'")
233
301
  print(" • Attach files: 'What's in this image? @photo.jpg'")
234
- print(" • Save important conversations: '/save project_discussion --summary'")
302
+ print(" • Save important conversations: '/session save project_discussion --summary'")
235
303
  print(" • Switch models for different tasks: '/model ollama:qwen3-coder:30b'")
236
304
  print(" • Use /status to check token usage and model capabilities")
237
305
 
@@ -240,13 +308,17 @@ class SimpleCLI:
240
308
  print("=" * 70 + "\n")
241
309
 
242
310
  elif cmd == 'clear':
243
- # Clear the screen like in unix terminal
244
- import os
245
- os.system('cls' if os.name == 'nt' else 'clear')
311
+ self.handle_clear()
312
+
313
+ elif cmd == 'cls':
314
+ self._clear_screen()
246
315
 
247
316
  elif cmd == 'reset':
248
- self.session.clear_history(keep_system=True)
249
- print("🧹 Chat history reset")
317
+ if self.prompt_cache_mode == "kv":
318
+ self.handle_clear()
319
+ else:
320
+ self.session.clear_history(keep_system=True)
321
+ print("🧹 Chat history reset")
250
322
 
251
323
  elif cmd == 'stream':
252
324
  self.stream_mode = not self.stream_mode
@@ -260,6 +332,134 @@ class SimpleCLI:
260
332
  elif cmd == 'status':
261
333
  self.handle_status()
262
334
 
335
+ elif cmd.startswith('thinking'):
336
+ parts = cmd.split(maxsplit=1)
337
+ if len(parts) == 1:
338
+ current = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
339
+ print(f"🧠 thinking: {current}")
340
+ print("❓ Usage: /thinking <auto|on|off|low|medium|high>")
341
+ return True
342
+
343
+ raw = parts[1].strip().lower()
344
+ if raw in {"auto", "none", "null"}:
345
+ self.thinking = None
346
+ elif raw in {"on", "true", "1", "yes"}:
347
+ self.thinking = True
348
+ elif raw in {"off", "false", "0", "no"}:
349
+ self.thinking = False
350
+ elif raw in {"low", "medium", "high"}:
351
+ self.thinking = raw
352
+ else:
353
+ print("❓ Usage: /thinking <auto|on|off|low|medium|high>")
354
+ return True
355
+
356
+ current = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
357
+ print(f"✅ thinking set to: {current}")
358
+ return True
359
+
360
+ elif cmd.startswith('show-reasoning') or cmd.startswith('reasoning'):
361
+ parts = cmd.split(maxsplit=1)
362
+ if len(parts) == 1:
363
+ current = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
364
+ print(f"🧠 show-reasoning: {current}")
365
+ print("❓ Usage: /show-reasoning <auto|on|off>")
366
+ return True
367
+
368
+ raw = parts[1].strip().lower()
369
+ if raw in {"auto", "none", "null"}:
370
+ self.show_reasoning = None
371
+ elif raw in {"on", "true", "1", "yes"}:
372
+ self.show_reasoning = True
373
+ elif raw in {"off", "false", "0", "no"}:
374
+ self.show_reasoning = False
375
+ else:
376
+ print("❓ Usage: /show-reasoning <auto|on|off>")
377
+ return True
378
+
379
+ current = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
380
+ print(f"✅ show-reasoning set to: {current}")
381
+ return True
382
+
383
+ elif cmd.startswith('max-tokens'):
384
+ parts = cmd.split()
385
+ if len(parts) == 1:
386
+ print(f"💾 max_tokens (context budget): {self.max_tokens:,} ({'auto' if self.max_tokens_auto else 'manual'})")
387
+ print("❓ Usage: /max-tokens <n|auto>")
388
+ else:
389
+ raw_value = parts[1].strip().lower()
390
+ if raw_value in {"auto", "-1"}:
391
+ try:
392
+ from ..architectures.detection import get_model_capabilities
393
+ capabilities = get_model_capabilities(self.model_name)
394
+ detected = capabilities.get('max_tokens', 16384)
395
+ except Exception:
396
+ detected = 16384
397
+ self.max_tokens = int(detected)
398
+ self.max_tokens_auto = True
399
+ else:
400
+ try:
401
+ new_max = int(raw_value)
402
+ if new_max <= 0:
403
+ raise ValueError
404
+ self.max_tokens = new_max
405
+ self.max_tokens_auto = False
406
+ except ValueError:
407
+ print("❓ Usage: /max-tokens <n|auto> (n must be a positive integer)")
408
+ return True
409
+
410
+ # Apply to current provider (best-effort; mostly used for token budgeting/compaction).
411
+ try:
412
+ setattr(self.provider, "max_tokens", self.max_tokens)
413
+ except Exception:
414
+ pass
415
+
416
+ # Safety clamp: output should not exceed total budget.
417
+ if isinstance(self.max_output_tokens, int) and self.max_output_tokens > int(self.max_tokens):
418
+ self.max_output_tokens = int(self.max_tokens)
419
+ try:
420
+ setattr(self.provider, "max_output_tokens", self.max_output_tokens)
421
+ except Exception:
422
+ pass
423
+
424
+ print(f"✅ max_tokens set to {self.max_tokens:,}")
425
+
426
+ elif cmd.startswith('max-output-tokens'):
427
+ parts = cmd.split()
428
+ if len(parts) == 1:
429
+ print(f"✍️ max_output_tokens (per response): {self.max_output_tokens:,} ({'auto' if self.max_output_tokens_auto else 'manual'})")
430
+ print("❓ Usage: /max-output-tokens <n|auto>")
431
+ else:
432
+ raw_value = parts[1].strip().lower()
433
+ if raw_value in {"auto", "-1"}:
434
+ try:
435
+ from ..architectures.detection import get_model_capabilities
436
+ capabilities = get_model_capabilities(self.model_name)
437
+ detected = capabilities.get('max_output_tokens', getattr(self.provider, "max_output_tokens", 2048))
438
+ except Exception:
439
+ detected = getattr(self.provider, "max_output_tokens", 2048)
440
+ self.max_output_tokens = int(detected)
441
+ self.max_output_tokens_auto = True
442
+ else:
443
+ try:
444
+ new_max = int(raw_value)
445
+ if new_max <= 0:
446
+ raise ValueError
447
+ self.max_output_tokens = new_max
448
+ self.max_output_tokens_auto = False
449
+ except ValueError:
450
+ print("❓ Usage: /max-output-tokens <n|auto> (n must be a positive integer)")
451
+ return True
452
+
453
+ # Safety clamp: output should not exceed total budget.
454
+ if isinstance(self.max_tokens, int) and self.max_output_tokens > int(self.max_tokens):
455
+ self.max_output_tokens = int(self.max_tokens)
456
+
457
+ try:
458
+ setattr(self.provider, "max_output_tokens", self.max_output_tokens)
459
+ except Exception:
460
+ pass
461
+ print(f"✅ max_output_tokens set to {self.max_output_tokens:,}")
462
+
263
463
  elif cmd.startswith('history'):
264
464
  # Parse /history [n] command
265
465
  parts = cmd.split()
@@ -282,13 +482,46 @@ class SimpleCLI:
282
482
  self.model_name = model_spec
283
483
 
284
484
  print(f"🔄 Switching to {self.provider_name}:{self.model_name}...")
485
+ # If token limits were auto-detected, re-detect them for the new model.
486
+ next_max_tokens = self.max_tokens
487
+ if self.max_tokens_auto:
488
+ try:
489
+ from ..architectures.detection import get_model_capabilities
490
+ capabilities = get_model_capabilities(self.model_name)
491
+ next_max_tokens = int(capabilities.get('max_tokens', 16384))
492
+ except Exception:
493
+ next_max_tokens = 16384
494
+
495
+ next_max_output_tokens = self.max_output_tokens
496
+ if self.max_output_tokens_auto:
497
+ try:
498
+ from ..architectures.detection import get_model_capabilities
499
+ capabilities = get_model_capabilities(self.model_name)
500
+ next_max_output_tokens = int(capabilities.get('max_output_tokens', self.max_output_tokens))
501
+ except Exception:
502
+ next_max_output_tokens = self.max_output_tokens
503
+
504
+ # Safety clamp: output should not exceed total budget.
505
+ if isinstance(next_max_tokens, int) and isinstance(next_max_output_tokens, int):
506
+ if next_max_output_tokens > next_max_tokens:
507
+ next_max_output_tokens = next_max_tokens
508
+
285
509
  self.provider = create_llm(self.provider_name, model=self.model_name,
286
- max_tokens=self.max_tokens, **self.kwargs)
510
+ max_tokens=next_max_tokens,
511
+ max_output_tokens=next_max_output_tokens,
512
+ **self.kwargs)
513
+ self.max_tokens = next_max_tokens
514
+ self.max_output_tokens = getattr(self.provider, "max_output_tokens", next_max_output_tokens)
287
515
  self.session = BasicSession(
288
516
  self.provider,
289
517
  system_prompt="You are a helpful AI assistant with vision capabilities. When users provide images or media files, analyze and describe them directly. You also have access to file operation tools.",
290
518
  tools=[list_files, read_file, write_file, execute_command, search_files]
291
519
  )
520
+ # Reset caching state for the new provider+model.
521
+ self.prompt_cache_key = None
522
+ self.prompt_cache_file = None
523
+ self.prompt_cache_mode = "off"
524
+ self._init_prompt_caching(show_banner=False)
292
525
  print("✅ Model switched")
293
526
  except Exception as e:
294
527
  print(f"❌ Failed to switch: {e}")
@@ -345,12 +578,87 @@ class SimpleCLI:
345
578
  else:
346
579
  self.handle_system_show()
347
580
 
581
+ elif cmd.startswith('session'):
582
+ # /session save|load|clear ...
583
+ parts = cmd.split()
584
+ if len(parts) < 2:
585
+ print("❓ Usage: /session <save|load|clear> ...")
586
+ print(" Examples:")
587
+ print(" /session save my_conversation")
588
+ print(" /session save analyzed_session --summary --assessment --facts")
589
+ print(" /session load my_conversation")
590
+ print(" /session clear")
591
+ return True
592
+
593
+ action = parts[1].strip().lower()
594
+ if action == "save":
595
+ if len(parts) < 3:
596
+ print("❓ Usage: /session save <name> [--summary] [--assessment] [--facts]")
597
+ return True
598
+ filename = parts[2]
599
+ options = {
600
+ 'summary': '--summary' in parts[3:],
601
+ 'assessment': '--assessment' in parts[3:],
602
+ 'facts': '--facts' in parts[3:],
603
+ }
604
+ self.handle_save(filename, **options)
605
+ return True
606
+
607
+ if action == "load":
608
+ if len(parts) != 3:
609
+ print("❓ Usage: /session load <name>")
610
+ return True
611
+ self.handle_load(parts[2])
612
+ return True
613
+
614
+ if action == "clear":
615
+ self.handle_clear()
616
+ return True
617
+
618
+ print("❓ Usage: /session <save|load|clear> ...")
619
+ return True
620
+
621
+ elif cmd.startswith('cache'):
622
+ # /cache save|load|clear ...
623
+ parts = cmd.split()
624
+ if len(parts) < 2:
625
+ print("❓ Usage: /cache <save|load|clear> ...")
626
+ print(" Examples:")
627
+ print(" /cache save chat_cache")
628
+ print(" /cache load chat_cache")
629
+ print(" /cache clear")
630
+ return True
631
+
632
+ action = parts[1].strip().lower()
633
+ if action == "save":
634
+ if len(parts) < 3:
635
+ print("❓ Usage: /cache save <name> [--q8]")
636
+ return True
637
+ filename = parts[2]
638
+ self.handle_save_prompt_cache(filename, q8=("--q8" in parts[3:]))
639
+ return True
640
+
641
+ if action == "load":
642
+ if len(parts) != 3:
643
+ print("❓ Usage: /cache load <name>")
644
+ return True
645
+ self.handle_load_prompt_cache(parts[2])
646
+ return True
647
+
648
+ if action == "clear":
649
+ self.handle_cache_clear()
650
+ return True
651
+
652
+ print("❓ Usage: /cache <save|load|clear> ...")
653
+ return True
654
+
348
655
  elif cmd.startswith('save'):
349
656
  # Parse /save <file> [--summary] [--assessment] [--facts] command
350
657
  parts = cmd.split()
351
658
  if len(parts) < 2:
352
659
  print("❓ Usage: /save <filename> [--summary] [--assessment] [--facts]")
353
- print(" Example: /save my_conversation.json")
660
+ print(" Example: /save my_conversation")
661
+ print(" Hint: use /cache save <name> for prompt caches")
354
662
  print(" Example: /save analyzed_session --summary --assessment --facts")
355
663
  else:
356
664
  filename = parts[1]
@@ -366,7 +674,8 @@ class SimpleCLI:
366
674
  parts = cmd.split()
367
675
  if len(parts) != 2:
368
676
  print("❓ Usage: /load <filename>")
369
- print(" Example: /load my_conversation.json")
677
+ print(" Example: /load my_conversation")
678
+ print(" Hint: use /cache load <name> for prompt caches")
370
679
  else:
371
680
  filename = parts[1]
372
681
  self.handle_load(filename)
@@ -390,6 +699,423 @@ class SimpleCLI:
390
699
 
391
700
  return True
392
701
 
702
+ def _clear_screen(self) -> None:
703
+ os.system('cls' if os.name == 'nt' else 'clear')
704
+
705
+ def _print_error(self, msg: str) -> None:
706
+ red = "\033[31m"
707
+ reset = "\033[0m"
708
+ print(f"{red}{msg}{reset}")
709
+
710
+ def _print_warn(self, msg: str) -> None:
711
+ yellow = "\033[33m"
712
+ reset = "\033[0m"
713
+ print(f"{yellow}{msg}{reset}")
714
+
715
+ def _force_extension(self, filename: str, ext: str) -> str:
716
+ """Ensure `filename` ends with `ext` by replacing any existing suffix (best-effort)."""
717
+ ext = str(ext or "").strip()
718
+ if not ext:
719
+ return filename
720
+ if not ext.startswith("."):
721
+ ext = f".{ext}"
722
+ try:
723
+ p = Path(filename)
724
+ except Exception:
725
+ return f"{filename}{ext}"
726
+ if p.suffix:
727
+ return str(p.with_suffix(ext))
728
+ return f"{p}{ext}"
729
+
730
+ def _resolve_session_path(self, filename: str) -> Optional[str]:
731
+ """Resolve a session file path (prefers exact match, then `.json`)."""
732
+ if not isinstance(filename, str) or not filename.strip():
733
+ return None
734
+ raw = filename.strip()
735
+ candidates = [raw]
736
+ forced = self._force_extension(raw, ".json")
737
+ if forced != raw:
738
+ candidates.append(forced)
739
+ for cand in candidates:
740
+ if os.path.exists(cand):
741
+ return cand
742
+ return None
743
+
744
+ def _resolve_cache_path(self, filename: str) -> Optional[str]:
745
+ """Resolve a cache file path (prefers exact match, then `.safetensors` / `.safetensor`)."""
746
+ if not isinstance(filename, str) or not filename.strip():
747
+ return None
748
+ raw = filename.strip()
749
+ candidates = [raw]
750
+ forced = self._force_extension(raw, ".safetensors")
751
+ if forced != raw:
752
+ candidates.append(forced)
753
+ forced_alt = self._force_extension(raw, ".safetensor")
754
+ if forced_alt not in candidates:
755
+ candidates.append(forced_alt)
756
+ for cand in candidates:
757
+ if os.path.exists(cand):
758
+ return cand
759
+ return None
760
+
761
+ def _kv_cache_token_count(self, key: str) -> Optional[int]:
762
+ """Best-effort token count for the active KV cache key (MLX)."""
763
+ if not isinstance(key, str) or not key.strip():
764
+ return None
765
+ try:
766
+ cache_obj = getattr(self.provider, "_prompt_cache_store").get(key.strip())
767
+ except Exception:
768
+ cache_obj = None
769
+ if cache_obj is None:
770
+ return None
771
+ try:
772
+ tok = getattr(self.provider, "_prompt_cache_backend_token_count")(cache_obj)
773
+ return int(tok) if isinstance(tok, int) else None
774
+ except Exception:
775
+ return None
776
+
777
+ def _kv_refresh_tools_if_needed(self, *, reason: str, force: bool = False) -> bool:
778
+ """Re-inject tool specs into the active KV cache when recency or origin requires it."""
779
+ if self.prompt_cache_mode != "kv":
780
+ return False
781
+ if not self._is_mlx_provider():
782
+ return False
783
+ if not self._supports_prompt_cache():
784
+ return False
785
+ if not getattr(self.session, "tools", None):
786
+ return False
787
+
788
+ key = self.prompt_cache_key
789
+ if not isinstance(key, str) or not key.strip():
790
+ return False
791
+
792
+ # Long-context models can “forget” early tool specs; re-inject near the end when the cache is very large.
793
+ threshold_default = 50_000
794
+ try:
795
+ threshold = int(os.getenv("ABSTRACTCORE_CLI_KV_REFRESH_TOOLS_AT", str(threshold_default)))
796
+ except Exception:
797
+ threshold = threshold_default
798
+ if threshold < 0:
799
+ threshold = threshold_default
800
+
801
+ tok = self._kv_cache_token_count(key)
802
+ should = bool(force) or (isinstance(tok, int) and tok >= threshold)
803
+ if not should:
804
+ return False
805
+
806
+ try:
807
+ getattr(self.provider, "prompt_cache_update")(
808
+ key,
809
+ system_prompt=None, # tools-only system message for recency
810
+ tools=self.session.tools,
811
+ add_generation_prompt=False,
812
+ )
813
+ except Exception as e:
814
+ self._print_warn(f"⚠️ Could not refresh tools into KV cache ({reason}): {e}")
815
+ return False
816
+
817
+ if not self.single_prompt_mode:
818
+ extra = f" (~{tok:,} tokens)" if isinstance(tok, int) and tok > 0 else ""
819
+ print(f"🧰 Tools refreshed into KV cache ({reason}){extra}")
820
+ return True
821
+
822
+ def _get_country_code(self) -> str:
823
+ val = os.getenv("ABSTRACTCORE_CLI_COUNTRY")
824
+ if isinstance(val, str) and val.strip():
825
+ cc = val.strip().upper()
826
+ return cc if len(cc) == 2 else cc[:2]
827
+
828
+ # Best-effort locale fallback (e.g. "en_US" -> "US")
829
+ try:
830
+ loc = locale.getlocale()[0] or ""
831
+ except Exception:
832
+ loc = ""
833
+ if isinstance(loc, str) and "_" in loc:
834
+ cc = loc.split("_", 1)[1].strip().upper()
835
+ if cc:
836
+ return cc[:2]
837
+
838
+ return "FR"
839
+
840
+ def _timestamp_user_message(self, text: str) -> str:
841
+ ts = datetime.now().strftime("%Y/%m/%d %H:%M")
842
+ return f"[{ts} {self.country_code}] {text}"
843
+
844
+ def _supports_prompt_cache(self) -> bool:
845
+ try:
846
+ fn = getattr(self.provider, "supports_prompt_cache", None)
847
+ return bool(fn and fn())
848
+ except Exception:
849
+ return False
850
+
851
+ def _is_mlx_provider(self) -> bool:
852
+ return str(self.provider_name or "").strip().lower() == "mlx"
853
+
854
+ def _analysis_provider(self) -> Any:
855
+ """Provider to use for internal CLI analytics (never mutates KV prompt cache)."""
856
+ if self.prompt_cache_mode != "kv":
857
+ return self.provider
858
+ return _NoPromptCacheProvider(self.provider)
859
+
860
+ def _init_prompt_caching(self, *, show_banner: bool) -> None:
861
+ if not self._supports_prompt_cache():
862
+ self.prompt_cache_mode = "off"
863
+ return
864
+
865
+ # Default policy:
866
+ # - MLX: local KV cache (append-only) with explicit prefill (system+tools).
867
+ # - Other providers: key-only hint (pass-through / best-effort).
868
+ if self._is_mlx_provider():
869
+ self.prompt_cache_mode = "kv"
870
+ else:
871
+ self.prompt_cache_mode = "key"
872
+
873
+ self.prompt_cache_key = f"cli:{uuid.uuid4().hex[:12]}"
874
+ try:
875
+ ok = bool(getattr(self.provider, "prompt_cache_set")(self.prompt_cache_key, make_default=True))
876
+ except Exception:
877
+ ok = False
878
+
879
+ if not ok:
880
+ self.prompt_cache_mode = "off"
881
+ self.prompt_cache_key = None
882
+ return
883
+
884
+ if self.prompt_cache_mode == "kv":
885
+ # Prefill stable modules once so each turn can be appended safely.
886
+ try:
887
+ getattr(self.provider, "prompt_cache_update")(
888
+ self.prompt_cache_key,
889
+ system_prompt=self.session.system_prompt,
890
+ tools=self.session.tools,
891
+ add_generation_prompt=False,
892
+ )
893
+ except Exception as e:
894
+ self._print_warn(f"⚠️ Prompt cache prefill failed; falling back to key-only mode: {e}")
895
+ self.prompt_cache_mode = "key"
896
+
897
+ if show_banner:
898
+ if self.prompt_cache_mode == "kv":
899
+ print(f"🧠 Prompt caching: ON (KV local) key={self.prompt_cache_key}")
900
+ elif self.prompt_cache_mode == "key":
901
+ print(f"🧠 Prompt caching: ON (key hint) key={self.prompt_cache_key}")
902
+
903
+ def handle_clear(self) -> None:
904
+ """Clear prompt cache and context (best-effort)."""
905
+ # Clear session transcript (keep system prompt for user visibility).
906
+ self.session.clear_history(keep_system=True)
907
+
908
+ if not self._supports_prompt_cache():
909
+ print("🧹 Context cleared (prompt caching unsupported)")
910
+ return
911
+
912
+ # Clear provider-side in-process caches (best-effort).
913
+ try:
914
+ getattr(self.provider, "prompt_cache_clear")(None)
915
+ except Exception:
916
+ pass
917
+
918
+ # Re-init caching for this run.
919
+ self.prompt_cache_key = None
920
+ self.prompt_cache_file = None
921
+ self._init_prompt_caching(show_banner=False)
922
+
923
+ if self.prompt_cache_mode == "off":
924
+ print("🧹 Context cleared (prompt caching disabled)")
925
+ else:
926
+ print("🧹 Context + prompt cache cleared")
927
+
928
+ def handle_cache_clear(self) -> None:
929
+ """Clear prompt cache only (best-effort)."""
930
+ if not self._supports_prompt_cache():
931
+ print("🧹 Prompt cache cleared (prompt caching unsupported)")
932
+ return
933
+
934
+ # In KV mode the cache is the source-of-truth for model context; clearing it without clearing
935
+ # or resending history would desync the model and the transcript. Rebuild from transcript.
936
+ if self.prompt_cache_mode == "kv":
937
+ self._print_warn("⚠️ KV cache cleared; rebuilding from current session transcript")
938
+ try:
939
+ self._rebuild_kv_cache_from_session()
940
+ return
941
+ except Exception as e:
942
+ self._print_error(f"❌ KV cache rebuild failed: {e}")
943
+ self._print_warn("⚠️ Falling back to session-managed mode (no KV)")
944
+ self.prompt_cache_mode = "key"
945
+
946
+ # Key-only / remote mode: clear provider-side caches (best-effort) and rotate key.
947
+ try:
948
+ getattr(self.provider, "prompt_cache_clear")(None)
949
+ except Exception:
950
+ pass
951
+
952
+ self.prompt_cache_key = None
953
+ self.prompt_cache_file = None
954
+ self._init_prompt_caching(show_banner=False)
955
+
956
+ if self.prompt_cache_mode == "off":
957
+ print("🧹 Prompt cache cleared (prompt caching disabled)")
958
+ else:
959
+ print("🧹 Prompt cache cleared")
960
+
961
+ def handle_save_prompt_cache(self, filename: str, *, q8: bool = False) -> None:
962
+ """Save MLX prompt cache to disk (writes a `.safetensors` file; model-locked)."""
963
+ if not self._is_mlx_provider():
964
+ self._print_error("❌ KV cache save is only supported for provider 'mlx'")
965
+ return
966
+ if not self._supports_prompt_cache():
967
+ self._print_error("❌ This provider does not support prompt caching")
968
+ return
969
+ filename = self._force_extension(filename, ".safetensors")
970
+
971
+ key = self.prompt_cache_key
972
+ if not isinstance(key, str) or not key.strip():
973
+ self._print_error("❌ No active prompt cache key; start chatting first or /clear to re-init caching")
974
+ return
975
+
976
+ try:
977
+ cache_obj = getattr(self.provider, "_prompt_cache_store").get(key)
978
+ except Exception:
979
+ cache_obj = None
980
+
981
+ if cache_obj is None:
982
+ self._print_error("❌ Prompt cache is empty; nothing to save yet")
983
+ return
984
+
985
+ try:
986
+ from mlx_lm.models.cache import save_prompt_cache
987
+ except Exception:
988
+ self._print_error("❌ MLX cache saving requires mlx-lm (install: `pip install \"abstractcore[mlx]\"`)")
989
+ return
990
+
991
+ meta: Dict[str, str] = {
992
+ "format": "abstractcore-cli-prompt-cache/v1",
993
+ "provider": str(self.provider_name),
994
+ "model": str(getattr(self.provider, "model", self.model_name)),
995
+ "saved_at": datetime.now().isoformat(),
996
+ }
997
+ try:
998
+ tok = getattr(self.provider, "_prompt_cache_backend_token_count")(cache_obj)
999
+ if isinstance(tok, int) and tok >= 0:
1000
+ meta["token_count"] = str(tok)
1001
+ except Exception:
1002
+ pass
1003
+
1004
+ cache_to_save = cache_obj
1005
+ if q8:
1006
+ try:
1007
+ cache_to_save = [layer.to_quantized(group_size=64, bits=8) for layer in cache_obj]
1008
+ meta["quantized"] = "q8"
1009
+ except Exception as e:
1010
+ self._print_warn(f"⚠️ q8 quantization failed; saving full-precision cache: {e}")
1011
+
1012
+ try:
1013
+ save_prompt_cache(filename, cache_to_save, metadata=meta)
1014
+ self.prompt_cache_file = filename
1015
+ extra = ""
1016
+ if "token_count" in meta:
1017
+ extra = f" ({meta['token_count']} tokens)"
1018
+ print(f"💾 Cache saved to {filename}{extra}")
1019
+ except Exception as e:
1020
+ self._print_error(f"❌ Failed to save prompt cache: {e}")
1021
+
1022
+ def handle_load_prompt_cache(self, filename: str) -> None:
1023
+ """Load MLX prompt cache from disk (reads a `.safetensors` file; model-locked)."""
1024
+ if not self._is_mlx_provider():
1025
+ self._print_error("❌ KV cache load is only supported for provider 'mlx'")
1026
+ return
1027
+ if not self._supports_prompt_cache():
1028
+ self._print_error("❌ This provider does not support prompt caching")
1029
+ return
1030
+ resolved = self._resolve_cache_path(filename)
1031
+ if not resolved:
1032
+ self._print_error(f"❌ File not found: {self._force_extension(filename, '.safetensors')}")
1033
+ return
1034
+
1035
+ try:
1036
+ from mlx_lm.models.cache import load_prompt_cache
1037
+ except Exception:
1038
+ self._print_error("❌ MLX cache loading requires mlx-lm (install: `pip install \"abstractcore[mlx]\"`)")
1039
+ return
1040
+
1041
+ try:
1042
+ loaded_cache, meta = load_prompt_cache(resolved, return_metadata=True)
1043
+ except Exception as e:
1044
+ self._print_error(f"❌ Failed to load prompt cache: {e}")
1045
+ return
1046
+
1047
+ required_model = None
1048
+ if isinstance(meta, dict):
1049
+ required_model = meta.get("model") or meta.get("model_id")
1050
+ current_model = str(getattr(self.provider, "model", self.model_name))
1051
+
1052
+ if isinstance(required_model, str) and required_model.strip() and required_model.strip() != current_model:
1053
+ self._print_error(
1054
+ "❌ Prompt cache model mismatch:\n"
1055
+ f" cache expects: {required_model}\n"
1056
+ f" current model: {current_model}\n"
1057
+ f" hint: run `/model mlx:{required_model}` then `/cache load {self._force_extension(filename, '.safetensors')}`"
1058
+ )
1059
+ return
1060
+ if not isinstance(required_model, str) or not required_model.strip():
1061
+ # Best-effort structural check: layer count mismatch is a strong signal of wrong model.
1062
+ try:
1063
+ expected = getattr(self.provider, "_prompt_cache_backend_create")()
1064
+ if isinstance(expected, (list, tuple)) and isinstance(loaded_cache, (list, tuple)):
1065
+ if len(expected) != len(loaded_cache):
1066
+ self._print_error(
1067
+ "❌ Prompt cache appears incompatible with the current model (layer count mismatch).\n"
1068
+ f" cache layers: {len(loaded_cache)}\n"
1069
+ f" model layers: {len(expected)}\n"
1070
+ f" hint: regenerate the cache with this model, or switch model and retry"
1071
+ )
1072
+ return
1073
+ except Exception:
1074
+ pass
1075
+ self._print_warn("⚠️ Cache metadata has no model id; cannot fully verify compatibility (proceeding best-effort)")
1076
+
1077
+ # Clear existing caches and install the loaded cache under a fresh key.
1078
+ try:
1079
+ getattr(self.provider, "prompt_cache_clear")(None)
1080
+ except Exception:
1081
+ pass
1082
+
1083
+ new_key = f"cli:{uuid.uuid4().hex[:12]}"
1084
+ try:
1085
+ getattr(self.provider, "prompt_cache_set")(new_key, make_default=True)
1086
+ except Exception:
1087
+ pass
1088
+
1089
+ try:
1090
+ getattr(self.provider, "_prompt_cache_store").set(
1091
+ new_key,
1092
+ loaded_cache,
1093
+ meta={"backend": "mlx", "loaded_from": resolved, **(meta if isinstance(meta, dict) else {})},
1094
+ )
1095
+ except Exception as e:
1096
+ self._print_error(f"❌ Failed to install loaded cache into provider store: {e}")
1097
+ return
1098
+
1099
+ self.prompt_cache_mode = "kv"
1100
+ self.prompt_cache_key = new_key
1101
+ self.prompt_cache_file = resolved
1102
+
1103
+ # Reset transcript; the cache becomes the source of truth for context.
1104
+ self.session.clear_history(keep_system=False)
1105
+ token_note = ""
1106
+ if isinstance(meta, dict) and isinstance(meta.get("token_count"), str) and meta.get("token_count"):
1107
+ token_note = f" ({meta.get('token_count')} tokens)"
1108
+ print(f"📂 Cache loaded from {resolved}{token_note} (key={new_key})")
1109
+
1110
+ cache_format = meta.get("format") if isinstance(meta, dict) else None
1111
+ force_refresh = cache_format != "abstractcore-cli-prompt-cache/v1"
1112
+ if force_refresh and not self.single_prompt_mode:
1113
+ self._print_warn(
1114
+ "⚠️ Loaded cache has no AbstractCore CLI metadata; it may not include tool specs.\n"
1115
+ " Injecting current CLI tool definitions into the KV cache for recency."
1116
+ )
1117
+ self._kv_refresh_tools_if_needed(reason="cache load", force=force_refresh)
1118
+
393
1119
  def handle_compact(self, focus: Optional[str] = None):
394
1120
  """Handle /compact [focus] command - compact chat history with optional focus"""
395
1121
  messages = self.session.get_messages()
@@ -419,10 +1145,17 @@ class SimpleCLI:
419
1145
  start_time = time.time()
420
1146
 
421
1147
  # Perform in-place compaction with optional focus
422
- self.session.force_compact(
423
- preserve_recent=4, # Keep last 6 messages (3 exchanges)
424
- focus=focus or "key information and ongoing context"
1148
+ compacted = self.session.compact(
1149
+ preserve_recent=4, # Keep last 4 messages (2 exchanges)
1150
+ focus=focus or "key information and ongoing context",
1151
+ compact_provider=compact_provider,
1152
+ reason="user_requested",
425
1153
  )
1154
+ # Replace current session with compacted version (in-place).
1155
+ try:
1156
+ self.session._replace_with_compacted(compacted)
1157
+ except Exception:
1158
+ self.session = compacted
426
1159
 
427
1160
  duration = time.time() - start_time
428
1161
 
@@ -439,10 +1172,10 @@ class SimpleCLI:
439
1172
  else:
440
1173
  print(f" {i+1}. ⚙️ System prompt")
441
1174
  elif msg.role == 'user':
442
- preview = msg.content[:50] + "..." if len(msg.content) > 50 else msg.content
1175
+ preview = preview_text(msg.content, max_chars=50)
443
1176
  print(f" {i+1}. 👤 {preview}")
444
1177
  elif msg.role == 'assistant':
445
- preview = msg.content[:50] + "..." if len(msg.content) > 50 else msg.content
1178
+ preview = preview_text(msg.content, max_chars=50)
446
1179
  print(f" {i+1}. 🤖 {preview}")
447
1180
 
448
1181
  print(" 💡 Note: Token count may increase initially due to detailed summary")
@@ -463,7 +1196,7 @@ class SimpleCLI:
463
1196
  print("🔍 Extracting facts from conversation history...")
464
1197
 
465
1198
  # Create fact extractor using current provider for consistency
466
- extractor = BasicExtractor(self.provider)
1199
+ extractor = BasicExtractor(self._analysis_provider())
467
1200
 
468
1201
  # Format conversation history as text
469
1202
  conversation_text = self._format_conversation_for_extraction(messages)
@@ -539,7 +1272,7 @@ class SimpleCLI:
539
1272
  print("⚖️ Evaluating conversation quality...")
540
1273
 
541
1274
  # Create judge using current provider for consistency
542
- judge = BasicJudge(self.provider)
1275
+ judge = BasicJudge(self._analysis_provider())
543
1276
 
544
1277
  # Format conversation history as text
545
1278
  conversation_text = self._format_conversation_for_extraction(messages)
@@ -653,7 +1386,7 @@ class SimpleCLI:
653
1386
  print("🎯 Analyzing conversation intents for all participants...")
654
1387
 
655
1388
  # Create intent analyzer using current provider for consistency
656
- analyzer = BasicIntentAnalyzer(self.provider)
1389
+ analyzer = BasicIntentAnalyzer(self._analysis_provider())
657
1390
 
658
1391
  # Convert session messages to the format expected by intent analyzer
659
1392
  conversation_messages = [msg for msg in messages if msg.role != 'system']
@@ -717,7 +1450,7 @@ class SimpleCLI:
717
1450
  # Truncate long response approaches for readability
718
1451
  response_approach = analysis.suggested_response_approach
719
1452
  if len(response_approach) > 200:
720
- response_approach = response_approach[:197] + "..."
1453
+ response_approach = preview_text(response_approach, max_chars=200)
721
1454
  print(f" {response_approach}")
722
1455
 
723
1456
  # Analysis metadata
@@ -861,18 +1594,26 @@ class SimpleCLI:
861
1594
  break
862
1595
  else:
863
1596
  # No existing system message, add one at the beginning
864
- self.session.messages.insert(0, self.session.add_message('system', new_prompt))
1597
+ created = self.session.add_message('system', new_prompt)
1598
+ # add_message appends; move the created system message to the front for correct ordering.
1599
+ try:
1600
+ self.session.messages.remove(created)
1601
+ except Exception:
1602
+ pass
1603
+ self.session.messages.insert(0, created)
865
1604
 
866
1605
  print("✅ System prompt updated!")
867
1606
  print(f"📝 Old: {old_prompt[:100]}{'...' if len(old_prompt) > 100 else ''}")
868
1607
  print(f"📝 New: {new_prompt[:100]}{'...' if len(new_prompt) > 100 else ''}")
869
1608
 
1609
+ if self.prompt_cache_mode == "kv":
1610
+ self._print_warn("⚠️ KV prompt cache invalidated by system prompt change; clearing cache and context")
1611
+ self.handle_clear()
1612
+
870
1613
  def handle_save(self, filename: str, summary: bool = False, assessment: bool = False, facts: bool = False):
871
1614
  """Handle /save <file> command - save current session to file with optional analytics"""
872
1615
  try:
873
- # Ensure .json extension for consistency
874
- if not filename.endswith('.json'):
875
- filename = f"{filename}.json"
1616
+ filename = self._force_extension(filename, ".json")
876
1617
 
877
1618
  print(f"💾 Saving session to {filename}...")
878
1619
 
@@ -882,11 +1623,12 @@ class SimpleCLI:
882
1623
 
883
1624
  # Generate optional analytics if requested
884
1625
  analytics_generated = []
1626
+ analysis_provider = self._analysis_provider()
885
1627
 
886
1628
  if summary:
887
1629
  print(" 🔄 Generating summary...")
888
1630
  try:
889
- self.session.generate_summary(focus="key discussion points")
1631
+ self.session.generate_summary(focus="key discussion points", compact_provider=analysis_provider)
890
1632
  analytics_generated.append("summary")
891
1633
  print(" ✅ Summary generated")
892
1634
  except Exception as e:
@@ -894,20 +1636,38 @@ class SimpleCLI:
894
1636
 
895
1637
  if assessment:
896
1638
  print(" 🔄 Generating assessment...")
1639
+ original_provider = None
897
1640
  try:
1641
+ original_provider = self.session.provider
1642
+ self.session.provider = analysis_provider
898
1643
  self.session.generate_assessment()
1644
+ self.session.provider = original_provider
899
1645
  analytics_generated.append("assessment")
900
1646
  print(" ✅ Assessment generated")
901
1647
  except Exception as e:
1648
+ try:
1649
+ if original_provider is not None:
1650
+ self.session.provider = original_provider
1651
+ except Exception:
1652
+ pass
902
1653
  print(f" ⚠️ Assessment generation failed: {e}")
903
1654
 
904
1655
  if facts:
905
1656
  print(" 🔄 Extracting facts...")
1657
+ original_provider = None
906
1658
  try:
1659
+ original_provider = self.session.provider
1660
+ self.session.provider = analysis_provider
907
1661
  self.session.extract_facts()
1662
+ self.session.provider = original_provider
908
1663
  analytics_generated.append("facts")
909
1664
  print(" ✅ Facts extracted")
910
1665
  except Exception as e:
1666
+ try:
1667
+ if original_provider is not None:
1668
+ self.session.provider = original_provider
1669
+ except Exception:
1670
+ pass
911
1671
  print(f" ⚠️ Fact extraction failed: {e}")
912
1672
 
913
1673
  # Save using enhanced serialization
@@ -935,17 +1695,15 @@ class SimpleCLI:
935
1695
  def handle_load(self, filename: str):
936
1696
  """Handle /load <file> command - load session from file"""
937
1697
  try:
938
- # Ensure .json extension for consistency
939
- if not filename.endswith('.json'):
940
- filename = f"{filename}.json"
1698
+ resolved = self._resolve_session_path(filename) or self._force_extension(filename, ".json")
941
1699
 
942
1700
  # Check if file exists
943
1701
  import os
944
- if not os.path.exists(filename):
945
- print(f"❌ File not found: {filename}")
1702
+ if not os.path.exists(resolved):
1703
+ print(f"❌ File not found: {resolved}")
946
1704
  return
947
1705
 
948
- print(f"📂 Loading session from {filename}...")
1706
+ print(f"📂 Loading session from {resolved}...")
949
1707
 
950
1708
  # Store current session info for comparison
951
1709
  old_messages = len(self.session.get_messages())
@@ -955,17 +1713,27 @@ class SimpleCLI:
955
1713
  from ..tools.common_tools import list_files, read_file, write_file, execute_command, search_files
956
1714
  tools = [list_files, read_file, write_file, execute_command, search_files]
957
1715
 
958
- loaded_session = BasicSession.load(filename, provider=self.provider, tools=tools)
1716
+ loaded_session = BasicSession.load(resolved, provider=self.provider, tools=tools)
959
1717
 
960
1718
  # Replace current session
961
1719
  self.session = loaded_session
962
-
1720
+
1721
+ # If we're in local KV cache mode (MLX), rebuild the cache from the loaded transcript so
1722
+ # the model context matches what the user sees.
1723
+ if self._is_mlx_provider() and self._supports_prompt_cache():
1724
+ try:
1725
+ self.prompt_cache_mode = "kv"
1726
+ self._rebuild_kv_cache_from_session()
1727
+ except Exception as e:
1728
+ self._print_warn(f"⚠️ KV cache rebuild from session failed; continuing without KV mode: {e}")
1729
+ self.prompt_cache_mode = "key"
1730
+
963
1731
  # Get new session info
964
1732
  new_messages = len(self.session.get_messages())
965
1733
  new_tokens = self.session.get_token_estimate()
966
1734
 
967
1735
  print(f"✅ Session loaded successfully!")
968
- print(f" 📁 File: {filename}")
1736
+ print(f" 📁 File: {resolved}")
969
1737
  print(f" 📝 Messages: {old_messages} → {new_messages}")
970
1738
  print(f" 🔢 Tokens: ~{old_tokens:,} → ~{new_tokens:,}")
971
1739
  print(f" 🤖 Provider: {self.provider_name}:{self.model_name} (current)")
@@ -994,6 +1762,69 @@ class SimpleCLI:
994
1762
  import traceback
995
1763
  traceback.print_exc()
996
1764
 
1765
+ def _rebuild_kv_cache_from_session(self) -> None:
1766
+ """Best-effort rebuild of the local KV prompt cache from the current session transcript."""
1767
+ if not self._is_mlx_provider():
1768
+ return
1769
+ if not self._supports_prompt_cache():
1770
+ return
1771
+
1772
+ # Fresh cache key for the rebuilt state.
1773
+ try:
1774
+ getattr(self.provider, "prompt_cache_clear")(None)
1775
+ except Exception:
1776
+ pass
1777
+
1778
+ key = f"cli:{uuid.uuid4().hex[:12]}"
1779
+ ok = False
1780
+ try:
1781
+ ok = bool(getattr(self.provider, "prompt_cache_set")(key, make_default=True))
1782
+ except Exception:
1783
+ ok = False
1784
+
1785
+ if not ok:
1786
+ self.prompt_cache_mode = "off"
1787
+ self.prompt_cache_key = None
1788
+ raise RuntimeError("provider failed to create a prompt cache")
1789
+
1790
+ # Prefill stable modules.
1791
+ try:
1792
+ getattr(self.provider, "prompt_cache_update")(
1793
+ key,
1794
+ system_prompt=self.session.system_prompt,
1795
+ tools=self.session.tools,
1796
+ add_generation_prompt=False,
1797
+ )
1798
+ except Exception as e:
1799
+ raise RuntimeError(f"failed to prefill system/tools: {e}") from e
1800
+
1801
+ # Append any additional transcript messages (excluding the main system prompt we just prefixed).
1802
+ messages_to_append: List[Dict[str, Any]] = []
1803
+ for msg in self.session.get_messages():
1804
+ role = getattr(msg, "role", None)
1805
+ content = getattr(msg, "content", None)
1806
+ if role == "system":
1807
+ if isinstance(self.session.system_prompt, str) and content == self.session.system_prompt and not str(content).startswith("[CONVERSATION HISTORY]"):
1808
+ continue
1809
+ if role and content is not None:
1810
+ messages_to_append.append({"role": role, "content": content})
1811
+
1812
+ if messages_to_append:
1813
+ try:
1814
+ getattr(self.provider, "prompt_cache_update")(
1815
+ key,
1816
+ messages=messages_to_append,
1817
+ add_generation_prompt=False,
1818
+ )
1819
+ except Exception as e:
1820
+ raise RuntimeError(f"failed to append transcript messages: {e}") from e
1821
+
1822
+ self.prompt_cache_key = key
1823
+ self.prompt_cache_file = None
1824
+ self.prompt_cache_mode = "kv"
1825
+ print(f"🧠 KV prompt cache rebuilt from session (key={key}, messages={len(messages_to_append)})")
1826
+ self._kv_refresh_tools_if_needed(reason="session rebuild", force=False)
1827
+
997
1828
  def handle_tooltag_test(self, opening_tag: str, closing_tag: str):
998
1829
  """Handle /tooltag command - demonstrate tool call format handling"""
999
1830
  print(f"🏷️ Tool call format testing: {opening_tag}...{closing_tag}")
@@ -1010,6 +1841,29 @@ class SimpleCLI:
1010
1841
  print(f"🔧 Provider: {self.provider_name}")
1011
1842
  print(f"🤖 Model: {self.model_name}")
1012
1843
  print(f"🌊 Streaming: {'Enabled' if self.stream_mode else 'Disabled'}")
1844
+ thinking_label = "auto" if self.thinking is None else ("on" if self.thinking is True else "off" if self.thinking is False else str(self.thinking))
1845
+ print(f"🧠 Thinking: {thinking_label}")
1846
+ show_reasoning_label = "auto" if self.show_reasoning is None else ("on" if self.show_reasoning else "off")
1847
+ print(f"🧠 Show reasoning: {show_reasoning_label}")
1848
+ if self.prompt_cache_mode != "off":
1849
+ cache_details = f"mode={self.prompt_cache_mode}"
1850
+ if self.prompt_cache_key:
1851
+ cache_details += f" key={self.prompt_cache_key}"
1852
+ if self.prompt_cache_file:
1853
+ cache_details += f" file={self.prompt_cache_file}"
1854
+ print(f"🧠 Prompt caching: {cache_details}")
1855
+ try:
1856
+ if hasattr(self.provider, "get_prompt_cache_stats"):
1857
+ stats = self.provider.get_prompt_cache_stats()
1858
+ if isinstance(stats, dict):
1859
+ entries = stats.get("entries")
1860
+ max_entries = stats.get("max_entries")
1861
+ if entries is not None and max_entries is not None:
1862
+ print(f" Cache store: {entries}/{max_entries} entries")
1863
+ except Exception:
1864
+ pass
1865
+ else:
1866
+ print("🧠 Prompt caching: off")
1013
1867
 
1014
1868
  # Debug status - show both CLI and system logging
1015
1869
  print(f"🐛 CLI Debug: {'Enabled' if self.debug_mode else 'Disabled'}")
@@ -1035,7 +1889,8 @@ class SimpleCLI:
1035
1889
 
1036
1890
  # Token usage
1037
1891
  current_tokens = self.session.get_token_estimate()
1038
- print(f"💾 Token Usage: {current_tokens:,} / {self.max_tokens:,} tokens ({(current_tokens/self.max_tokens*100):.1f}%)")
1892
+ print(f"💾 Context Usage: {current_tokens:,} / {self.max_tokens:,} tokens ({(current_tokens/self.max_tokens*100):.1f}%)")
1893
+ print(f"✍️ Max Output Tokens: {self.max_output_tokens:,}")
1039
1894
 
1040
1895
  # Model capabilities
1041
1896
  try:
@@ -1050,6 +1905,11 @@ class SimpleCLI:
1050
1905
  print(f" Vision Support: {'Yes' if capabilities.get('vision_support', False) else 'No'}")
1051
1906
  print(f" Audio Support: {'Yes' if capabilities.get('audio_support', False) else 'No'}")
1052
1907
  print(f" Thinking Support: {'Yes' if capabilities.get('thinking_support', False) else 'No'}")
1908
+ reasoning_levels = capabilities.get("reasoning_levels")
1909
+ if isinstance(reasoning_levels, list) and reasoning_levels:
1910
+ levels_str = ", ".join([str(x) for x in reasoning_levels if isinstance(x, str) and x.strip()])
1911
+ if levels_str:
1912
+ print(f" Reasoning Levels: {levels_str}")
1053
1913
 
1054
1914
  # Show aliases if any
1055
1915
  aliases = capabilities.get('aliases', [])
@@ -1129,23 +1989,37 @@ class SimpleCLI:
  if not clean_input and media_files:
  clean_input = "Please analyze the attached file(s)."

+ clean_input = self._timestamp_user_message(clean_input)
+
  if self.debug_mode:
  print(f"🔍 Sending to {self.provider_name}:{self.model_name}")
  if media_files:
  print(f"🔍 Media files: {media_files}")

- # Generate response with media support
- response = self.session.generate(
- clean_input,
- stream=self.stream_mode,
- media=media_files if media_files else None
- )
+ if self.prompt_cache_mode == "kv":
+ response = self._generate_response_kv(
+ clean_input,
+ media=media_files if media_files else None,
+ )
+ else:
+ # Generate response with media support (session-managed history)
+ gen_kwargs: Dict[str, Any] = {
+ "stream": self.stream_mode,
+ "media": media_files if media_files else None,
+ "max_output_tokens": self.max_output_tokens,
+ }
+ if self.thinking is not None:
+ gen_kwargs["thinking"] = self.thinking
+ response = self.session.generate(clean_input, **gen_kwargs)

  if self.stream_mode:
- if not self.single_prompt_mode:
+ show_reasoning = self._should_show_reasoning() and not self.single_prompt_mode
+ buffer_for_reasoning_first = self._should_buffer_stream_for_reasoning_first()
+ if not self.single_prompt_mode and not buffer_for_reasoning_first:
  print("🤖 Assistant: ", end="", flush=True)
  full_content = ""
  display_buffer = "" # Buffer for cleaned display content
+ reasoning_parts: List[str] = []

  for chunk in response:
  if hasattr(chunk, 'content') and chunk.content:
@@ -1170,17 +2044,34 @@ class SimpleCLI:
  '```tool_code'
  ])

- if not has_tool_marker:
- print(chunk_text, end="", flush=True)
+ # If we want reasoning-first display, buffer output (no live streaming).
+ if buffer_for_reasoning_first:
  display_buffer += chunk_text
  else:
- # Buffer the chunk, we'll process after streaming
- display_buffer += chunk_text
+ if not has_tool_marker:
+ print(chunk_text, end="", flush=True)
+ display_buffer += chunk_text
+ else:
+ # Buffer the chunk, we'll process after streaming
+ display_buffer += chunk_text
+
+ # Best-effort: capture streamed reasoning metadata (OpenAI-compatible deltas, etc.).
+ if hasattr(chunk, "metadata") and isinstance(getattr(chunk, "metadata"), dict):
+ r = chunk.metadata.get("reasoning")
+ if isinstance(r, str) and r.strip():
+ reasoning_parts.append(r.strip())

- print() # New line after streaming
+ if not buffer_for_reasoning_first:
+ print() # New line after streaming

  # Parse and execute tool calls from full content
  clean_content, tool_calls = self._parse_and_strip_tool_calls(full_content)
+ if self.prompt_cache_mode == "kv":
+ # Maintain transcript for UX; model context lives in KV cache.
+ try:
+ self.session.add_message("assistant", clean_content.strip() or full_content)
+ except Exception:
+ pass

  # If we buffered tool call content, we should have shown clean content
  # For now, if there's significant difference, show the clean version
@@ -1189,12 +2080,38 @@ class SimpleCLI:
  # This happens when tool calls appear mid-stream
  if self.debug_mode:
  print(f"\n🔍 Cleaned content differs from streamed content")
-
+
+ combined = "\n\n".join(reasoning_parts).strip() if reasoning_parts else ""
+ if show_reasoning and combined:
+ self._print_reasoning_block(combined)
+
+ # Reasoning-first UX: show the final answer after reasoning (buffered).
+ if buffer_for_reasoning_first:
+ if clean_content.strip():
+ print(f"🤖 Assistant: {clean_content}")
+ elif tool_calls and not self.single_prompt_mode:
+ print("🤖 Assistant: ", end="")
+ elif self.single_prompt_mode:
+ print(clean_content or full_content)
+ else:
+ print(f"🤖 Assistant: {clean_content or full_content}")
+
  self._execute_tool_calls(tool_calls)
  else:
  # Non-streaming: parse content, display clean version, execute tools
  clean_content, tool_calls = self._parse_and_strip_tool_calls(response.content)
+ if self.prompt_cache_mode == "kv":
+ try:
+ self.session.add_message("assistant", clean_content.strip() or response.content)
+ except Exception:
+ pass

+ meta = getattr(response, "metadata", None)
+ if self._should_show_reasoning() and not self.single_prompt_mode and isinstance(meta, dict):
+ r = meta.get("reasoning")
+ if isinstance(r, str) and r.strip():
+ self._print_reasoning_block(r.strip())
+
  # Display only the clean content (without tool call syntax)
  if clean_content.strip():
  if self.single_prompt_mode:
@@ -1204,14 +2121,14 @@ class SimpleCLI:
  elif tool_calls:
  # Only tool calls, no text response
  if not self.single_prompt_mode:
- print(f"🤖 Assistant: ", end="")
+ print("🤖 Assistant: ", end="")
  else:
  # Empty response
  if self.single_prompt_mode:
  print(response.content)
  else:
  print(f"🤖 Assistant: {response.content}")
-
+
  # Execute tool calls
  self._execute_tool_calls(tool_calls)

@@ -1227,6 +2144,96 @@ class SimpleCLI:
  import traceback
  traceback.print_exc()

+ def _should_show_reasoning(self) -> bool:
+ """Decide whether to display reasoning in the CLI output."""
+ if self.show_reasoning is not None:
+ return bool(self.show_reasoning)
+ # Auto: show when present unless explicitly disabled.
+ if self.thinking is False:
+ return False
+ return True
+
+ def _should_buffer_stream_for_reasoning_first(self) -> bool:
+ """Decide whether to buffer streaming output to show reasoning before the answer."""
+ if self.single_prompt_mode:
+ return False
+ if not self._should_show_reasoning():
+ return False
+
+ # If the user explicitly enabled reasoning display or requested thinking, honor reasoning-first UX.
+ if self.show_reasoning is True:
+ return True
+ if self.thinking is not None and self.thinking is not False:
+ return True
+
+ # Auto mode: only buffer when the model is expected to emit a separate reasoning channel.
+ try:
+ from ..architectures.detection import detect_architecture, get_architecture_format, get_model_capabilities
+
+ caps = get_model_capabilities(self.model_name)
+ arch = detect_architecture(self.model_name)
+ arch_fmt = get_architecture_format(arch)
+ except Exception:
+ caps = {}
+ arch_fmt = {}
+
+ resp_fmt = str((caps or {}).get("response_format") or "").strip().lower()
+ if resp_fmt == "harmony":
+ return True
+
+ for src in (caps, arch_fmt):
+ if isinstance(src, dict):
+ f = src.get("thinking_output_field")
+ if isinstance(f, str) and f.strip():
+ return True
+
+ return False
+
+ def _print_reasoning_block(self, reasoning: str) -> None:
+ """Print reasoning in a visually distinct style (best-effort)."""
+ import sys
+
+ text = reasoning.strip()
+ if not text:
+ return
+
+ print("🧠 Reasoning:")
+ if sys.stdout.isatty():
+ # Grey + italic (best-effort; not all terminals support italics).
+ print(f"\x1b[90m\x1b[3m{text}\x1b[0m")
+ else:
+ print(text)
+
+ def _generate_response_kv(self, prompt: str, *, media: Optional[list] = None):
+ """Generate response using append-only KV cache mode (local providers only)."""
+ # Maintain a local transcript for UX, but do not send it to the model; the KV cache is source-of-truth.
+ try:
+ self.session.add_message("user", prompt)
+ except Exception:
+ pass
+
+ gen_kwargs: Dict[str, Any] = {
+ "prompt": prompt,
+ "messages": None,
+ "system_prompt": None,
+ "tools": None, # tools were prefixed into the cache during prefill
+ "media": media,
+ "stream": bool(self.stream_mode),
+ "max_output_tokens": self.max_output_tokens,
+ }
+ if self.thinking is not None:
+ gen_kwargs["thinking"] = self.thinking
+ # Preserve session-level generation parameters for consistency.
+ try:
+ if getattr(self.session, "temperature", None) is not None:
+ gen_kwargs["temperature"] = self.session.temperature
+ if isinstance(getattr(self.session, "seed", None), int) and self.session.seed >= 0:
+ gen_kwargs["seed"] = self.session.seed
+ except Exception:
+ pass
+
+ return self.provider.generate(**gen_kwargs)
+
  def _parse_and_strip_tool_calls(self, content: str):
  """
  Parse tool calls from content and return (clean_content, tool_calls).
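Note: the streaming hunks above accumulate answer text from chunk.content and reasoning text from a chunk.metadata["reasoning"] string, then print the reasoning before the answer when buffering is enabled. A simplified, self-contained sketch of that separation; the Chunk dataclass below is a stand-in for illustration, not the package's actual chunk type:

    from dataclasses import dataclass, field
    from typing import Any, Dict, List, Tuple

    @dataclass
    class Chunk:  # stand-in for a streamed chunk (assumption, not the real type)
        content: str = ""
        metadata: Dict[str, Any] = field(default_factory=dict)

    def split_stream(chunks: List[Chunk]) -> Tuple[str, str]:
        answer_parts: List[str] = []
        reasoning_parts: List[str] = []
        for chunk in chunks:
            if chunk.content:
                answer_parts.append(chunk.content)
            r = chunk.metadata.get("reasoning")
            if isinstance(r, str) and r.strip():
                reasoning_parts.append(r.strip())
        return "".join(answer_parts), "\n\n".join(reasoning_parts)

    answer, reasoning = split_stream([
        Chunk("Hello", {"reasoning": "User greeted; reply briefly."}),
        Chunk(", world."),
    ])
    print(reasoning)  # shown before the answer in the reasoning-first flow
    print(answer)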
@@ -1337,7 +2344,7 @@ class SimpleCLI:
  if not self.single_prompt_mode:
  args_str = str(tool_args) if tool_args else "{}"
  if len(args_str) > 100:
- args_str = args_str[:97] + "..."
+ args_str = preview_text(args_str, max_chars=100)
  print(f"**{tool_name}({args_str})**")

  # Execute the tool
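Note: preview_text() is imported from elsewhere in the package and its implementation is not part of this diff. Assuming it is an ellipsis-truncating helper, a sketch with the same call shape would reproduce the old inline behavior ([:97] + "...") that this hunk replaces:

    # Assumed behavior only; the real preview_text() may differ (e.g., word-aware truncation).
    def preview_text(text: str, max_chars: int = 100) -> str:
        return text if len(text) <= max_chars else text[: max_chars - 3] + "..."

    print(preview_text("x" * 150, max_chars=100))  # 97 x's followed by "..."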
@@ -1435,14 +2442,18 @@ def main():
  epilog="""
  Examples:
  python -m abstractcore.utils.cli --provider ollama --model qwen3-coder:30b
- python -m abstractcore.utils.cli --provider openai --model gpt-4o-mini --stream
- python -m abstractcore.utils.cli --provider anthropic --model claude-3-5-haiku-20241022
+ python -m abstractcore.utils.cli --provider openai --model gpt-5-mini --stream
+ python -m abstractcore.utils.cli --provider anthropic --model claude-haiku-4-5
+ python -m abstractcore.utils.cli --provider lmstudio --model qwen/qwen3-4b-2507 --base-url http://localhost:1234/v1
+ python -m abstractcore.utils.cli --provider openrouter --model openai/gpt-4o-mini
  python -m abstractcore.utils.cli --prompt "What is Python?" # Uses configured defaults

  Key Commands:
  /help Show comprehensive command guide
- /save <file> [--summary --assessment --facts] Save session with analytics
- /load <file> Load saved session
+ /session save <name> [--summary --assessment --facts] Save session JSON (writes .json)
+ /session load <name> Load saved session JSON (reads .json)
+ /cache save <name> Save MLX prompt/KV cache (writes .safetensors)
+ /cache load <name> Load MLX prompt/KV cache (reads .safetensors)
  /status Show system status and capabilities
  /history [n] Show conversation history
  /model <provider:model> Switch LLM provider/model
@@ -1471,18 +2482,19 @@ build custom solutions using the AbstractCore framework directly.

  # Optional arguments (no longer required - will use configured defaults)
  parser.add_argument('--provider',
- choices=['openai', 'anthropic', 'ollama', 'huggingface', 'mlx', 'lmstudio'],
+ choices=['openai', 'anthropic', 'openrouter', 'openai-compatible', 'vllm', 'ollama', 'huggingface', 'mlx', 'lmstudio'],
  help='LLM provider to use (optional - uses configured default)')
  parser.add_argument('--model', help='Model name to use (optional - uses configured default)')

  # Optional arguments
  parser.add_argument('--stream', action='store_true', help='Enable streaming mode')
  parser.add_argument('--debug', action='store_true', help='Enable debug mode')
- parser.add_argument('--max-tokens', type=int, default=None, help='Maximum tokens (default: auto-detect from model capabilities)')
+ parser.add_argument('--max-tokens', type=int, default=None, help='Maximum total context tokens (default: auto-detect from model capabilities)')
+ parser.add_argument('--max-output-tokens', type=int, default=None, help='Maximum output tokens per response (default: provider/model default)')
  parser.add_argument('--prompt', help='Execute single prompt and exit')

  # Provider-specific
- parser.add_argument('--base-url', help='Base URL (ollama, lmstudio)')
+ parser.add_argument('--base-url', help='Base URL override (OpenAI-compatible /v1 servers, proxies, Ollama)')
  parser.add_argument('--api-key', help='API key')
  parser.add_argument('--temperature', type=float, default=0.7, help='Temperature (default: 0.7)')

@@ -1554,6 +2566,7 @@ build custom solutions using the AbstractCore framework directly.
  model=model,
  stream=stream_mode,
  max_tokens=args.max_tokens,
+ max_output_tokens=args.max_output_tokens,
  debug=args.debug,
  show_banner=not args.prompt, # Hide banner in single-prompt mode
  **kwargs
@@ -1567,4 +2580,4 @@ build custom solutions using the AbstractCore framework directly.


  if __name__ == "__main__":
- main()
+ main()