cortex-llm 1.0.7__tar.gz → 1.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/PKG-INFO +5 -1
  2. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/README.md +4 -0
  3. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/__init__.py +1 -1
  4. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/inference_engine.py +48 -8
  5. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/mlx_converter.py +105 -4
  6. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/ui/cli.py +231 -124
  7. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/ui/markdown_render.py +9 -0
  8. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/PKG-INFO +5 -1
  9. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/pyproject.toml +1 -1
  10. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/setup.py +1 -1
  11. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/LICENSE +0 -0
  12. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/__main__.py +0 -0
  13. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/config.py +0 -0
  14. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/conversation_manager.py +0 -0
  15. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/fine_tuning/__init__.py +0 -0
  16. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/fine_tuning/dataset.py +0 -0
  17. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/fine_tuning/mlx_lora_trainer.py +0 -0
  18. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/fine_tuning/trainer.py +0 -0
  19. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/fine_tuning/wizard.py +0 -0
  20. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/gpu_validator.py +0 -0
  21. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/__init__.py +0 -0
  22. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/gpu_validator.py +0 -0
  23. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/memory_pool.py +0 -0
  24. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/mlx_accelerator.py +0 -0
  25. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/mlx_compat.py +0 -0
  26. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/mps_optimizer.py +0 -0
  27. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/optimizer.py +0 -0
  28. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/performance_profiler.py +0 -0
  29. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/model_downloader.py +0 -0
  30. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/model_manager.py +0 -0
  31. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/quantization/__init__.py +0 -0
  32. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/quantization/dynamic_quantizer.py +0 -0
  33. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/__init__.py +0 -0
  34. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/auto_detector.py +0 -0
  35. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/config_manager.py +0 -0
  36. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/interactive.py +0 -0
  37. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/registry.py +0 -0
  38. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/__init__.py +0 -0
  39. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/base.py +0 -0
  40. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/complex/__init__.py +0 -0
  41. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/complex/reasoning.py +0 -0
  42. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/__init__.py +0 -0
  43. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/alpaca.py +0 -0
  44. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/chatml.py +0 -0
  45. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/gemma.py +0 -0
  46. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/llama.py +0 -0
  47. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/simple.py +0 -0
  48. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/ui/__init__.py +0 -0
  49. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/ui/terminal_app.py +0 -0
  50. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/SOURCES.txt +0 -0
  51. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/dependency_links.txt +0 -0
  52. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/entry_points.txt +0 -0
  53. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/not-zip-safe +0 -0
  54. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/requires.txt +0 -0
  55. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/top_level.txt +0 -0
  56. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/setup.cfg +0 -0
  57. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/tests/test_apple_silicon.py +0 -0
  58. {cortex_llm-1.0.7 → cortex_llm-1.0.9}/tests/test_metal_optimization.py +0 -0

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: cortex-llm
- Version: 1.0.7
+ Version: 1.0.9
  Summary: GPU-Accelerated LLM Terminal for Apple Silicon
  Home-page: https://github.com/faisalmumtaz/Cortex
  Author: Cortex Development Team
@@ -131,6 +131,10 @@ Cortex supports:
  - `docs/template-registry.md`
  - **Inference engine details** and backend behavior
  - `docs/inference-engine.md`
+ - **Tooling (experimental, WIP)** for repo-scoped read/search and optional file edits with explicit confirmation
+ - `docs/cli.md`
+
+ **Important (Work in Progress):** Tooling is actively evolving and should be considered experimental. Behavior, output format, and available actions may change; tool calls can fail; and UI presentation may be adjusted. Use tooling on non-critical work first, and always review any proposed file changes before approving them.

  ## Configuration

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/README.md
@@ -73,6 +73,10 @@ Cortex supports:
  - `docs/template-registry.md`
  - **Inference engine details** and backend behavior
  - `docs/inference-engine.md`
+ - **Tooling (experimental, WIP)** for repo-scoped read/search and optional file edits with explicit confirmation
+ - `docs/cli.md`
+
+ **Important (Work in Progress):** Tooling is actively evolving and should be considered experimental. Behavior, output format, and available actions may change; tool calls can fail; and UI presentation may be adjusted. Use tooling on non-critical work first, and always review any proposed file changes before approving them.

  ## Configuration

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/__init__.py
@@ -5,7 +5,7 @@ A high-performance terminal interface for running Hugging Face LLMs locally
  with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
  """

- __version__ = "1.0.7"
+ __version__ = "1.0.9"
  __author__ = "Cortex Development Team"
  __license__ = "MIT"

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/inference_engine.py
@@ -243,6 +243,33 @@ class InferenceEngine:
  tokens_generated = 0
  first_token_time = None
  last_metrics_update = time.time()
+ stream_total_text = ""
+ stream_cumulative = False
+
+ def normalize_stream_chunk(chunk: Any) -> str:
+ """Normalize streaming output to delta chunks when backend yields cumulative text."""
+ nonlocal stream_total_text, stream_cumulative
+ if chunk is None:
+ return ""
+ if not isinstance(chunk, str):
+ chunk = str(chunk)
+
+ if stream_cumulative:
+ if chunk.startswith(stream_total_text):
+ delta = chunk[len(stream_total_text):]
+ stream_total_text = chunk
+ return delta
+ stream_total_text += chunk
+ return chunk
+
+ if stream_total_text and len(chunk) > len(stream_total_text) and chunk.startswith(stream_total_text):
+ stream_cumulative = True
+ delta = chunk[len(stream_total_text):]
+ stream_total_text = chunk
+ return delta
+
+ stream_total_text += chunk
+ return chunk

  try:
  # Use MLX accelerator's optimized generation if available
@@ -262,10 +289,14 @@ class InferenceEngine:
  if self._cancel_event.is_set():
  self.status = InferenceStatus.CANCELLED
  break
-
+
+ delta = normalize_stream_chunk(token) if request.stream else str(token)
+ if not delta:
+ continue
+
  if first_token_time is None:
  first_token_time = time.time() - start_time
-
+
  tokens_generated += 1

  # Update metrics less frequently
@@ -284,13 +315,18 @@ class InferenceEngine:
  last_metrics_update = current_time

  # Token is already a string from generate_optimized
- yield token
+ yield delta

  if any(stop in token for stop in request.stop_sequences):
  break
  elif mlx_generate:
  # Fallback to standard MLX generation
- logger.info("Using standard MLX generation")
+ if request.stream and mlx_stream_generate:
+ logger.info("Using MLX streaming generation")
+ generate_fn = mlx_stream_generate
+ else:
+ logger.info("Using standard MLX generation")
+ generate_fn = mlx_generate

  # Import sample_utils for creating sampler
  try:
@@ -314,7 +350,7 @@ class InferenceEngine:
  if request.seed is not None and request.seed >= 0:
  mx.random.seed(request.seed)

- for response in mlx_generate(
+ for response in generate_fn(
  model,
  tokenizer,
  **generation_kwargs
@@ -328,10 +364,14 @@ class InferenceEngine:
  token = response.text
  else:
  token = str(response)
-
+
+ delta = normalize_stream_chunk(token) if request.stream else token
+ if request.stream and not delta:
+ continue
+
  if first_token_time is None:
  first_token_time = time.time() - start_time
-
+
  tokens_generated += 1

  # Update metrics less frequently to reduce overhead
@@ -352,7 +392,7 @@ class InferenceEngine:
  )
  last_metrics_update = current_time

- yield token
+ yield delta

  if any(stop in token for stop in request.stop_sequences):
  break
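
The inference_engine.py changes above make the engine tolerant of backends that stream either incremental deltas or the full cumulative text on every step: `normalize_stream_chunk` detects the cumulative case and yields only the new suffix. A minimal standalone sketch of that detection heuristic (illustrative only; the real helper keeps its state in `nonlocal` variables inside `generate`):

```python
# Standalone sketch of the cumulative-vs-delta detection used above. State
# lives on a small class here instead of the nonlocal variables used inside
# InferenceEngine.generate; all names are illustrative.
class StreamNormalizer:
    def __init__(self) -> None:
        self.total = ""          # all text seen so far
        self.cumulative = False  # flips once cumulative output is detected

    def push(self, chunk) -> str:
        if chunk is None:
            return ""
        chunk = chunk if isinstance(chunk, str) else str(chunk)
        if self.cumulative:
            if chunk.startswith(self.total):
                delta = chunk[len(self.total):]
                self.total = chunk
                return delta
            self.total += chunk
            return chunk
        # Heuristic: a chunk that extends everything seen so far means the
        # backend is re-sending the full text on every step.
        if self.total and len(chunk) > len(self.total) and chunk.startswith(self.total):
            self.cumulative = True
            delta = chunk[len(self.total):]
            self.total = chunk
            return delta
        self.total += chunk
        return chunk


n = StreamNormalizer()
print([n.push(c) for c in ["Hel", "Hello", "Hello wor", "Hello world"]])
# -> ['Hel', 'lo', ' wor', 'ld']: cumulative input collapses to per-step deltas
```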

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/metal/mlx_converter.py
@@ -66,9 +66,22 @@ class MLXConverter:
  self.cache_dir.mkdir(parents=True, exist_ok=True)
  self.conversion_cache = self.cache_dir / "conversion_cache.json"
  self._load_conversion_cache()
+ self._warned_mlx_lm_compat = False

  logger.info(f"MLX Converter initialized with cache dir: {self.cache_dir}")
  logger.info(f"MLX LM available: {mlx_utils is not None and load is not None}")
+
+ def _warn_mlx_lm_compat(self, missing: str) -> None:
+ """Warn once when mlx-lm is missing newer helper APIs."""
+ if self._warned_mlx_lm_compat:
+ return
+ self._warned_mlx_lm_compat = True
+ message = (
+ f"[WARN] mlx-lm is missing '{missing}'. Using compatibility fallback. "
+ "For best support, upgrade mlx-lm to a newer version."
+ )
+ logger.warning(message)
+ print(message)

  def _load_conversion_cache(self) -> None:
  """Load conversion cache metadata."""
@@ -206,6 +219,83 @@ class MLXConverter:

  return download_dir

+ def _mlx_get_model_path(self, source_path: Path) -> Tuple[Path, Optional[str]]:
+ """Resolve model path with MLX LM compatibility fallbacks."""
+ if mlx_utils is not None and hasattr(mlx_utils, "get_model_path"):
+ return mlx_utils.get_model_path(str(source_path))
+ self._warn_mlx_lm_compat("get_model_path")
+
+ # Fallback: local path or direct HF download.
+ model_path = Path(source_path)
+ if model_path.exists():
+ hf_repo = None
+ try:
+ from huggingface_hub import ModelCard
+
+ card_path = model_path / "README.md"
+ if card_path.is_file():
+ card = ModelCard.load(card_path)
+ hf_repo = getattr(card.data, "base_model", None)
+ except Exception:
+ hf_repo = None
+ return model_path, hf_repo
+
+ try:
+ model_path = Path(
+ snapshot_download(
+ str(source_path),
+ allow_patterns=[
+ "*.json",
+ "model*.safetensors",
+ "*.py",
+ "tokenizer.model",
+ "*.tiktoken",
+ "tiktoken.model",
+ "*.txt",
+ "*.jsonl",
+ "*.jinja",
+ ],
+ )
+ )
+ except Exception as e:
+ raise RuntimeError(f"Failed to download model from Hugging Face: {e}") from e
+
+ return model_path, str(source_path)
+
+ def _mlx_fetch_from_hub(
+ self,
+ model_path: Path,
+ trust_remote_code: bool = False
+ ) -> Tuple[Any, Dict[str, Any], Any]:
+ """Fetch model/config/tokenizer with MLX LM compatibility fallbacks."""
+ if mlx_utils is not None and hasattr(mlx_utils, "fetch_from_hub"):
+ return mlx_utils.fetch_from_hub(
+ model_path,
+ lazy=True,
+ trust_remote_code=trust_remote_code
+ )
+ self._warn_mlx_lm_compat("fetch_from_hub")
+
+ if mlx_utils is not None and hasattr(mlx_utils, "load_model") and hasattr(mlx_utils, "load_tokenizer"):
+ model, model_config = mlx_utils.load_model(model_path, lazy=True)
+ try:
+ tokenizer = mlx_utils.load_tokenizer(
+ model_path,
+ eos_token_ids=model_config.get("eos_token_id", None),
+ tokenizer_config_extra={"trust_remote_code": trust_remote_code},
+ )
+ except TypeError:
+ tokenizer = mlx_utils.load_tokenizer(
+ model_path,
+ eos_token_ids=model_config.get("eos_token_id", None),
+ )
+ return model, model_config, tokenizer
+
+ raise RuntimeError(
+ "mlx_lm.utils is missing required helpers (fetch_from_hub/load_model). "
+ "Upgrade mlx-lm to a newer version."
+ )
+
  def _requires_sentencepiece(self, model_path: Path) -> bool:
  """Return True if the model likely needs SentencePiece."""
  # If a fast tokenizer is present, SentencePiece should not be required.
@@ -379,10 +469,17 @@ class MLXConverter:
  # Build quantization configuration
  quantize_config = self._build_quantization_config(config)

- model_path, hf_repo = mlx_utils.get_model_path(str(source_path))
- model, model_config, tokenizer = mlx_utils.fetch_from_hub(
- model_path, lazy=True, trust_remote_code=False
- )
+ try:
+ model_path, hf_repo = self._mlx_get_model_path(Path(source_path))
+ except Exception as e:
+ return False, f"Model path resolution failed: {e}", None
+
+ try:
+ model, model_config, tokenizer = self._mlx_fetch_from_hub(
+ model_path, trust_remote_code=False
+ )
+ except Exception as e:
+ return False, f"Model fetch failed: {e}", None

  dtype = model_config.get("torch_dtype", None)
  if dtype in ["float16", "bfloat16", "float32"]:
@@ -398,6 +495,8 @@ class MLXConverter:
  model.update(tree_map_with_path(set_dtype, model.parameters()))

  if config.quantization != QuantizationRecipe.NONE:
+ if mlx_utils is None or not hasattr(mlx_utils, "quantize_model"):
+ return False, "MLX LM quantize_model not available; upgrade mlx-lm.", None
  quant_predicate = None
  if quantize_config and "quant_predicate" in quantize_config:
  quant_predicate = quantize_config["quant_predicate"]
@@ -411,6 +510,8 @@ class MLXConverter:
  )

  normalized_hf_repo = self._normalize_hf_repo(hf_repo)
+ if mlx_utils is None or not hasattr(mlx_utils, "save"):
+ return False, "MLX LM save() not available; upgrade mlx-lm.", None
  mlx_utils.save(output_path, model_path, model, tokenizer, model_config, hf_repo=normalized_hf_repo)
  logger.info("MLX conversion completed")
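
The mlx_converter.py changes above stop assuming a recent mlx-lm: the converter feature-detects the `mlx_lm.utils` helpers it relies on (`get_model_path`, `fetch_from_hub`, `quantize_model`, `save`, with `load_model`/`load_tokenizer` as a secondary fallback), warns once, and then either falls back or fails with a clear message. A hedged sketch of that capability probe, with helper names taken from the diff and the import path assumed to match what the converter already uses:

```python
# Feature-detect the mlx_lm.utils helpers needed for conversion; a sketch of
# the hasattr-based compatibility pattern shown in the diff, not the package's
# exact code.
try:
    from mlx_lm import utils as mlx_utils  # assumed import path; absent if mlx-lm is not installed
except ImportError:
    mlx_utils = None

REQUIRED_HELPERS = ("get_model_path", "fetch_from_hub", "quantize_model", "save")


def missing_mlx_helpers() -> list:
    """Return the helper names this mlx-lm install does not provide."""
    if mlx_utils is None:
        return list(REQUIRED_HELPERS)
    return [name for name in REQUIRED_HELPERS if not hasattr(mlx_utils, name)]


missing = missing_mlx_helpers()
if missing:
    print(f"[WARN] mlx-lm is missing {', '.join(missing)}; compatibility fallbacks will be used.")
```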
 

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/ui/cli.py
@@ -18,6 +18,7 @@ from textwrap import wrap

  from rich.live import Live
  from rich.style import Style
+ from rich.text import Text


  logger = logging.getLogger(__name__)
@@ -30,6 +31,8 @@ from cortex.conversation_manager import ConversationManager, MessageRole
  from cortex.model_downloader import ModelDownloader
  from cortex.template_registry import TemplateRegistry
  from cortex.fine_tuning import FineTuneWizard
+ from cortex.tools import ToolRunner
+ from cortex.tools import protocol as tool_protocol
  from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable, render_plain_with_think


@@ -58,6 +61,11 @@ class CortexCLI:

  # Initialize fine-tuning wizard
  self.fine_tune_wizard = FineTuneWizard(model_manager, config)
+
+ # Tooling support (always enabled)
+ self.tool_runner = ToolRunner(Path.cwd())
+ self.tool_runner.set_confirm_callback(self._confirm_tool_change)
+ self.max_tool_iterations = 4


  self.running = True
@@ -132,6 +140,86 @@ class CortexCLI:
  # Don't call sys.exit() here - let the main loop exit naturally
  # This prevents traceback from the parent process
  print("\n", file=sys.stderr) # Just add a newline for cleaner output
+
+ def _confirm_tool_change(self, prompt: str) -> bool:
+ """Prompt user to approve a tool-driven change."""
+ print("\n" + prompt)
+ response = input("Apply change? [y/N]: ").strip().lower()
+ return response in {"y", "yes"}
+
+ def _ensure_tool_instructions(self) -> None:
+ """Inject tool instructions into the conversation once."""
+ conversation = self.conversation_manager.get_current_conversation()
+ if conversation is None:
+ conversation = self.conversation_manager.new_conversation()
+ marker = "[CORTEX_TOOL_INSTRUCTIONS v2]"
+ for message in conversation.messages:
+ if message.role == MessageRole.SYSTEM and marker in message.content:
+ return
+ self.conversation_manager.add_message(MessageRole.SYSTEM, self.tool_runner.tool_instructions())
+
+ def _summarize_tool_call(self, call: dict) -> str:
+ name = str(call.get("name", "tool"))
+ args = call.get("arguments") or {}
+ parts = []
+ preferred = ("path", "query", "anchor", "start_line", "end_line", "recursive", "max_results")
+ for key in preferred:
+ if key in args:
+ value = args[key]
+ if isinstance(value, str) and len(value) > 60:
+ value = value[:57] + "..."
+ parts.append(f"{key}={value!r}")
+ if not parts and args:
+ for key in list(args.keys())[:3]:
+ value = args[key]
+ if isinstance(value, str) and len(value) > 60:
+ value = value[:57] + "..."
+ parts.append(f"{key}={value!r}")
+ arg_str = ", ".join(parts)
+ return f"{name}({arg_str})" if arg_str else f"{name}()"
+
+ def _summarize_tool_result(self, result: dict) -> str:
+ name = str(result.get("name", "tool"))
+ if not result.get("ok", False):
+ error = result.get("error") or "unknown error"
+ return f"{name} -> error: {error}"
+ payload = result.get("result") or {}
+ if name == "list_dir":
+ entries = payload.get("entries") or []
+ return f"{name} -> entries={len(entries)}"
+ if name == "search":
+ matches = payload.get("results") or []
+ return f"{name} -> results={len(matches)}"
+ if name == "read_file":
+ path = payload.get("path") or ""
+ start = payload.get("start_line")
+ end = payload.get("end_line")
+ if start and end:
+ return f"{name} -> {path} lines {start}-{end}"
+ if start:
+ return f"{name} -> {path} from line {start}"
+ return f"{name} -> {path}"
+ if name in {"write_file", "create_file", "delete_file", "replace_in_file", "insert_after", "insert_before"}:
+ path = payload.get("path") or ""
+ return f"{name} -> {path}"
+ return f"{name} -> ok"
+
+ def _print_tool_activity(self, tool_calls: list, tool_results: list) -> None:
+ lines = []
+ for call, result in zip(tool_calls, tool_results):
+ lines.append(f"tool {self._summarize_tool_call(call)} -> {self._summarize_tool_result(result)}")
+ if not lines:
+ return
+ text = Text("\n".join(lines), style=Style(color="bright_black", italic=True))
+ renderable = PrefixedRenderable(text, prefix=" ", prefix_style=Style(dim=True), indent=" ", auto_space=False)
+ original_console_width = self.console._width
+ target_width = max(40, int(self.get_terminal_width() * 0.75))
+ self.console.width = target_width
+ try:
+ self.console.print(renderable, highlight=False, soft_wrap=True)
+ self.console.print()
+ finally:
+ self.console._width = original_console_width


  def get_terminal_width(self) -> int:
@@ -1110,16 +1198,10 @@ class CortexCLI:
  except Exception as e:
  logger.debug(f"Failed to get template profile: {e}")

- # Build conversation context with proper formatting BEFORE adding to conversation
- formatted_prompt = self._format_prompt_with_chat_template(user_input)
-
- # DEBUG: Uncomment these lines to see the exact prompt being sent to the model
- # This is crucial for debugging when models give unexpected responses
- # It shows the formatted prompt with all special tokens and formatting
- # print(f"\033[33m[DEBUG] Formatted prompt being sent to model:\033[0m", file=sys.stderr)
- # print(f"\033[33m{repr(formatted_prompt[:200])}...\033[0m", file=sys.stderr)
-
- # Now add user message to conversation history
+ # Ensure tool instructions are present before adding user message
+ self._ensure_tool_instructions()
+
+ # Now add user message to conversation history
  self.conversation_manager.add_message(MessageRole.USER, user_input)

  # Start response on a new line; prefix is rendered with the markdown output.
@@ -1134,130 +1216,154 @@ class CortexCLI:
  except Exception as e:
  logger.debug(f"Could not get stop sequences: {e}")

- # Create generation request with formatted prompt
- request = GenerationRequest(
- prompt=formatted_prompt,
- max_tokens=self.config.inference.max_tokens,
- temperature=self.config.inference.temperature,
- top_p=self.config.inference.top_p,
- top_k=self.config.inference.top_k,
- repetition_penalty=self.config.inference.repetition_penalty,
- stream=self.config.inference.stream_output,
- seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
- stop_sequences=stop_sequences
- )
-
- # Generate response
+ # Generate response (with tool loop)
  self.generating = True
- generated_text = ""
- start_time = time.time()
- token_count = 0
- first_token_time = None

  try:
- # Reset streaming state for reasoning templates if supported
- if uses_reasoning_template and template_profile and template_profile.supports_streaming():
- if hasattr(template_profile, 'reset_streaming_state'):
- template_profile.reset_streaming_state()
+ tool_iterations = 0
+ while tool_iterations < self.max_tool_iterations:
+ tool_iterations += 1

- display_text = ""
- accumulated_response = ""
- last_render_time = 0.0
- render_interval = 0.05 # seconds
- prefix_style = Style(color="cyan")
+ formatted_prompt = self._format_prompt_with_chat_template(user_input, include_user=False)

- def build_renderable(text: str):
- if getattr(self.config.ui, "markdown_rendering", True):
- markdown = ThinkMarkdown(
- text,
- code_theme="monokai",
- use_line_numbers=False,
- syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
- )
- renderable = markdown
- else:
- renderable = render_plain_with_think(text)
+ # DEBUG: Uncomment these lines to see the exact prompt being sent to the model
+ # print(f"\033[33m[DEBUG] Formatted prompt being sent to model:\033[0m", file=sys.stderr)
+ # print(f"\033[33m{repr(formatted_prompt[:200])}...\033[0m", file=sys.stderr)

- return PrefixedRenderable(renderable, prefix="⏺ ", prefix_style=prefix_style, indent=" ")
+ request = GenerationRequest(
+ prompt=formatted_prompt,
+ max_tokens=self.config.inference.max_tokens,
+ temperature=self.config.inference.temperature,
+ top_p=self.config.inference.top_p,
+ top_k=self.config.inference.top_k,
+ repetition_penalty=self.config.inference.repetition_penalty,
+ stream=self.config.inference.stream_output,
+ seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
+ stop_sequences=stop_sequences
+ )

- original_console_width = self.console._width
- target_width = max(40, int(self.get_terminal_width() * 0.75))
- self.console.width = target_width
- try:
- with Live(
- build_renderable(""),
- console=self.console,
- auto_refresh=False,
- refresh_per_second=20,
- transient=False,
- vertical_overflow="visible",
- ) as live:
- for token in self.inference_engine.generate(request):
- if first_token_time is None:
- first_token_time = time.time()
+ generated_text = ""
+ start_time = time.time()
+ token_count = 0
+ first_token_time = None
+ tool_calls_started = False

- generated_text += token
- token_count += 1
+ if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+ if hasattr(template_profile, 'reset_streaming_state'):
+ template_profile.reset_streaming_state()

- display_token = token
- if uses_reasoning_template and template_profile and template_profile.supports_streaming():
- display_token, should_display = template_profile.process_streaming_response(
- token, accumulated_response
- )
- accumulated_response += token
- if not should_display:
- display_token = ""
+ display_text = ""
+ accumulated_response = ""
+ last_render_time = 0.0
+ render_interval = 0.05 # seconds
+ prefix_style = Style(color="cyan")
+
+ def build_renderable(text: str):
+ if getattr(self.config.ui, "markdown_rendering", True):
+ markdown = ThinkMarkdown(
+ text,
+ code_theme="monokai",
+ use_line_numbers=False,
+ syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
+ )
+ renderable = markdown
+ else:
+ renderable = render_plain_with_think(text)

- if display_token:
- display_text += display_token
+ return PrefixedRenderable(renderable, prefix="⏺", prefix_style=prefix_style, indent=" ", auto_space=True)

- now = time.time()
- if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
+ original_console_width = self.console._width
+ target_width = max(40, int(self.get_terminal_width() * 0.75))
+ self.console.width = target_width
+ try:
+ with Live(
+ build_renderable(""),
+ console=self.console,
+ auto_refresh=False,
+ refresh_per_second=20,
+ transient=False,
+ vertical_overflow="visible",
+ ) as live:
+ for token in self.inference_engine.generate(request):
+ if first_token_time is None:
+ first_token_time = time.time()
+
+ generated_text += token
+ token_count += 1
+
+ if not tool_calls_started and tool_protocol.find_tool_calls_block(generated_text)[0] is not None:
+ tool_calls_started = True
+ display_text = "<think>tools running...</think>"
+ live.update(build_renderable(display_text), refresh=True)
+
+ display_token = token
+ if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+ display_token, should_display = template_profile.process_streaming_response(
+ token, accumulated_response
+ )
+ accumulated_response += token
+ if not should_display:
+ display_token = ""
+
+ if not tool_calls_started and display_token:
+ display_text += display_token
+
+ now = time.time()
+ if (not tool_calls_started and display_token and
+ ("\n" in display_token or now - last_render_time >= render_interval)):
+ live.update(build_renderable(display_text), refresh=True)
+ last_render_time = now
+
+ if not tool_calls_started and uses_reasoning_template and template_profile:
+ final_text = template_profile.process_response(generated_text)
+ generated_text = final_text
+ if not template_profile.config.show_reasoning:
+ display_text = final_text
  live.update(build_renderable(display_text), refresh=True)
- last_render_time = now
+ finally:
+ self.console._width = original_console_width

- if uses_reasoning_template and template_profile:
- final_text = template_profile.process_response(generated_text)
- generated_text = final_text
- if not template_profile.config.show_reasoning:
- display_text = final_text
+ tool_calls, parse_error = tool_protocol.parse_tool_calls(generated_text)
+ if parse_error:
+ print(f"\n\033[31m✗ Tool call parse error:\033[0m {parse_error}", file=sys.stderr)

- live.update(build_renderable(display_text), refresh=True)
- finally:
- self.console._width = original_console_width
+ if tool_calls:
+ tool_results = self.tool_runner.run_calls(tool_calls)
+ self._print_tool_activity(tool_calls, tool_results)
+ self.conversation_manager.add_message(
+ MessageRole.SYSTEM,
+ tool_protocol.format_tool_results(tool_results)
+ )
+ if tool_iterations >= self.max_tool_iterations:
+ print("\n\033[31m✗\033[0m Tool loop limit reached.", file=sys.stderr)
+ break
+ continue

- # Display final metrics in a clean, professional way
- elapsed = time.time() - start_time
- if token_count > 0 and elapsed > 0:
- tokens_per_sec = token_count / elapsed
- first_token_latency = first_token_time - start_time if first_token_time else 0
-
- # Build metrics parts - all will be wrapped in dim for subtlety
- metrics_parts = []
-
- if first_token_latency > 0.1:
- # First token latency
- metrics_parts.append(f"first {first_token_latency:.2f}s")
-
- # Total time
- metrics_parts.append(f"total {elapsed:.1f}s")
-
- # Token count
- metrics_parts.append(f"tokens {token_count}")
-
- # Throughput
- metrics_parts.append(f"speed {tokens_per_sec:.1f} tok/s")
-
- # Print entire metrics line as dim/secondary to make it less prominent
- # Indent metrics to align with response text
- metrics_line = " · ".join(metrics_parts)
- print(f" \033[2m{metrics_line}\033[0m")
-
- if token_count >= request.max_tokens:
- print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
-
- # Add assistant message to conversation history
- self.conversation_manager.add_message(MessageRole.ASSISTANT, generated_text)
+ final_text = generated_text
+ if parse_error:
+ final_text = tool_protocol.strip_tool_blocks(generated_text)
+ if tool_calls_started and final_text.strip():
+ self.console.print(build_renderable(final_text))
+
+ elapsed = time.time() - start_time
+ if token_count > 0 and elapsed > 0:
+ tokens_per_sec = token_count / elapsed
+ first_token_latency = first_token_time - start_time if first_token_time else 0
+
+ metrics_parts = []
+ if first_token_latency > 0.1:
+ metrics_parts.append(f"first {first_token_latency:.2f}s")
+ metrics_parts.append(f"total {elapsed:.1f}s")
+ metrics_parts.append(f"tokens {token_count}")
+ metrics_parts.append(f"speed {tokens_per_sec:.1f} tok/s")
+ metrics_line = " · ".join(metrics_parts)
+ print(f" \033[2m{metrics_line}\033[0m")
+
+ if token_count >= request.max_tokens:
+ print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
+
+ self.conversation_manager.add_message(MessageRole.ASSISTANT, final_text)
+ break

  except Exception as e:
  print(f"\n\033[31m✗ Error:\033[0m {str(e)}", file=sys.stderr)
@@ -1274,7 +1380,7 @@ class CortexCLI:
  except (KeyboardInterrupt, EOFError):
  raise

- def _format_prompt_with_chat_template(self, user_input: str) -> str:
+ def _format_prompt_with_chat_template(self, user_input: str, include_user: bool = True) -> str:
  """Format the prompt with appropriate chat template for the model."""
  # Get current conversation context
  conversation = self.conversation_manager.get_current_conversation()
@@ -1297,10 +1403,11 @@ class CortexCLI:
  })

  # Add current user message
- messages.append({
- "role": "user",
- "content": user_input
- })
+ if include_user:
+ messages.append({
+ "role": "user",
+ "content": user_input
+ })

  # Use template registry to format messages
  try:
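
The cli.py changes above wrap each user turn in a bounded tool loop: the model's reply is scanned for tool calls; any calls are executed through `ToolRunner` (file edits gated by the y/N confirmation callback), the results are fed back as a SYSTEM message, and generation repeats, up to `max_tool_iterations` (4). The skeleton below is illustrative only: `generate`, `parse_tool_calls`, and `run_calls` are stand-in stubs, and the `[TOOL_CALLS]` marker is invented here, not the real cortex.tools protocol.

```python
# Illustrative-only skeleton of the bounded tool loop; streaming, rendering,
# and the real tool-call protocol are elided.
MAX_TOOL_ITERATIONS = 4  # mirrors self.max_tool_iterations in the diff


def generate(history):
    # Stub model: request a tool once, then answer after seeing its result.
    saw_results = any(m["role"] == "system" and m["content"].startswith("TOOL RESULTS")
                      for m in history)
    if saw_results:
        return "The CLI entry point lives in cortex/ui/cli.py."
    return "[TOOL_CALLS] read_file path=cortex/ui/cli.py"


def parse_tool_calls(reply):
    # Stub parser: the real protocol returns (calls, parse_error).
    if reply.startswith("[TOOL_CALLS]"):
        return [{"name": "read_file", "arguments": {"path": "cortex/ui/cli.py"}}], None
    return [], None


def run_calls(calls):
    # Stub runner: the real ToolRunner performs repo-scoped reads/searches and
    # asks for y/N confirmation before applying any file edit.
    return [{"name": c["name"], "ok": True, "result": {"path": c["arguments"]["path"]}}
            for c in calls]


def run_turn(history):
    for _ in range(MAX_TOOL_ITERATIONS):
        reply = generate(history)
        calls, parse_error = parse_tool_calls(reply)
        if not calls:
            history.append({"role": "assistant", "content": reply})
            return reply
        results = run_calls(calls)
        history.append({"role": "system", "content": "TOOL RESULTS " + repr(results)})
    return "Tool loop limit reached."


print(run_turn([{"role": "user", "content": "Where is the CLI implemented?"}]))
```

The real loop additionally hides streamed output once a tool-call block is detected ("tools running...") and re-renders the final answer after the last iteration.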

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex/ui/markdown_render.py
@@ -190,11 +190,13 @@ class PrefixedRenderable:
  prefix: str,
  prefix_style: Style | None = None,
  indent: str | None = None,
+ auto_space: bool = False,
  ) -> None:
  self.renderable = renderable
  self.prefix = prefix
  self.prefix_style = prefix_style
  self.indent = indent if indent is not None else " " * len(prefix)
+ self.auto_space = auto_space

  def __rich_console__(self, console: Console, options):
  prefix_width = cell_len(self.prefix)
@@ -205,6 +207,7 @@

  yield Segment(self.prefix, self.prefix_style)

+ inserted_space = False
  for segment in console.render(self.renderable, inner_options):
  if segment.control:
  yield segment
@@ -213,6 +216,12 @@
  text = segment.text
  style = segment.style

+ if self.auto_space and not inserted_space:
+ if text:
+ if not text[0].isspace():
+ yield Segment(" ", None)
+ inserted_space = True
+
  if "\n" not in text:
  yield segment
  continue
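
The markdown_render.py change above adds an `auto_space` flag to `PrefixedRenderable`: when enabled, a single separating space is emitted before the first non-whitespace content segment, so callers can pass a bare prefix like "⏺" instead of baking a trailing space into it (which is exactly what the cli.py change does). A plain-string sketch of the behavior; the real class walks Rich segments and also re-indents wrapped lines:

```python
# Plain-string sketch of auto_space; illustrative only.
def prefix_first_line(prefix: str, first_segment: str, auto_space: bool = False) -> str:
    needs_space = auto_space and first_segment and not first_segment[0].isspace()
    return prefix + (" " if needs_space else "") + first_segment


print(prefix_first_line("⏺ ", "Hello"))                  # old style: space baked into the prefix
print(prefix_first_line("⏺", "Hello", auto_space=True))  # new style: space inserted automatically
```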

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/cortex_llm.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: cortex-llm
- Version: 1.0.7
+ Version: 1.0.9
  Summary: GPU-Accelerated LLM Terminal for Apple Silicon
  Home-page: https://github.com/faisalmumtaz/Cortex
  Author: Cortex Development Team
@@ -131,6 +131,10 @@ Cortex supports:
  - `docs/template-registry.md`
  - **Inference engine details** and backend behavior
  - `docs/inference-engine.md`
+ - **Tooling (experimental, WIP)** for repo-scoped read/search and optional file edits with explicit confirmation
+ - `docs/cli.md`
+
+ **Important (Work in Progress):** Tooling is actively evolving and should be considered experimental. Behavior, output format, and available actions may change; tool calls can fail; and UI presentation may be adjusted. Use tooling on non-critical work first, and always review any proposed file changes before approving them.

  ## Configuration

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "cortex-llm"
- version = "1.0.7"
+ version = "1.0.9"
  description = "GPU-Accelerated LLM Terminal for Apple Silicon"
  readme = "README.md"
  license = "MIT"

{cortex_llm-1.0.7 → cortex_llm-1.0.9}/setup.py
@@ -26,7 +26,7 @@ def read_requirements():

  setup(
  name="cortex-llm",
- version="1.0.7",
+ version="1.0.9",
  author="Cortex Development Team",
  description="GPU-Accelerated LLM Terminal for Apple Silicon",
  long_description=README,