cortex-llm 1.0.8__tar.gz → 1.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/PKG-INFO +5 -1
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/README.md +4 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/__init__.py +1 -1
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/inference_engine.py +48 -8
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/ui/cli.py +231 -124
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/ui/markdown_render.py +9 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/PKG-INFO +5 -1
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/pyproject.toml +1 -1
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/setup.py +1 -1
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/LICENSE +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/__main__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/config.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/conversation_manager.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/fine_tuning/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/fine_tuning/dataset.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/fine_tuning/mlx_lora_trainer.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/fine_tuning/trainer.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/fine_tuning/wizard.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/gpu_validator.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/gpu_validator.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/memory_pool.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/mlx_accelerator.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/mlx_compat.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/mlx_converter.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/mps_optimizer.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/optimizer.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/metal/performance_profiler.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/model_downloader.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/model_manager.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/quantization/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/quantization/dynamic_quantizer.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/auto_detector.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/config_manager.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/interactive.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/registry.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/base.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/complex/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/complex/reasoning.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/alpaca.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/chatml.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/gemma.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/llama.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/template_registry/template_profiles/standard/simple.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/ui/__init__.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/ui/terminal_app.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/SOURCES.txt +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/dependency_links.txt +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/entry_points.txt +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/not-zip-safe +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/requires.txt +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/top_level.txt +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/setup.cfg +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/tests/test_apple_silicon.py +0 -0
- {cortex_llm-1.0.8 → cortex_llm-1.0.9}/tests/test_metal_optimization.py +0 -0
{cortex_llm-1.0.8 → cortex_llm-1.0.9}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cortex-llm
-Version: 1.0.8
+Version: 1.0.9
 Summary: GPU-Accelerated LLM Terminal for Apple Silicon
 Home-page: https://github.com/faisalmumtaz/Cortex
 Author: Cortex Development Team
@@ -131,6 +131,10 @@ Cortex supports:
 - `docs/template-registry.md`
 - **Inference engine details** and backend behavior
 - `docs/inference-engine.md`
+- **Tooling (experimental, WIP)** for repo-scoped read/search and optional file edits with explicit confirmation
+- `docs/cli.md`
+
+**Important (Work in Progress):** Tooling is actively evolving and should be considered experimental. Behavior, output format, and available actions may change; tool calls can fail; and UI presentation may be adjusted. Use tooling on non-critical work first, and always review any proposed file changes before approving them.
 
 ## Configuration
 
{cortex_llm-1.0.8 → cortex_llm-1.0.9}/README.md
@@ -73,6 +73,10 @@ Cortex supports:
 - `docs/template-registry.md`
 - **Inference engine details** and backend behavior
 - `docs/inference-engine.md`
+- **Tooling (experimental, WIP)** for repo-scoped read/search and optional file edits with explicit confirmation
+- `docs/cli.md`
+
+**Important (Work in Progress):** Tooling is actively evolving and should be considered experimental. Behavior, output format, and available actions may change; tool calls can fail; and UI presentation may be adjusted. Use tooling on non-critical work first, and always review any proposed file changes before approving them.
 
 ## Configuration
 
{cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/__init__.py
@@ -5,7 +5,7 @@ A high-performance terminal interface for running Hugging Face LLMs locally
 with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
 """
 
-__version__ = "1.0.8"
+__version__ = "1.0.9"
 __author__ = "Cortex Development Team"
 __license__ = "MIT"
 
{cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/inference_engine.py
@@ -243,6 +243,33 @@ class InferenceEngine:
         tokens_generated = 0
         first_token_time = None
         last_metrics_update = time.time()
+        stream_total_text = ""
+        stream_cumulative = False
+
+        def normalize_stream_chunk(chunk: Any) -> str:
+            """Normalize streaming output to delta chunks when backend yields cumulative text."""
+            nonlocal stream_total_text, stream_cumulative
+            if chunk is None:
+                return ""
+            if not isinstance(chunk, str):
+                chunk = str(chunk)
+
+            if stream_cumulative:
+                if chunk.startswith(stream_total_text):
+                    delta = chunk[len(stream_total_text):]
+                    stream_total_text = chunk
+                    return delta
+                stream_total_text += chunk
+                return chunk
+
+            if stream_total_text and len(chunk) > len(stream_total_text) and chunk.startswith(stream_total_text):
+                stream_cumulative = True
+                delta = chunk[len(stream_total_text):]
+                stream_total_text = chunk
+                return delta
+
+            stream_total_text += chunk
+            return chunk
 
         try:
             # Use MLX accelerator's optimized generation if available
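Note: the helper above exists because some streaming backends yield true deltas ("Hel", "lo") while others re-send the accumulated text on every callback ("Hel", "Hello"). A minimal standalone sketch of the same normalization idea (illustrative only, not part of the package):

```python
# Illustrative sketch of the delta-normalization technique used by normalize_stream_chunk.
def to_deltas(chunks):
    total, cumulative = "", False
    for chunk in chunks:
        chunk = "" if chunk is None else str(chunk)
        if cumulative:
            if chunk.startswith(total):
                delta, total = chunk[len(total):], chunk
            else:
                delta, total = chunk, total + chunk
        elif total and len(chunk) > len(total) and chunk.startswith(total):
            cumulative = True  # backend is re-sending the full text so far
            delta, total = chunk[len(total):], chunk
        else:
            delta, total = chunk, total + chunk
        if delta:
            yield delta

print(list(to_deltas(["Hel", "lo"])))               # ['Hel', 'lo']      (already deltas)
print(list(to_deltas(["Hel", "Hello", "Hello!"])))  # ['Hel', 'lo', '!'] (cumulative input)
```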
@@ -262,10 +289,14 @@
                     if self._cancel_event.is_set():
                         self.status = InferenceStatus.CANCELLED
                         break
-
+
+                    delta = normalize_stream_chunk(token) if request.stream else str(token)
+                    if not delta:
+                        continue
+
                     if first_token_time is None:
                         first_token_time = time.time() - start_time
-
+
                     tokens_generated += 1
 
                     # Update metrics less frequently
@@ -284,13 +315,18 @@
                         last_metrics_update = current_time
 
                     # Token is already a string from generate_optimized
-                    yield token
+                    yield delta
 
                     if any(stop in token for stop in request.stop_sequences):
                         break
             elif mlx_generate:
                 # Fallback to standard MLX generation
-
+                if request.stream and mlx_stream_generate:
+                    logger.info("Using MLX streaming generation")
+                    generate_fn = mlx_stream_generate
+                else:
+                    logger.info("Using standard MLX generation")
+                    generate_fn = mlx_generate
 
                 # Import sample_utils for creating sampler
                 try:
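Note: mlx_generate and mlx_stream_generate above are import aliases; a hedged sketch of the fallback choice, assuming the usual mlx_lm entry points (generate and stream_generate):

```python
# Sketch of the generate_fn selection added above; the mlx_lm import aliases are assumed.
try:
    from mlx_lm import generate as mlx_generate, stream_generate as mlx_stream_generate
except ImportError:  # MLX not installed
    mlx_generate = mlx_stream_generate = None

def pick_generate_fn(stream: bool):
    """Prefer incremental streaming when requested and available, else fall back."""
    if stream and mlx_stream_generate:
        return mlx_stream_generate
    return mlx_generate
```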
@@ -314,7 +350,7 @@
                 if request.seed is not None and request.seed >= 0:
                     mx.random.seed(request.seed)
 
-                for response in mlx_generate(
+                for response in generate_fn(
                     model,
                     tokenizer,
                     **generation_kwargs
@@ -328,10 +364,14 @@
                         token = response.text
                     else:
                         token = str(response)
-
+
+                    delta = normalize_stream_chunk(token) if request.stream else token
+                    if request.stream and not delta:
+                        continue
+
                     if first_token_time is None:
                         first_token_time = time.time() - start_time
-
+
                     tokens_generated += 1
 
                     # Update metrics less frequently to reduce overhead
@@ -352,7 +392,7 @@
                         )
                         last_metrics_update = current_time
 
-                    yield token
+                    yield delta
 
                     if any(stop in token for stop in request.stop_sequences):
                         break
{cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/ui/cli.py
@@ -18,6 +18,7 @@ from textwrap import wrap
 
 from rich.live import Live
 from rich.style import Style
+from rich.text import Text
 
 
 logger = logging.getLogger(__name__)
@@ -30,6 +31,8 @@ from cortex.conversation_manager import ConversationManager, MessageRole
 from cortex.model_downloader import ModelDownloader
 from cortex.template_registry import TemplateRegistry
 from cortex.fine_tuning import FineTuneWizard
+from cortex.tools import ToolRunner
+from cortex.tools import protocol as tool_protocol
 from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable, render_plain_with_think
 
 
@@ -58,6 +61,11 @@ class CortexCLI:
 
         # Initialize fine-tuning wizard
         self.fine_tune_wizard = FineTuneWizard(model_manager, config)
+
+        # Tooling support (always enabled)
+        self.tool_runner = ToolRunner(Path.cwd())
+        self.tool_runner.set_confirm_callback(self._confirm_tool_change)
+        self.max_tool_iterations = 4
 
 
         self.running = True
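Note: a minimal sketch of the tooling wiring introduced above, assuming only the ToolRunner surface visible in this diff (constructor taking a repository root, set_confirm_callback, tool_instructions, run_calls); file-modifying tool calls are only applied when the callback returns True:

```python
# Sketch only; uses the ToolRunner surface shown in this diff.
from pathlib import Path
from cortex.tools import ToolRunner

def confirm(prompt: str) -> bool:
    # Invoked before a file-modifying tool call is applied; returning False rejects it.
    print(prompt)
    return input("Apply change? [y/N]: ").strip().lower() in {"y", "yes"}

runner = ToolRunner(Path.cwd())             # repo-scoped to the current working directory
runner.set_confirm_callback(confirm)        # explicit confirmation for write operations
system_prompt = runner.tool_instructions()  # injected by the CLI as a SYSTEM message
```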
@@ -132,6 +140,86 @@ class CortexCLI:
         # Don't call sys.exit() here - let the main loop exit naturally
         # This prevents traceback from the parent process
         print("\n", file=sys.stderr) # Just add a newline for cleaner output
+
+    def _confirm_tool_change(self, prompt: str) -> bool:
+        """Prompt user to approve a tool-driven change."""
+        print("\n" + prompt)
+        response = input("Apply change? [y/N]: ").strip().lower()
+        return response in {"y", "yes"}
+
+    def _ensure_tool_instructions(self) -> None:
+        """Inject tool instructions into the conversation once."""
+        conversation = self.conversation_manager.get_current_conversation()
+        if conversation is None:
+            conversation = self.conversation_manager.new_conversation()
+        marker = "[CORTEX_TOOL_INSTRUCTIONS v2]"
+        for message in conversation.messages:
+            if message.role == MessageRole.SYSTEM and marker in message.content:
+                return
+        self.conversation_manager.add_message(MessageRole.SYSTEM, self.tool_runner.tool_instructions())
+
+    def _summarize_tool_call(self, call: dict) -> str:
+        name = str(call.get("name", "tool"))
+        args = call.get("arguments") or {}
+        parts = []
+        preferred = ("path", "query", "anchor", "start_line", "end_line", "recursive", "max_results")
+        for key in preferred:
+            if key in args:
+                value = args[key]
+                if isinstance(value, str) and len(value) > 60:
+                    value = value[:57] + "..."
+                parts.append(f"{key}={value!r}")
+        if not parts and args:
+            for key in list(args.keys())[:3]:
+                value = args[key]
+                if isinstance(value, str) and len(value) > 60:
+                    value = value[:57] + "..."
+                parts.append(f"{key}={value!r}")
+        arg_str = ", ".join(parts)
+        return f"{name}({arg_str})" if arg_str else f"{name}()"
+
+    def _summarize_tool_result(self, result: dict) -> str:
+        name = str(result.get("name", "tool"))
+        if not result.get("ok", False):
+            error = result.get("error") or "unknown error"
+            return f"{name} -> error: {error}"
+        payload = result.get("result") or {}
+        if name == "list_dir":
+            entries = payload.get("entries") or []
+            return f"{name} -> entries={len(entries)}"
+        if name == "search":
+            matches = payload.get("results") or []
+            return f"{name} -> results={len(matches)}"
+        if name == "read_file":
+            path = payload.get("path") or ""
+            start = payload.get("start_line")
+            end = payload.get("end_line")
+            if start and end:
+                return f"{name} -> {path} lines {start}-{end}"
+            if start:
+                return f"{name} -> {path} from line {start}"
+            return f"{name} -> {path}"
+        if name in {"write_file", "create_file", "delete_file", "replace_in_file", "insert_after", "insert_before"}:
+            path = payload.get("path") or ""
+            return f"{name} -> {path}"
+        return f"{name} -> ok"
+
+    def _print_tool_activity(self, tool_calls: list, tool_results: list) -> None:
+        lines = []
+        for call, result in zip(tool_calls, tool_results):
+            lines.append(f"tool {self._summarize_tool_call(call)} -> {self._summarize_tool_result(result)}")
+        if not lines:
+            return
+        text = Text("\n".join(lines), style=Style(color="bright_black", italic=True))
+        renderable = PrefixedRenderable(text, prefix=" ", prefix_style=Style(dim=True), indent=" ", auto_space=False)
+        original_console_width = self.console._width
+        target_width = max(40, int(self.get_terminal_width() * 0.75))
+        self.console.width = target_width
+        try:
+            self.console.print(renderable, highlight=False, soft_wrap=True)
+            self.console.print()
+        finally:
+            self.console._width = original_console_width
 
 
     def get_terminal_width(self) -> int:
@@ -1110,16 +1198,10 @@ class CortexCLI:
         except Exception as e:
             logger.debug(f"Failed to get template profile: {e}")
 
-        #
-
-
-        #
-        # This is crucial for debugging when models give unexpected responses
-        # It shows the formatted prompt with all special tokens and formatting
-        # print(f"\033[33m[DEBUG] Formatted prompt being sent to model:\033[0m", file=sys.stderr)
-        # print(f"\033[33m{repr(formatted_prompt[:200])}...\033[0m", file=sys.stderr)
-
-        # Now add user message to conversation history
+        # Ensure tool instructions are present before adding user message
+        self._ensure_tool_instructions()
+
+        # Now add user message to conversation history
         self.conversation_manager.add_message(MessageRole.USER, user_input)
 
         # Start response on a new line; prefix is rendered with the markdown output.
@@ -1134,130 +1216,154 @@ class CortexCLI:
         except Exception as e:
             logger.debug(f"Could not get stop sequences: {e}")
 
-        #
-        request = GenerationRequest(
-            prompt=formatted_prompt,
-            max_tokens=self.config.inference.max_tokens,
-            temperature=self.config.inference.temperature,
-            top_p=self.config.inference.top_p,
-            top_k=self.config.inference.top_k,
-            repetition_penalty=self.config.inference.repetition_penalty,
-            stream=self.config.inference.stream_output,
-            seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
-            stop_sequences=stop_sequences
-        )
-
-        # Generate response
+        # Generate response (with tool loop)
         self.generating = True
-        generated_text = ""
-        start_time = time.time()
-        token_count = 0
-        first_token_time = None
 
         try:
-
-
-
-            template_profile.reset_streaming_state()
+            tool_iterations = 0
+            while tool_iterations < self.max_tool_iterations:
+                tool_iterations += 1
 
-
-            accumulated_response = ""
-            last_render_time = 0.0
-            render_interval = 0.05 # seconds
-            prefix_style = Style(color="cyan")
+                formatted_prompt = self._format_prompt_with_chat_template(user_input, include_user=False)
 
-
-
-
-                    text,
-                    code_theme="monokai",
-                    use_line_numbers=False,
-                    syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
-                )
-                renderable = markdown
-            else:
-                renderable = render_plain_with_think(text)
+                # DEBUG: Uncomment these lines to see the exact prompt being sent to the model
+                # print(f"\033[33m[DEBUG] Formatted prompt being sent to model:\033[0m", file=sys.stderr)
+                # print(f"\033[33m{repr(formatted_prompt[:200])}...\033[0m", file=sys.stderr)
 
-
+                request = GenerationRequest(
+                    prompt=formatted_prompt,
+                    max_tokens=self.config.inference.max_tokens,
+                    temperature=self.config.inference.temperature,
+                    top_p=self.config.inference.top_p,
+                    top_k=self.config.inference.top_k,
+                    repetition_penalty=self.config.inference.repetition_penalty,
+                    stream=self.config.inference.stream_output,
+                    seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
+                    stop_sequences=stop_sequences
+                )
 
-
-
-
-
-
-                build_renderable(""),
-                console=self.console,
-                auto_refresh=False,
-                refresh_per_second=20,
-                transient=False,
-                vertical_overflow="visible",
-            ) as live:
-                for token in self.inference_engine.generate(request):
-                    if first_token_time is None:
-                        first_token_time = time.time()
+                generated_text = ""
+                start_time = time.time()
+                token_count = 0
+                first_token_time = None
+                tool_calls_started = False
 
-
-
+                if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+                    if hasattr(template_profile, 'reset_streaming_state'):
+                        template_profile.reset_streaming_state()
 
-
-
-
-
-
-
-
-
+                display_text = ""
+                accumulated_response = ""
+                last_render_time = 0.0
+                render_interval = 0.05 # seconds
+                prefix_style = Style(color="cyan")
+
+                def build_renderable(text: str):
+                    if getattr(self.config.ui, "markdown_rendering", True):
+                        markdown = ThinkMarkdown(
+                            text,
+                            code_theme="monokai",
+                            use_line_numbers=False,
+                            syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
+                        )
+                        renderable = markdown
+                    else:
+                        renderable = render_plain_with_think(text)
 
-
-                    display_text += display_token
+                    return PrefixedRenderable(renderable, prefix="⏺", prefix_style=prefix_style, indent=" ", auto_space=True)
 
-
-
+                original_console_width = self.console._width
+                target_width = max(40, int(self.get_terminal_width() * 0.75))
+                self.console.width = target_width
+                try:
+                    with Live(
+                        build_renderable(""),
+                        console=self.console,
+                        auto_refresh=False,
+                        refresh_per_second=20,
+                        transient=False,
+                        vertical_overflow="visible",
+                    ) as live:
+                        for token in self.inference_engine.generate(request):
+                            if first_token_time is None:
+                                first_token_time = time.time()
+
+                            generated_text += token
+                            token_count += 1
+
+                            if not tool_calls_started and tool_protocol.find_tool_calls_block(generated_text)[0] is not None:
+                                tool_calls_started = True
+                                display_text = "<think>tools running...</think>"
+                                live.update(build_renderable(display_text), refresh=True)
+
+                            display_token = token
+                            if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+                                display_token, should_display = template_profile.process_streaming_response(
+                                    token, accumulated_response
+                                )
+                                accumulated_response += token
+                                if not should_display:
+                                    display_token = ""
+
+                            if not tool_calls_started and display_token:
+                                display_text += display_token
+
+                            now = time.time()
+                            if (not tool_calls_started and display_token and
+                                    ("\n" in display_token or now - last_render_time >= render_interval)):
+                                live.update(build_renderable(display_text), refresh=True)
+                                last_render_time = now
+
+                        if not tool_calls_started and uses_reasoning_template and template_profile:
+                            final_text = template_profile.process_response(generated_text)
+                            generated_text = final_text
+                            if not template_profile.config.show_reasoning:
+                                display_text = final_text
                             live.update(build_renderable(display_text), refresh=True)
-
+                finally:
+                    self.console._width = original_console_width
 
-
-
-
-            if not template_profile.config.show_reasoning:
-                display_text = final_text
+                tool_calls, parse_error = tool_protocol.parse_tool_calls(generated_text)
+                if parse_error:
+                    print(f"\n\033[31m✗ Tool call parse error:\033[0m {parse_error}", file=sys.stderr)
 
-
-
-
+                if tool_calls:
+                    tool_results = self.tool_runner.run_calls(tool_calls)
+                    self._print_tool_activity(tool_calls, tool_results)
+                    self.conversation_manager.add_message(
+                        MessageRole.SYSTEM,
+                        tool_protocol.format_tool_results(tool_results)
+                    )
+                    if tool_iterations >= self.max_tool_iterations:
+                        print("\n\033[31m✗\033[0m Tool loop limit reached.", file=sys.stderr)
+                        break
+                    continue
 
-
-
-
-
-
-
-
-
-
-
-
-                metrics_parts
-
-
-
-
-
-
-
-
-
-
-
-
-
-                print(f" \033[2m{metrics_line}\033[0m")
-
-                if token_count >= request.max_tokens:
-                    print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
-
-            # Add assistant message to conversation history
-            self.conversation_manager.add_message(MessageRole.ASSISTANT, generated_text)
+                final_text = generated_text
+                if parse_error:
+                    final_text = tool_protocol.strip_tool_blocks(generated_text)
+                if tool_calls_started and final_text.strip():
+                    self.console.print(build_renderable(final_text))
+
+                elapsed = time.time() - start_time
+                if token_count > 0 and elapsed > 0:
+                    tokens_per_sec = token_count / elapsed
+                    first_token_latency = first_token_time - start_time if first_token_time else 0
+
+                    metrics_parts = []
+                    if first_token_latency > 0.1:
+                        metrics_parts.append(f"first {first_token_latency:.2f}s")
+                    metrics_parts.append(f"total {elapsed:.1f}s")
+                    metrics_parts.append(f"tokens {token_count}")
+                    metrics_parts.append(f"speed {tokens_per_sec:.1f} tok/s")
+                    metrics_line = " · ".join(metrics_parts)
+                    print(f" \033[2m{metrics_line}\033[0m")
+
+                    if token_count >= request.max_tokens:
+                        print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
+
+                self.conversation_manager.add_message(MessageRole.ASSISTANT, final_text)
+                break
 
         except Exception as e:
             print(f"\n\033[31m✗ Error:\033[0m {str(e)}", file=sys.stderr)
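Note: stripped of streaming, rendering, and reasoning-template handling, the loop added above reduces to roughly the following sketch. generate, add_system, and add_assistant are placeholder callables standing in for the CLI's inference and conversation-manager calls; parse_tool_calls, run_calls, format_tool_results, and strip_tool_blocks are the names used in the diff.

```python
# Condensed, illustrative view of the tool loop (placeholder callables, real protocol helper names).
from cortex.tools import protocol as tool_protocol

def chat_turn(generate, runner, add_system, add_assistant, max_tool_iterations=4):
    for _ in range(max_tool_iterations):
        text = generate()                                     # one model completion
        tool_calls, parse_error = tool_protocol.parse_tool_calls(text)
        if tool_calls:
            results = runner.run_calls(tool_calls)            # run repo-scoped tools
            add_system(tool_protocol.format_tool_results(results))
            continue                                          # regenerate with results in context
        final = tool_protocol.strip_tool_blocks(text) if parse_error else text
        add_assistant(final)
        return final
    return None  # tool-loop limit reached
```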
@@ -1274,7 +1380,7 @@ class CortexCLI:
         except (KeyboardInterrupt, EOFError):
             raise
 
-    def _format_prompt_with_chat_template(self, user_input: str) -> str:
+    def _format_prompt_with_chat_template(self, user_input: str, include_user: bool = True) -> str:
         """Format the prompt with appropriate chat template for the model."""
         # Get current conversation context
         conversation = self.conversation_manager.get_current_conversation()
@@ -1297,10 +1403,11 @@ class CortexCLI:
         })
 
         # Add current user message
-        messages.append({
-            "role": "user",
-            "content": user_input
-        })
+        if include_user:
+            messages.append({
+                "role": "user",
+                "content": user_input
+            })
 
         # Use template registry to format messages
         try:
{cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex/ui/markdown_render.py
@@ -190,11 +190,13 @@ class PrefixedRenderable:
         prefix: str,
         prefix_style: Style | None = None,
         indent: str | None = None,
+        auto_space: bool = False,
     ) -> None:
         self.renderable = renderable
         self.prefix = prefix
         self.prefix_style = prefix_style
         self.indent = indent if indent is not None else " " * len(prefix)
+        self.auto_space = auto_space
 
     def __rich_console__(self, console: Console, options):
         prefix_width = cell_len(self.prefix)
@@ -205,6 +207,7 @@ class PrefixedRenderable:
 
         yield Segment(self.prefix, self.prefix_style)
 
+        inserted_space = False
         for segment in console.render(self.renderable, inner_options):
            if segment.control:
                yield segment
@@ -213,6 +216,12 @@ class PrefixedRenderable:
             text = segment.text
             style = segment.style
 
+            if self.auto_space and not inserted_space:
+                if text:
+                    if not text[0].isspace():
+                        yield Segment(" ", None)
+                    inserted_space = True
+
             if "\n" not in text:
                 yield segment
                 continue
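Note: the effect of the new auto_space flag, in a small hedged example (output shown approximately; the indent value is an arbitrary choice here):

```python
# Sketch: with auto_space=True a single space is emitted between the prefix and the
# first non-whitespace character of the wrapped renderable.
from rich.console import Console
from rich.style import Style
from rich.text import Text
from cortex.ui.markdown_render import PrefixedRenderable

console = Console()
console.print(PrefixedRenderable(Text("Assistant reply starts here."),
                                 prefix="⏺", prefix_style=Style(color="cyan"),
                                 indent="  ", auto_space=True))
# Renders roughly as: ⏺ Assistant reply starts here.
```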
{cortex_llm-1.0.8 → cortex_llm-1.0.9}/cortex_llm.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cortex-llm
-Version: 1.0.8
+Version: 1.0.9
 Summary: GPU-Accelerated LLM Terminal for Apple Silicon
 Home-page: https://github.com/faisalmumtaz/Cortex
 Author: Cortex Development Team
@@ -131,6 +131,10 @@ Cortex supports:
 - `docs/template-registry.md`
 - **Inference engine details** and backend behavior
 - `docs/inference-engine.md`
+- **Tooling (experimental, WIP)** for repo-scoped read/search and optional file edits with explicit confirmation
+- `docs/cli.md`
+
+**Important (Work in Progress):** Tooling is actively evolving and should be considered experimental. Behavior, output format, and available actions may change; tool calls can fail; and UI presentation may be adjusted. Use tooling on non-critical work first, and always review any proposed file changes before approving them.
 
 ## Configuration
All remaining files listed above (+0 -0) are unchanged between {cortex_llm-1.0.8 → cortex_llm-1.0.9}; they are renamed only by the versioned directory prefix.