cortex_llm-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. cortex/__init__.py +73 -0
  2. cortex/__main__.py +83 -0
  3. cortex/config.py +329 -0
  4. cortex/conversation_manager.py +468 -0
  5. cortex/fine_tuning/__init__.py +8 -0
  6. cortex/fine_tuning/dataset.py +332 -0
  7. cortex/fine_tuning/mlx_lora_trainer.py +502 -0
  8. cortex/fine_tuning/trainer.py +957 -0
  9. cortex/fine_tuning/wizard.py +707 -0
  10. cortex/gpu_validator.py +467 -0
  11. cortex/inference_engine.py +727 -0
  12. cortex/metal/__init__.py +275 -0
  13. cortex/metal/gpu_validator.py +177 -0
  14. cortex/metal/memory_pool.py +886 -0
  15. cortex/metal/mlx_accelerator.py +678 -0
  16. cortex/metal/mlx_converter.py +638 -0
  17. cortex/metal/mps_optimizer.py +417 -0
  18. cortex/metal/optimizer.py +665 -0
  19. cortex/metal/performance_profiler.py +364 -0
  20. cortex/model_downloader.py +130 -0
  21. cortex/model_manager.py +2187 -0
  22. cortex/quantization/__init__.py +5 -0
  23. cortex/quantization/dynamic_quantizer.py +736 -0
  24. cortex/template_registry/__init__.py +15 -0
  25. cortex/template_registry/auto_detector.py +144 -0
  26. cortex/template_registry/config_manager.py +234 -0
  27. cortex/template_registry/interactive.py +260 -0
  28. cortex/template_registry/registry.py +347 -0
  29. cortex/template_registry/template_profiles/__init__.py +5 -0
  30. cortex/template_registry/template_profiles/base.py +142 -0
  31. cortex/template_registry/template_profiles/complex/__init__.py +5 -0
  32. cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
  33. cortex/template_registry/template_profiles/standard/__init__.py +9 -0
  34. cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
  35. cortex/template_registry/template_profiles/standard/chatml.py +82 -0
  36. cortex/template_registry/template_profiles/standard/gemma.py +103 -0
  37. cortex/template_registry/template_profiles/standard/llama.py +87 -0
  38. cortex/template_registry/template_profiles/standard/simple.py +65 -0
  39. cortex/ui/__init__.py +120 -0
  40. cortex/ui/cli.py +1685 -0
  41. cortex/ui/markdown_render.py +185 -0
  42. cortex/ui/terminal_app.py +534 -0
  43. cortex_llm-1.0.0.dist-info/METADATA +275 -0
  44. cortex_llm-1.0.0.dist-info/RECORD +48 -0
  45. cortex_llm-1.0.0.dist-info/WHEEL +5 -0
  46. cortex_llm-1.0.0.dist-info/entry_points.txt +2 -0
  47. cortex_llm-1.0.0.dist-info/licenses/LICENSE +21 -0
  48. cortex_llm-1.0.0.dist-info/top_level.txt +1 -0
cortex/ui/cli.py ADDED
@@ -0,0 +1,1685 @@
+ """CLI interface for Cortex with Claude Code-style UI."""
+
+ import os
+ import sys
+ import signal
+ import shutil
+ import readline
+ import time
+ import threading
+ import logging
+ import termios
+ import tty
+ import getpass
+ from typing import Optional, List, Tuple
+ from datetime import datetime
+ from pathlib import Path
+ from textwrap import wrap
+
+ from rich.live import Live
+ from rich.style import Style
+
+
+ logger = logging.getLogger(__name__)
+
+ from cortex.config import Config
+ from cortex.gpu_validator import GPUValidator
+ from cortex.model_manager import ModelManager
+ from cortex.inference_engine import InferenceEngine, GenerationRequest
+ from cortex.conversation_manager import ConversationManager, MessageRole
+ from cortex.model_downloader import ModelDownloader
+ from cortex.template_registry import TemplateRegistry
+ from cortex.fine_tuning import FineTuneWizard
+ from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable
+
+
+ class CortexCLI:
+     """Command-line interface for Cortex with Claude Code-style UI."""
+
+     def __init__(
+         self,
+         config: Config,
+         gpu_validator: GPUValidator,
+         model_manager: ModelManager,
+         inference_engine: InferenceEngine,
+         conversation_manager: ConversationManager
+     ):
+         self.config = config
+         self.gpu_validator = gpu_validator
+         self.model_manager = model_manager
+         self.inference_engine = inference_engine
+         self.conversation_manager = conversation_manager
+         self.model_downloader = ModelDownloader(config.model.model_path)
+
+         # Initialize template registry with console for interactive setup
+         from rich.console import Console
+         self.console = Console()
+         self.template_registry = TemplateRegistry(console=self.console)
+
+         # Initialize fine-tuning wizard
+         self.fine_tune_wizard = FineTuneWizard(model_manager, config)
+
+         self.running = True
+         self.generating = False
+
+         # Set up readline for better input handling (fallback)
+         self._setup_readline()
+
+         # Set up signal handlers
+         signal.signal(signal.SIGINT, self._handle_interrupt)
+         # SIGTSTP (Ctrl+Z) - let it suspend normally, no special handling needed
+         # The default behavior is fine for suspension
+
+     def _setup_readline(self):
+         """Set up readline for better command-line editing."""
+         # Enable tab completion
+         readline.parse_and_bind("tab: complete")
+
+         # Set up command history
+         histfile = Path.home() / ".cortex_history"
+         try:
+             readline.read_history_file(histfile)
+         except FileNotFoundError:
+             pass
+
+         # Save history on exit
+         import atexit
+         atexit.register(readline.write_history_file, histfile)
+
+         # Set up auto-completion
+         readline.set_completer(self._completer)
+
+     def get_input_with_escape(self, prompt: str = "Select option") -> Optional[str]:
+         """Get user input with a cancel option.
+
+         Despite the name, ESC is not intercepted in this basic input mode;
+         entering '0' cancels.
+
+         Returns:
+             User input string, or None if cancelled (the user entered '0')
+         """
+         print()
+         print(f"\033[96m▶\033[0m {prompt}: ", end='')
+         user_input = input().strip()
+
+         # Check for cancel input
+         if user_input == '0':
+             return None
+
+         return user_input
+
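+     # Illustrative call (matching how the menus below use it): e.g.
+     #   choice = self.get_input_with_escape("Select option (1-3)")
+     # returns None when the user types '0', so callers can unwind a
+     # cancelled menu with a simple `if choice is None: return`.
+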
+     def _completer(self, text, state):
+         """Auto-complete commands."""
+         commands = ['/help', '/status', '/download', '/model',
+                     '/clear', '/save', '/gpu', '/benchmark', '/template', '/finetune', '/login', '/quit']
+
+         # Filter matching commands
+         matches = [cmd for cmd in commands if cmd.startswith(text)]
+
+         if state < len(matches):
+             return matches[state]
+         return None
+
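+     # readline invokes the completer repeatedly with state 0, 1, 2, ... and
+     # stops at the first None. E.g. completing "/s" yields '/status', then
+     # '/save', then None, so both commands are offered as candidates.
+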
+     def _handle_interrupt(self, signum, frame):
+         """Handle Ctrl+C interruption."""
+         if self.generating:
+             print("\n\nGeneration cancelled.", file=sys.stderr)
+             self.inference_engine.cancel_generation()
+             self.generating = False
+         else:
+             # Set running to False to exit the main loop gracefully
+             self.running = False
+             # Don't call sys.exit() here - let the main loop exit naturally
+             # This prevents traceback from the parent process
+             print("\n", file=sys.stderr)  # Just add a newline for cleaner output
+
+     def get_terminal_width(self) -> int:
+         """Get terminal width."""
+         return shutil.get_terminal_size(fallback=(80, 24)).columns
+
+     def get_terminal_height(self) -> int:
+         """Get terminal height."""
+         return shutil.get_terminal_size(fallback=(80, 24)).lines
+
+     def get_visible_length(self, text: str) -> int:
+         """Get visible length of text, ignoring ANSI escape codes and accounting for wide characters."""
+         import re
+         import unicodedata
+
+         # Remove ANSI escape sequences
+         ansi_escape = re.compile(r'\x1b\[[0-9;]*m')
+         visible_text = ansi_escape.sub('', text)
+
+         # Calculate display width accounting for wide/ambiguous characters
+         display_width = 0
+         for char in visible_text:
+             width = unicodedata.east_asian_width(char)
+             if width in ('W', 'F'):  # Wide or Fullwidth - always 2 columns
+                 display_width += 2
+             elif width == 'A' and char in '●○':  # Ambiguous - might be 2 in some terminals
+                 # For now, treat these as single-width since most Western terminals do
+                 # But if alignment issues appear with these characters, change to += 2
+                 display_width += 1
+             else:
+                 display_width += 1
+
+         return display_width
+
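+     # Illustrative expectations for the width calculation above:
+     #   get_visible_length("\033[93m/help\033[0m") == 5   # ANSI codes stripped
+     #   get_visible_length("日本") == 4                    # wide chars count as 2 columns
+     #   get_visible_length("●") == 1                      # ambiguous width, treated as narrow
+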
+     def print_box_line(self, content: str, width: int, align: str = 'left'):
+         """Print a single line in a box with proper padding."""
+         visible_len = self.get_visible_length(content)
+         padding = width - visible_len - 2  # -2 for the borders
+
+         if align == 'center':
+             left_pad = padding // 2
+             right_pad = padding - left_pad
+             print(f"│{' ' * left_pad}{content}{' ' * right_pad}│")
+         else:  # left align
+             print(f"│{content}{' ' * padding}│")
+
+     def print_box_header(self, title: str, width: int):
+         """Print a box header with title."""
+         if title:
+             title_with_color = f" \033[96m{title}\033[0m "
+             visible_len = self.get_visible_length(title_with_color)
+             padding = width - visible_len - 3  # -3 for "╭─" and "╮"
+             print(f"╭─{title_with_color}" + "─" * padding + "╮")
+         else:
+             print("╭" + "─" * (width - 2) + "╮")
+
+     def print_box_footer(self, width: int):
+         """Print a box footer."""
+         print("╰" + "─" * (width - 2) + "╯")
+
+     def print_box_separator(self, width: int):
+         """Print a separator line inside a box."""
+         # Width already includes space for borders, so we need exact width-2 for the line
+         print("├" + "─" * (width - 2) + "┤")
+
+     def print_empty_line(self, width: int):
+         """Print an empty line inside a box."""
+         print("│" + " " * (width - 2) + "│")
+
+     def create_box(self, lines: List[str], width: Optional[int] = None) -> str:
+         """Create a box with Unicode borders."""
+         if width is None:
+             width = min(self.get_terminal_width() - 2, 80)
+
+         # Box drawing characters
+         top_left = "╭"
+         top_right = "╮"
+         bottom_left = "╰"
+         bottom_right = "╯"
+         horizontal = "─"
+         vertical = "│"
+
+         # Calculate inner width
+         inner_width = width - 4  # Account for borders and padding
+
+         # Build box
+         result = []
+         result.append(top_left + horizontal * (width - 2) + top_right)
+
+         for line in lines:
+             # Calculate visible length to handle ANSI codes
+             visible_len = self.get_visible_length(line)
+             # Calculate padding needed
+             padding_needed = inner_width - visible_len
+             # Create padded line with correct spacing
+             padded = f" {line}{' ' * padding_needed} "
+             result.append(vertical + padded + vertical)
+
+         result.append(bottom_left + horizontal * (width - 2) + bottom_right)
+
+         return "\n".join(result)
+
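+     # E.g. create_box(["hello"], width=12) returns the three-line string:
+     #   ╭──────────╮
+     #   │ hello    │
+     #   ╰──────────╯
+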
+     def print_welcome(self):
+         """Print welcome message in Claude Code style."""
+         width = min(self.get_terminal_width() - 2, 70)
+
+         # Get current working directory
+         cwd = os.getcwd()
+
+         # Welcome box content
+         welcome_lines = [
+             "\033[96m✻ Welcome to Cortex!\033[0m",
+             "",
+             " \033[93m/help\033[0m for help, \033[93m/status\033[0m for your current setup",
+             "",
+             f" \033[2mcwd:\033[0m {cwd}"
+         ]
+
+         print(self.create_box(welcome_lines, width))
+         print()
+
+         # Show last used model if configured
+         if self.config.model.last_used_model:
+             # Clean up the model name for display
+             display_name = self.config.model.last_used_model
+             if display_name.startswith("_Users_") and ("_4bit" in display_name or "_5bit" in display_name or "_8bit" in display_name):
+                 # Extract clean model name from cached path
+                 parts = display_name.replace("_4bit", "").replace("_5bit", "").replace("_8bit", "").split("_")
+                 if len(parts) > 3:
+                     display_name = parts[-1]  # Get just the model name
+             print(f" \033[2m※ Last model:\033[0m \033[93m{display_name}\033[0m")
+
+         print(" \033[2m※ Tip: Use\033[0m \033[93m/download\033[0m \033[2mto get models from HuggingFace\033[0m")
+
+         # Show input mode info
+         print(" \033[2m※ Basic input mode (install prompt-toolkit for enhanced features)\033[0m")
+         print()
+
+     def load_default_model(self):
+         """Load the last used model or default model if configured."""
+         # Try to load last used model first
+         model_to_load = self.config.model.last_used_model or self.config.model.default_model
+
+         if not model_to_load:
+             print("\n \033[96m⚡\033[0m No model loaded. Use \033[93m/model\033[0m to select a model.")
+             return
+
+         # Check if this is a cached MLX model (contains _4bit, _5bit, etc.)
+         if "_4bit" in model_to_load or "_5bit" in model_to_load or "_8bit" in model_to_load:
+             # Extract clean model name from cached path
+             clean_name = model_to_load
+             if clean_name.startswith("_Users_"):
+                 # Extract the actual model name from the path
+                 parts = clean_name.replace("_4bit", "").replace("_5bit", "").replace("_8bit", "").split("_")
+                 if len(parts) > 3:
+                     clean_name = parts[-1]  # Get just the model name
+
+             # This is a cached MLX model, try to load it directly
+             print(f"\n \033[96m⚡\033[0m Loading: \033[93m{clean_name}\033[0m \033[2m(MLX optimized)\033[0m...")
+             success, message = self.model_manager.load_model(model_to_load)
+
+             if success:
+                 model_info = self.model_manager.get_current_model()
+                 if model_info:
+                     # Show clean model info
+                     if "_4bit" in model_to_load:
+                         quant_type = "4-bit"
+                     elif "_8bit" in model_to_load:
+                         quant_type = "8-bit"
+                     elif "_5bit" in model_to_load:
+                         quant_type = "5-bit"
+                     else:
+                         quant_type = ""
+
+                     print(f" \033[32m✓\033[0m Model ready: \033[93m{clean_name}\033[0m")
+                     if quant_type:
+                         print(f" \033[2m• Size: {model_info.size_gb:.1f}GB ({quant_type} quantized)\033[0m")
+                     else:
+                         print(f" \033[2m• Size: {model_info.size_gb:.1f}GB (quantized)\033[0m")
+                     print(f" \033[2m• Optimizations: AMX acceleration, operation fusion\033[0m")
+                     print(f" \033[2m• Format: MLX (Apple Silicon optimized)\033[0m")
+
+                     # Show template information
+                     tokenizer = self.model_manager.tokenizers.get(model_info.name)
+                     profile = self.template_registry.setup_model(
+                         model_info.name,
+                         tokenizer=tokenizer,
+                         interactive=False
+                     )
+                     if profile:
+                         template_name = profile.config.name
+                         print(f" \033[2m• Template: {template_name}\033[0m")
+             else:
+                 # Try to extract original model name and reload
+                 base_name = model_to_load.replace("_4bit", "").replace("_5bit", "").replace("_8bit", "")
+                 if base_name.startswith("_Users_"):
+                     # Extract original path
+                     original_path = "/" + base_name[1:].replace("_", "/")
+                     if Path(original_path).exists():
+                         print(f" \033[2m※ Cached model not found, reconverting from original...\033[0m")
+                         success, message = self.model_manager.load_model(original_path)
+                         if success:
+                             model_info = self.model_manager.get_current_model()
+                             if model_info:
+                                 print(f" \033[32m✓\033[0m Model loaded: \033[93m{model_info.name}\033[0m \033[2m({model_info.size_gb:.1f}GB, {model_info.format.value})\033[0m")
+                             return
+
+                 print(f"\n \033[31m⚠\033[0m Previously used model not found: \033[93m{model_to_load}\033[0m")
+                 print(" Use \033[93m/model\033[0m to select a different model or \033[93m/download\033[0m to get new models.")
+             return
+
+         # Try to find the model
+         model_path = None
+
+         # First, check if it's a direct path that exists
+         potential_path = Path(model_to_load).expanduser()
+         if potential_path.exists():
+             model_path = potential_path
+         else:
+             # Try in the models directory
+             potential_path = self.config.model.model_path / model_to_load
+             if potential_path.exists():
+                 model_path = potential_path
+             else:
+                 # Search for the model in available models
+                 available = self.model_manager.discover_available_models()
+                 for model in available:
+                     if model['name'] == model_to_load:
+                         model_path = Path(model['path'])
+                         break
+
+         if not model_path:
+             print(f"\n \033[31m⚠\033[0m Previously used model not found: \033[93m{model_to_load}\033[0m")
+             print(" Use \033[93m/model\033[0m to select a different model or \033[93m/download\033[0m to get new models.")
+             return
+
+         print(f"\n \033[96m⚡\033[0m Loading: \033[93m{model_to_load}\033[0m...")
+         success, message = self.model_manager.load_model(str(model_path))
+
+         if success:
+             model_info = self.model_manager.get_current_model()
+             if model_info:
+                 print(f" \033[32m✓\033[0m Model loaded: \033[93m{model_info.name}\033[0m \033[2m({model_info.size_gb:.1f}GB, {model_info.format.value})\033[0m")
+
+                 # Show template information
+                 tokenizer = self.model_manager.tokenizers.get(model_info.name)
+                 profile = self.template_registry.setup_model(
+                     model_info.name,
+                     tokenizer=tokenizer,
+                     interactive=False
+                 )
+                 if profile:
+                     template_name = profile.config.name
+                     print(f" \033[2m• Template: {template_name}\033[0m")
+         else:
+             print(f" \033[31m✗\033[0m Failed to load model: {message}", file=sys.stderr)
+             print(" Use \033[93m/model\033[0m to select a different model.")
+
+     def handle_command(self, command: str) -> bool:
+         """Handle slash commands. Returns False to exit."""
+         parts = command.split(maxsplit=1)
+         cmd = parts[0].lower()
+         args = parts[1] if len(parts) > 1 else ""
+
+         if cmd == "/help":
+             self.show_help()
+         elif cmd == "/model":
+             self.manage_models(args)
+         elif cmd == "/download":
+             self.download_model(args)
+         elif cmd == "/clear":
+             self.clear_conversation()
+         elif cmd == "/save":
+             self.save_conversation()
+         elif cmd == "/status":
+             self.show_status()
+         elif cmd == "/gpu":
+             self.show_gpu_status()
+         elif cmd == "/benchmark":
+             self.run_benchmark()
+         elif cmd == "/template":
+             self.manage_template(args)
+         elif cmd == "/finetune":
+             self.run_finetune()
+         elif cmd == "/login":
+             self.hf_login()
+         elif cmd in ["/quit", "/exit"]:
+             return False
+         elif cmd == "?":
+             self.show_shortcuts()
+         else:
+             print(f"\033[31mUnknown command: {cmd}\033[0m")
+             print("\033[2mType /help for available commands\033[0m")
+
+         return True
+
+     def show_shortcuts(self):
+         """Show keyboard shortcuts."""
+         width = min(self.get_terminal_width() - 2, 70)
+
+         print()
+         self.print_box_header("Keyboard Shortcuts", width)
+         self.print_empty_line(width)
+
+         shortcuts = [
+             ("Ctrl+C", "Cancel current generation"),
+             ("Ctrl+D", "Exit Cortex"),
+             ("Tab", "Auto-complete commands"),
+             ("/help", "Show all commands"),
+             ("?", "Show this help")
+         ]
+
+         for key, desc in shortcuts:
+             # Color the key/command in yellow
+             colored_key = f"\033[93m{key}\033[0m"
+             # Calculate padding
+             key_width = len(key)
+             padding = " " * (12 - key_width)  # Align descriptions at column 14
+             line = f" {colored_key}{padding}{desc}"
+             self.print_box_line(line, width)
+
+         self.print_empty_line(width)
+         self.print_box_footer(width)
+
+     def show_help(self):
+         """Show available commands."""
+         width = min(self.get_terminal_width() - 2, 70)
+
+         print()
+         self.print_box_header("Available Commands", width)
+         self.print_empty_line(width)
+
+         commands = [
+             ("/help", "Show this help message"),
+             ("/status", "Show current setup and GPU info"),
+             ("/download", "Download a model from HuggingFace"),
+             ("/model", "Manage models (load/delete/info)"),
+             ("/finetune", "Fine-tune a model interactively"),
+             ("/clear", "Clear conversation history"),
+             ("/save", "Save current conversation"),
+             ("/template", "Manage chat templates"),
+             ("/gpu", "Show GPU status"),
+             ("/benchmark", "Run performance benchmark"),
+             ("/login", "Login to HuggingFace for gated models"),
+             ("/quit", "Exit Cortex")
+         ]
+
+         for cmd, desc in commands:
+             # Format: " /command description"
+             # Color the command in yellow
+             colored_cmd = f"\033[93m{cmd}\033[0m"
+             # Calculate padding between command and description
+             cmd_width = len(cmd)
+             padding = " " * (12 - cmd_width)  # Align descriptions at column 14
+             line = f" {colored_cmd}{padding}{desc}"
+             self.print_box_line(line, width)
+
+         self.print_empty_line(width)
+         self.print_box_footer(width)
+
+     def download_model(self, args: str = ""):
+         """Download a model from HuggingFace."""
+         if args:
+             # Direct download with provided args
+             parts = args.split()
+             repo_id = parts[0]
+             filename = parts[1] if len(parts) > 1 else None
+         else:
+             # Interactive mode with numbered options
+             width = min(self.get_terminal_width() - 2, 70)
+
+             # Create download UI box using helper methods
+             print()
+             self.print_box_header("Model Manager", width)
+             self.print_empty_line(width)
+
+             option_num = 1
+             available = self.model_manager.discover_available_models()
+
+             # Show already downloaded models with numbers to load
+             if available:
+                 self.print_box_line(" \033[96mLoad Existing Model:\033[0m", width)
+                 self.print_empty_line(width)
+
+                 for model in available[:5]:  # Show up to 5 downloaded models
+                     name = model['name'][:width-15]
+                     size = f"{model['size_gb']:.1f}GB"
+                     line = f" \033[93m[{option_num}]\033[0m {name} \033[2m({size})\033[0m"
+                     self.print_box_line(line, width)
+                     option_num += 1
+
+                 if len(available) > 5:
+                     line = f" \033[93m[{option_num}]\033[0m \033[2mShow all {len(available)} models...\033[0m"
+                     self.print_box_line(line, width)
+                     option_num += 1
+
+                 self.print_empty_line(width)
+                 self.print_box_separator(width)
+                 self.print_empty_line(width)
+
+             # Download new model options
+             self.print_box_line(" \033[96mDownload New Model:\033[0m", width)
+             self.print_empty_line(width)
+
+             # Show format in dimmed color
+             line = f" \033[2mEnter repository ID (e.g., meta-llama/Llama-3.2-3B)\033[0m"
+             self.print_box_line(line, width)
+
+             self.print_empty_line(width)
+             self.print_box_footer(width)
+
+             # Get user choice
+             choice = self.get_input_with_escape("Choice or repo ID")
+
+             if choice is None:
+                 return
+
+             try:
+                 choice_num = int(choice)
+
+                 # Load existing model
+                 if available and choice_num <= len(available[:5]):
+                     model = available[choice_num - 1]
+                     print(f"\n\033[96m⚡\033[0m Loading {model['name']}...")
+                     success, msg = self.model_manager.load_model(model['path'])
+                     if success:
+                         print(f"\033[32m✓\033[0m Model loaded successfully!")
+
+                         # Show template information
+                         model_info = self.model_manager.get_current_model()
+                         if model_info:
+                             tokenizer = self.model_manager.tokenizers.get(model_info.name)
+                             profile = self.template_registry.setup_model(
+                                 model_info.name,
+                                 tokenizer=tokenizer,
+                                 interactive=False
+                             )
+                             if profile:
+                                 template_name = profile.config.name
+                                 print(f" \033[2m• Template: {template_name}\033[0m")
+                     else:
+                         print(f"\033[31m✗\033[0m Failed to load: {msg}")
+                     return
+
+                 # Show all models
+                 elif available and choice_num == len(available[:5]) + 1 and len(available) > 5:
+                     print()
+                     self.manage_models()  # Use the unified model manager
+                     return
+                 else:
+                     print(f"\033[31m✗ Invalid choice\033[0m")
+                     return
+
+             except ValueError:
+                 # Not a number, treat as repository ID
+                 repo_id = choice
+                 # Check if filename is provided
+                 parts = repo_id.split()
+                 repo_id = parts[0]
+                 filename = parts[1] if len(parts) > 1 else None
+
+         # Validate format
+         if '/' not in repo_id:
+             print(f"\n\033[31m✗ Invalid format. Expected: username/model-name\033[0m")
+             return
+
+         # Show download starting
+         print(f"\n\033[96m⬇\033[0m Downloading: \033[93m{repo_id}\033[0m")
+         if filename:
+             print(f" File: \033[93m{filename}\033[0m")
+         print()
+
+         success, message, path = self.model_downloader.download_model(repo_id, filename)
+
+         if success:
+             # Success message in a nice box
+             width = min(self.get_terminal_width() - 2, 70)
+             print()
+             # Create a custom header with green color for success
+             title_with_color = " \033[32mDownload Complete\033[0m "
+             visible_len = self.get_visible_length(title_with_color)
+             padding = width - visible_len - 3  # -3 for "╭─" and "╮"
+             print(f"╭─{title_with_color}" + "─" * padding + "╮")
+             self.print_box_line(" \033[32m✓\033[0m Model downloaded successfully!", width)
+
+             location_str = str(path)[:width-13]
+             self.print_box_line(f" \033[2mLocation: {location_str}\033[0m", width)
+             self.print_empty_line(width)
+             self.print_box_line(" \033[96mLoad this model now?\033[0m", width)
+             self.print_box_line(" \033[93m[Y]es\033[0m \033[2m[N]o\033[0m", width)
+             self.print_box_footer(width)
+
+             try:
+                 choice = input("\n\033[96m▶\033[0m Choice (\033[93my\033[0m/\033[2mn\033[0m): ").strip().lower()
+                 if choice in ['y', 'yes']:
+                     print(f"\n\033[96m⚡\033[0m Loading model...")
+                     load_success, load_msg = self.model_manager.load_model(str(path))
+                     if load_success:
+                         print(f"\033[32m✓\033[0m Model loaded successfully!")
+                     else:
+                         print(f"\033[31m✗\033[0m Failed to load: {load_msg}")
+             except KeyboardInterrupt:
+                 print("\n\033[2mCancelled\033[0m")
+         else:
+             print(f"\n\033[31m✗\033[0m {message}")
+
+     def hf_login(self):
+         """Login to HuggingFace for accessing gated models."""
+         try:
+             from huggingface_hub import login, HfApi
+             from huggingface_hub.utils import HfHubHTTPError
+         except ImportError:
+             print("\n\033[31m✗\033[0m huggingface-hub not installed. Install with: pip install huggingface-hub")
+             return
+
+         width = min(self.get_terminal_width() - 2, 70)
+
+         # Create login UI box
+         print()
+         self.print_box_header("HuggingFace Login", width)
+         self.print_empty_line(width)
+
+         # Check if already logged in
+         try:
+             api = HfApi()
+             user_info = api.whoami()
+             if user_info:
+                 username = user_info.get('name', 'Unknown')
+                 self.print_box_line(f" \033[32m✓\033[0m Already logged in as: \033[93m{username}\033[0m", width)
+                 self.print_empty_line(width)
+                 self.print_box_line(" \033[96mOptions:\033[0m", width)
+                 self.print_box_line(" \033[93m[1]\033[0m Login with new token", width)
+                 self.print_box_line(" \033[93m[2]\033[0m Logout", width)
+                 self.print_box_line(" \033[93m[3]\033[0m Cancel", width)
+                 self.print_box_footer(width)
+
+                 choice = self.get_input_with_escape("Select option (1-3)")
+                 if choice == '1':
+                     # Continue to login flow
+                     pass
+                 elif choice == '2':
+                     # Logout
+                     from huggingface_hub import logout
+                     logout()
+                     print("\n\033[32m✓\033[0m Successfully logged out from HuggingFace")
+                     return
+                 else:
+                     return
+         except Exception:
+             # Not logged in, continue to login flow
+             pass
+
+         # Show login instructions
+         print()
+         self.print_box_header("HuggingFace Login", width)
+         self.print_empty_line(width)
+         self.print_box_line(" To access gated models, you need a HuggingFace token.", width)
+         self.print_empty_line(width)
+         self.print_box_line(" \033[96m1.\033[0m Get your token from:", width)
+         self.print_box_line(" \033[93mhttps://huggingface.co/settings/tokens\033[0m", width)
+         self.print_empty_line(width)
+         self.print_box_line(" \033[96m2.\033[0m Create a token with \033[93mread\033[0m permissions", width)
+         self.print_empty_line(width)
+         self.print_box_line(" \033[96m3.\033[0m Paste the token below (input hidden)", width)
+         self.print_box_footer(width)
+
+         # Get token with hidden input
+         print()
+         token = getpass.getpass("\033[96m▶\033[0m Enter token \033[2m(or press Enter to cancel)\033[0m: ")
+
+         if not token:
+             print("\033[2mCancelled\033[0m")
+             return
+
+         # Try to login
+         print("\n\033[96m⚡\033[0m Authenticating with HuggingFace...")
+         try:
+             login(token=token, add_to_git_credential=True)
+
+             # Verify login
+             api = HfApi()
+             user_info = api.whoami()
+             username = user_info.get('name', 'Unknown')
+
+             print(f"\033[32m✓\033[0m Successfully logged in as: \033[93m{username}\033[0m")
+             print("\033[2m Token saved for future use\033[0m")
+             print("\033[2m You can now download gated models\033[0m")
+
+         except HfHubHTTPError as e:
+             if "Invalid token" in str(e):
+                 print("\033[31m✗\033[0m Invalid token. Please check your token and try again.")
+             else:
+                 print(f"\033[31m✗\033[0m Login failed: {str(e)}")
+         except Exception as e:
+             print(f"\033[31m✗\033[0m Login failed: {str(e)}")
+
+     def manage_models(self, args: str = ""):
+         """Interactive model manager - simplified for better UX.
+         If args provided, tries to load that model directly."""
+
+         # If args provided, try direct load
+         if args:
+             print(f"\033[96m⚡\033[0m Loading model: \033[93m{args}\033[0m...")
+             success, message = self.model_manager.load_model(args)
+             if success:
+                 print(f"\033[32m✓\033[0m Model loaded successfully")
+             else:
+                 print(f"\033[31m✗\033[0m Failed: {message}", file=sys.stderr)
+             return
+
+         # Interactive mode
+         available = self.model_manager.discover_available_models()
+
+         if not available:
+             print(f"\n\033[31m✗\033[0m No models found in \033[2m{self.config.model.model_path}\033[0m")
+             print("Use \033[93m/download\033[0m to download models from HuggingFace")
+             return
+
+         width = min(self.get_terminal_width() - 2, 70)
+
+         # Build the model manager dialog using helper methods
+         print()
+         self.print_box_header("Select Model", width)
+         self.print_empty_line(width)
+
+         # List models with numbers - simplified view
+         for i, model in enumerate(available, 1):
+             # Model name and size
+             name = model['name'][:width-30]
+             size = f"{model['size_gb']:.1f}GB"
+
+             # Check if currently loaded (handle both original name and MLX cached name)
+             current_model = self.model_manager.current_model or ""
+             is_current = (model['name'] == current_model or
+                           model.get('mlx_name') == current_model or
+                           current_model.endswith(model['name']))
+
+             # Build status indicators
+             status_parts = []
+             if model.get('mlx_optimized'):
+                 status_parts.append("\033[36m⚡ MLX\033[0m")  # Cyan lightning for MLX
+             elif model.get('mlx_available'):
+                 status_parts.append("\033[2m○ MLX ready\033[0m")  # Dim circle for can be optimized
+
+             if is_current:
+                 status_parts.append("\033[32m● loaded\033[0m")
+
+             status = " ".join(status_parts) if status_parts else ""
+
+             # Format the line (the status string already distinguishes
+             # optimized models, so one format covers both cases)
+             line = f" \033[93m[{i}]\033[0m {name} \033[2m({size})\033[0m {status}"
+
+             self.print_box_line(line, width)
+
+         self.print_empty_line(width)
+         self.print_box_separator(width)
+         self.print_empty_line(width)
+
+         # Additional options
+         self.print_box_line(f" \033[93m[D]\033[0m Delete a model", width)
+         self.print_box_line(f" \033[93m[N]\033[0m Download new model", width)
+
+         self.print_empty_line(width)
+         self.print_box_footer(width)
+
+         # Get user choice
+         choice = self.get_input_with_escape(f"Select model to load (1-{len(available)}) or option")
+
+         if choice is None:
+             return
+
+         choice = choice.lower()
+
+         if choice == 'n':
+             self.download_model()
+             return
+         elif choice == 'd':
+             # Delete mode - show models again for deletion
+             del_choice = self.get_input_with_escape(f"Select model to delete (1-{len(available)})")
+             if del_choice is None:
+                 return
+             try:
+                 model_idx = int(del_choice) - 1
+                 if 0 <= model_idx < len(available):
+                     selected_model = available[model_idx]
+                     print(f"\n\033[31m⚠\033[0m Delete \033[93m{selected_model['name']}\033[0m?")
+                     print(f" This will free \033[93m{selected_model['size_gb']:.1f}GB\033[0m of disk space.")
+                     confirm = self.get_input_with_escape("Confirm deletion (\033[93my\033[0m/\033[2mN\033[0m)")
+                     if confirm is None:
+                         return
+                     confirm = confirm.lower()
+
+                     if confirm == 'y':
+                         # Delete the model (shutil is imported at module level)
+                         model_path = Path(selected_model['path'])
+                         try:
+                             if model_path.is_file():
+                                 model_path.unlink()
+                             elif model_path.is_dir():
+                                 shutil.rmtree(model_path)
+
+                             print(f"\033[32m✓\033[0m Model deleted successfully. Freed \033[93m{selected_model['size_gb']:.1f}GB\033[0m.")
+
+                             # If this was the current model, clear it
+                             if selected_model['name'] == self.model_manager.current_model:
+                                 self.model_manager.current_model = None
+                                 print("\033[2mNote: Deleted model was currently loaded. Load another model to continue.\033[0m")
+                         except Exception as e:
+                             print(f"\033[31m✗\033[0m Failed to delete: {str(e)}")
+                     else:
+                         print("\033[2mDeletion cancelled.\033[0m")
+             except (ValueError, IndexError):
+                 print("\033[31m✗\033[0m Invalid selection")
+             return
+
+         try:
+             model_idx = int(choice) - 1
+             if 0 <= model_idx < len(available):
+                 selected_model = available[model_idx]
+
+                 # If already loaded, inform user
+                 if selected_model['name'] == self.model_manager.current_model:
+                     print(f"\033[2mModel already loaded: {selected_model['name']}\033[0m")
+                     return
+
+                 # Load model directly - no second prompt
+                 print(f"\n\033[96m⚡\033[0m Loading \033[93m{selected_model['name']}\033[0m...")
+                 success, message = self.model_manager.load_model(selected_model['path'])
+                 if success:
+                     # Show the same detailed info as startup
+                     model_info = self.model_manager.get_current_model()
+                     if model_info:
+                         # Determine quantization type from name or model info
+                         model_name = model_info.name
+                         if "_4bit" in model_name or "4bit" in str(model_info.quantization):
+                             quant_type = "4-bit"
+                         elif "_5bit" in model_name or "5bit" in str(model_info.quantization):
+                             quant_type = "5-bit"
+                         elif "_8bit" in model_name or "8bit" in str(model_info.quantization):
+                             quant_type = "8-bit"
+                         else:
+                             quant_type = ""  # Don't duplicate "quantized"
+
+                         # Clean model name for display
+                         clean_name = selected_model['name']
+                         if clean_name.startswith("_Users_"):
+                             # Extract just the model name from the path
+                             parts = clean_name.split("_")
+                             for i, part in enumerate(parts):
+                                 if "models" in part:
+                                     clean_name = "_".join(parts[i+1:])
+                                     break
+                             clean_name = clean_name.replace("_4bit", "").replace("_5bit", "").replace("_8bit", "")
+
+                         # Format the model format nicely
+                         format_display = model_info.format.value
+                         if format_display.lower() == "mlx":
+                             format_display = "MLX (Apple Silicon optimized)"
+                         elif format_display.lower() == "gguf":
+                             format_display = "GGUF"  # Remove redundant "(quantized)"
+                         elif format_display.lower() == "safetensors":
+                             format_display = "SafeTensors"
+                         elif format_display.lower() == "pytorch":
+                             format_display = "PyTorch"
+
+                         print(f" \033[32m✓\033[0m Model ready: \033[93m{clean_name}\033[0m")
+                         # Show quantization info only if we have specific type
+                         if quant_type:
+                             print(f" \033[2m• Size: {model_info.size_gb:.1f}GB ({quant_type} quantized)\033[0m")
+                         else:
+                             print(f" \033[2m• Size: {model_info.size_gb:.1f}GB (quantized)\033[0m")
+                         print(f" \033[2m• Optimizations: AMX acceleration, operation fusion\033[0m")
+                         print(f" \033[2m• Format: {format_display}\033[0m")
+
+                         # Show template information
+                         tokenizer = self.model_manager.tokenizers.get(model_info.name)
+                         profile = self.template_registry.setup_model(
+                             model_info.name,
+                             tokenizer=tokenizer,
+                             interactive=False
+                         )
+                         if profile:
+                             template_name = profile.config.name
+                             print(f" \033[2m• Template: {template_name}\033[0m")
+                     else:
+                         print(f"\033[32m✓\033[0m Model loaded successfully!")
+                 else:
+                     print(f"\033[31m✗\033[0m Failed to load: {message}")
+             else:
+                 print("\033[31m✗\033[0m Invalid selection")
+         except ValueError:
+             print("\033[31m✗\033[0m Invalid choice")
+
+     def clear_conversation(self):
+         """Clear conversation history."""
+         self.conversation_manager.new_conversation()
+         print("\033[32m✓\033[0m Conversation cleared.")
+
+     def save_conversation(self):
+         """Save current conversation."""
+         try:
+             export_data = self.conversation_manager.export_conversation(format="json")
+             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+             filename = self.config.conversation.save_directory / f"conversation_{timestamp}.json"
+
+             with open(filename, 'w') as f:
+                 f.write(export_data)
+
+             print(f"\033[32m✓\033[0m Conversation saved to {filename}")
+         except Exception as e:
+             print(f"\033[31m✗\033[0m Failed to save: {str(e)}", file=sys.stderr)
+
+     def show_status(self):
+         """Show current setup status."""
+         is_valid, gpu_info, errors = self.gpu_validator.validate()
+
+         width = min(self.get_terminal_width() - 2, 70)  # Consistent width with other dialogs
+
+         print()
+         self.print_box_header("Current Setup", width)
+         self.print_empty_line(width)
+
+         # GPU Info
+         if gpu_info:
+             self.print_box_line(f" \033[2mGPU:\033[0m \033[93m{gpu_info.chip_name}\033[0m", width)
+             self.print_box_line(f" \033[2mCores:\033[0m \033[93m{gpu_info.gpu_cores}\033[0m", width)
+
+             mem_gb = gpu_info.total_memory / (1024**3)
+             mem_str = f"{mem_gb:.1f} GB"
+             self.print_box_line(f" \033[2mMemory:\033[0m \033[93m{mem_str}\033[0m", width)
+
+         # Model Info
+         if self.model_manager.current_model:
+             model_info = self.model_manager.get_current_model()
+             if model_info:
+                 self.print_box_line(f" \033[2mModel:\033[0m \033[93m{model_info.name[:43]}\033[0m", width)
+
+                 # Template info
+                 tokenizer = self.model_manager.tokenizers.get(model_info.name)
+                 profile = self.template_registry.get_template(model_info.name)
+                 if profile:
+                     template_name = profile.config.name
+                     self.print_box_line(f" \033[2mTemplate:\033[0m \033[93m{template_name}\033[0m", width)
+         else:
+             self.print_box_line(f" \033[2mModel:\033[0m \033[31mNone loaded\033[0m", width)
+
+         self.print_empty_line(width)
+         self.print_box_footer(width)
+
+     def show_gpu_status(self):
+         """Show GPU status."""
+         is_valid, gpu_info, errors = self.gpu_validator.validate()
+         if gpu_info:
+             print(f"\n\033[96mGPU Information:\033[0m")
+             print(f" Chip: \033[93m{gpu_info.chip_name}\033[0m")
+             print(f" GPU Cores: \033[93m{gpu_info.gpu_cores}\033[0m")
+             print(f" Total Memory: \033[93m{gpu_info.total_memory / (1024**3):.1f} GB\033[0m")
+             print(f" Available Memory: \033[93m{gpu_info.available_memory / (1024**3):.1f} GB\033[0m")
+             # Build the colored Yes/No strings outside the f-strings: backslash
+             # escapes are not allowed inside f-string expressions before
+             # Python 3.12, so the inline form is a SyntaxError there.
+             metal_support = "\033[32mYes\033[0m" if gpu_info.has_metal else "\033[31mNo\033[0m"
+             mps_support = "\033[32mYes\033[0m" if gpu_info.has_mps else "\033[31mNo\033[0m"
+             print(f" Metal Support: {metal_support}")
+             print(f" MPS Support: {mps_support}")
+
+         memory_status = self.model_manager.get_memory_status()
+         print(f"\n\033[96mMemory Status:\033[0m")
+         print(f" Available: \033[93m{memory_status['available_gb']:.1f} GB\033[0m")
+         print(f" Models Loaded: \033[93m{memory_status['models_loaded']}\033[0m")
+         print(f" Model Memory: \033[93m{memory_status['model_memory_gb']:.1f} GB\033[0m")
+
+     def run_benchmark(self):
+         """Run performance benchmark."""
+         if not self.model_manager.current_model:
+             print("\033[31m✗\033[0m No model loaded.")
+             return
+
+         print("\033[96m⚡\033[0m Running benchmark (100 tokens)...")
+         metrics = self.inference_engine.benchmark()
+
+         if metrics:
+             print(f"\n\033[96mBenchmark Results:\033[0m")
+             print(f" Tokens Generated: \033[93m{metrics.tokens_generated}\033[0m")
+             print(f" Time: \033[93m{metrics.time_elapsed:.2f}s\033[0m")
+             print(f" Tokens/Second: \033[93m{metrics.tokens_per_second:.1f}\033[0m")
+             print(f" First Token: \033[93m{metrics.first_token_latency:.3f}s\033[0m")
+             print(f" GPU Usage: \033[93m{metrics.gpu_utilization:.1f}%\033[0m")
+             print(f" Memory: \033[93m{metrics.memory_used_gb:.1f}GB\033[0m")
+
+     def manage_template(self, args: str = ""):
+         """Manage template configuration for the current model."""
+         if not self.model_manager.current_model:
+             print("\033[31m✗\033[0m No model loaded.")
+             return
+
+         model_name = self.model_manager.current_model
+         tokenizer = self.model_manager.tokenizers.get(model_name)
+
+         # If args provided, handle specific subcommands
+         if args:
+             args_parts = args.split()
+             subcommand = args_parts[0].lower()
+
+             if subcommand == "reset":
+                 if self.template_registry.reset_model_config(model_name):
+                     print(f"\033[32m✓\033[0m Template configuration reset for {model_name}")
+                 else:
+                     print(f"\033[31m✗\033[0m No configuration found for {model_name}")
+                 return
+             elif subcommand == "status":
+                 config = self.template_registry.config_manager.get_model_config(model_name)
+                 if config:
+                     self.template_registry.interactive.show_current_config(model_name, config)
+                 else:
+                     print(f"\033[33m⚠\033[0m No template configuration for {model_name}")
+                 return
+
+         # Interactive template configuration
+         print(f"\n\033[96m⚙\033[0m Configuring template for: \033[93m{model_name}\033[0m")
+
+         # Force interactive setup
+         profile = self.template_registry.setup_model(
+             model_name,
+             tokenizer=tokenizer,
+             interactive=True,
+             force_setup=True
+         )
+
+         print(f"\n\033[32m✓\033[0m Template configured successfully!")
+
+     def run_finetune(self):
+         """Run the interactive fine-tuning wizard."""
+         # Check if any models are available
+         available = self.model_manager.discover_available_models()
+         if not available:
+             print(f"\n\033[31m✗\033[0m No models found. Use \033[93m/download\033[0m to download a model first.")
+             return
+
+         # Pass CLI instance to wizard so it can use the box methods
+         self.fine_tune_wizard.cli = self
+
+         # Run the wizard
+         success, message = self.fine_tune_wizard.start()
+
+         if success:
+             print(f"\n\033[32m✓\033[0m {message}")
+         else:
+             if "cancelled" not in message.lower():
+                 print(f"\n\033[31m✗\033[0m {message}")
+             # If cancelled, wizard already handles the message
+
+     def generate_response(self, user_input: str):
+         """Generate and stream response from the model."""
+         if not self.model_manager.current_model:
+             print("\n\033[31m✗\033[0m No model loaded. Use \033[93m/model\033[0m to load a model or \033[93m/download\033[0m to download one.")
+             return
+
+         # Get current model name and tokenizer
+         model_name = self.model_manager.current_model
+         tokenizer = self.model_manager.tokenizers.get(model_name)
+
+         # Setup model template to get the profile
+         template_profile = None
+         uses_reasoning_template = False
+         try:
+             template_profile = self.template_registry.setup_model(
+                 model_name,
+                 tokenizer=tokenizer,
+                 interactive=False
+             )
+             # Check if this is a reasoning template
+             if template_profile and hasattr(template_profile.config, 'template_type'):
+                 from cortex.template_registry.template_profiles.base import TemplateType
+                 uses_reasoning_template = (template_profile.config.template_type == TemplateType.REASONING)
+         except Exception as e:
+             logger.debug(f"Failed to get template profile: {e}")
+
+         # Build conversation context with proper formatting BEFORE adding to conversation
+         formatted_prompt = self._format_prompt_with_chat_template(user_input)
+
+         # DEBUG: Uncomment these lines to see the exact prompt being sent to the model
+         # This is crucial for debugging when models give unexpected responses
+         # It shows the formatted prompt with all special tokens and formatting
+         # print(f"\033[33m[DEBUG] Formatted prompt being sent to model:\033[0m", file=sys.stderr)
+         # print(f"\033[33m{repr(formatted_prompt[:200])}...\033[0m", file=sys.stderr)
+
+         # Now add user message to conversation history
+         self.conversation_manager.add_message(MessageRole.USER, user_input)
+
+         # Start response on a new line; prefix is rendered with the markdown output.
+         print()
+
+         # Get stop sequences from template profile
+         stop_sequences = []
+         if template_profile and hasattr(template_profile, 'get_stop_sequences'):
+             try:
+                 stop_sequences = template_profile.get_stop_sequences()
+                 logger.debug(f"Using stop sequences from template: {stop_sequences}")
+             except Exception as e:
+                 logger.debug(f"Could not get stop sequences: {e}")
+
+         # Create generation request with formatted prompt
+         # Use lower temperature for more focused responses
+         request = GenerationRequest(
+             prompt=formatted_prompt,
+             max_tokens=self.config.inference.max_tokens,
+             temperature=0.3,  # Lower temperature for less randomness
+             top_p=0.9,  # Slightly lower top_p
+             top_k=self.config.inference.top_k,
+             repetition_penalty=self.config.inference.repetition_penalty,
+             stream=True,
+             stop_sequences=stop_sequences
+         )
+
+         # Generate response
+         self.generating = True
+         generated_text = ""
+         start_time = time.time()
+         token_count = 0
+         first_token_time = None
+
+         try:
+             # Reset streaming state for reasoning templates if supported
+             if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+                 if hasattr(template_profile, 'reset_streaming_state'):
+                     template_profile.reset_streaming_state()
+
+             display_text = ""
+             accumulated_response = ""
+             last_render_time = 0.0
+             render_interval = 0.05  # seconds
+             prefix_style = Style(color="cyan")
+
+             def build_renderable(text: str):
+                 markdown = ThinkMarkdown(text, code_theme="monokai", use_line_numbers=False)
+                 return PrefixedRenderable(markdown, prefix="⏺ ", prefix_style=prefix_style, indent=" ")
+
+             with Live(
+                 build_renderable(""),
+                 console=self.console,
+                 refresh_per_second=20,
+                 transient=False,
+             ) as live:
+                 for token in self.inference_engine.generate(request):
+                     if first_token_time is None:
+                         first_token_time = time.time()
+
+                     generated_text += token
+                     token_count += 1
+
+                     display_token = token
+                     if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+                         display_token, should_display = template_profile.process_streaming_response(
+                             token, accumulated_response
+                         )
+                         accumulated_response += token
+                         if not should_display:
+                             display_token = ""
+
+                     if display_token:
+                         display_text += display_token
+
+                     now = time.time()
+                     if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
+                         live.update(build_renderable(display_text))
+                         last_render_time = now
+
+                 if uses_reasoning_template and template_profile:
+                     final_text = template_profile.process_response(generated_text)
+                     generated_text = final_text
+                     if not template_profile.config.show_reasoning:
+                         display_text = final_text
+
+                 live.update(build_renderable(display_text))
+
+             # Add blank line for spacing between response and metrics
+             print()
+
+             # Display final metrics in a clean, professional way
+             elapsed = time.time() - start_time
+             if token_count > 0 and elapsed > 0:
+                 tokens_per_sec = token_count / elapsed
+                 first_token_latency = first_token_time - start_time if first_token_time else 0
+
+                 # Build metrics parts - all will be wrapped in dim for subtlety
+                 metrics_parts = []
+
+                 if first_token_latency > 0.1:
+                     # First token latency
+                     metrics_parts.append(f"first {first_token_latency:.2f}s")
+
+                 # Total time
+                 metrics_parts.append(f"total {elapsed:.1f}s")
+
+                 # Token count
+                 metrics_parts.append(f"tokens {token_count}")
+
+                 # Throughput
+                 metrics_parts.append(f"speed {tokens_per_sec:.1f} tok/s")
+
+                 # Print entire metrics line as dim/secondary to make it less prominent
+                 # Indent metrics to align with response text
+                 metrics_line = " · ".join(metrics_parts)
+                 print(f" \033[2m{metrics_line}\033[0m")
+
+             # Add assistant message to conversation history
+             self.conversation_manager.add_message(MessageRole.ASSISTANT, generated_text)
+
+         except Exception as e:
+             print(f"\n\033[31m✗ Error:\033[0m {str(e)}", file=sys.stderr)
+
+         finally:
+             self.generating = False
+
+     def get_user_input(self) -> str:
+         """Get user input with standard prompt."""
+         try:
+             print()
+             user_input = input("> ")
+             return user_input.strip()
+         except (KeyboardInterrupt, EOFError):
+             raise
+
+     def _format_prompt_with_chat_template(self, user_input: str) -> str:
+         """Format the prompt with appropriate chat template for the model."""
+         # Get current conversation context
+         conversation = self.conversation_manager.get_current_conversation()
+
+         # Get the tokenizer for the current model
+         model_name = self.model_manager.current_model
+         tokenizer = self.model_manager.tokenizers.get(model_name)
+
+         # Build messages list from conversation history
+         messages = []
+
+         # Add conversation history if exists
+         if conversation and conversation.messages:
+             # Include recent context (last few messages)
+             context_messages = conversation.messages[-10:]  # Last 10 messages for context
+             for msg in context_messages:
+                 messages.append({
+                     "role": msg.role.value,
+                     "content": msg.content
+                 })
+
+         # Add current user message
+         messages.append({
+             "role": "user",
+             "content": user_input
+         })
+
+         # Use template registry to format messages
+         try:
+             # Setup model template if not already configured
+             profile = self.template_registry.setup_model(
+                 model_name,
+                 tokenizer=tokenizer,
+                 interactive=False  # Non-interactive for smoother experience
+             )
+
+             # Format messages using the template
+             formatted = profile.format_messages(messages, add_generation_prompt=True)
+
+             # DEBUG: Uncomment to see formatted prompt
+             # print(f"\033[36m[DEBUG] Using template: {profile.config.name}\033[0m", file=sys.stderr)
+             # print(f"\033[36m[DEBUG] Formatted prompt preview: {formatted[:200]}...\033[0m", file=sys.stderr)
+
+             return formatted
+
+         except (AttributeError, TypeError, ValueError) as e:
+             # Fallback to old method if template registry fails
+             logger.debug(f"Template registry failed: {e}, using fallback")
+
+         if tokenizer and hasattr(tokenizer, 'apply_chat_template'):
+             # Try direct tokenizer method
+             try:
+                 formatted = tokenizer.apply_chat_template(
+                     messages,
+                     tokenize=False,
+                     add_generation_prompt=True
+                 )
+                 return formatted
+             except (AttributeError, TypeError, ValueError) as e:
+                 logger.debug(f"Tokenizer apply_chat_template failed: {e}")
+
+         # Fallback: For TinyLlama and other chat models, use the proper format
+         # Check if it's a chat model
+         if model_name and "chat" in model_name.lower():
+             # DEBUG: Uncomment to see when fallback chat format is used
+             # This occurs when tokenizer doesn't have apply_chat_template method
+             # print(f"\033[35m[DEBUG] Using chat model fallback for: {model_name}\033[0m", file=sys.stderr)
+
+             # Use the proper chat format for TinyLlama and similar models
+             # Build conversation history
+             history = ""
+             if conversation and conversation.messages:
+                 recent_messages = conversation.messages[-6:]  # Get last few messages
+                 for msg in recent_messages:
+                     if msg.role == MessageRole.USER:
+                         history += f"<|user|>\n{msg.content}</s>\n"
+                     elif msg.role == MessageRole.ASSISTANT:
+                         history += f"<|assistant|>\n{msg.content}</s>\n"
+
+             # Add current user message with proper format
+             prompt = f"{history}<|user|>\n{user_input}</s>\n<|assistant|>\n"
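+             # E.g. with no history and user_input "Hi", this produces:
+             #   "<|user|>\nHi</s>\n<|assistant|>\n"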
+
+             # DEBUG: Uncomment to confirm fallback format was applied
+             # print(f"\033[35m[DEBUG] Chat fallback format used\033[0m", file=sys.stderr)
+             return prompt
+
+         # Generic fallback for non-chat models
+         if conversation and len(conversation.messages) > 0:
+             # Include some conversation history
+             context = ""
+             recent_messages = conversation.messages[-6:]  # Get last few messages
+             for msg in recent_messages:
+                 if msg.role == MessageRole.USER:
+                     context += f"User: {msg.content}\n"
+                 elif msg.role == MessageRole.ASSISTANT:
+                     context += f"Assistant: {msg.content}\n"
+
+             # Add current exchange
+             prompt = f"{context}User: {user_input}\nAssistant:"
+         else:
+             # First message in conversation - use simple format
+             prompt = f"User: {user_input}\nAssistant:"
+
+         return prompt
+
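+     # For reference, the generic fallback above produces prompts like:
+     #   "User: hello\nAssistant: hi!\nUser: <new input>\nAssistant:"
+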
1365
+ def get_input_from_box(self) -> str:
1366
+ """Get user input from a styled input box.
1367
+
1368
+ Displays a green-bordered input box, collects user input, then converts
1369
+ the box to a simple prompt in the conversation history.
1370
+
1371
+ Guarantees that the input box is fully cleared after submission so
1372
+ no borders/pipes remain on screen.
1373
+ """
1374
+ width = self.get_terminal_width()
1375
+
1376
+ # ANSI codes
1377
+ GREEN = "\033[92m"
1378
+ YELLOW = "\033[93m"
1379
+ DIM = "\033[2m"
1380
+ RESET = "\033[0m"
1381
+ CLEAR_LINE = "\033[2K"
1382
+ CLEAR_TO_EOL = "\033[K"
1383
+ CURSOR_UP = "\033[A"
1384
+ CURSOR_DOWN = "\033[B"
1385
+ MOVE_COL = lambda n: f"\033[{n}G"
1386
+
1387
+ # Get current model name for display
1388
+ current_model = ""
1389
+ if self.model_manager.current_model:
1390
+ model_name = os.path.basename(self.model_manager.current_model)
1391
+ # Display full model name without truncation
1392
+ current_model = f"{DIM}Model:{RESET} {YELLOW}{model_name}{RESET}"
1393
+
1394
+ # Draw the input box with dim borders
+         print()
+         print(f"{DIM}╭{'─' * (width - 2)}╮{RESET}")
+         print(f"{DIM}│{RESET}{' ' * (width - 2)}{DIM}│{RESET}")
+         print(f"{DIM}│{RESET}{' ' * (width - 2)}{DIM}│{RESET}")
+         print(f"{DIM}│{RESET}{' ' * (width - 2)}{DIM}│{RESET}")
+         print(f"{DIM}╰{'─' * (width - 2)}╯{RESET}")
+
+         # Bottom hint: show the current model aligned with the box
+         if current_model:
+             print(f"{current_model}")
+         else:
+             print()  # Empty line if no model is loaded
+
+         # Move cursor to the input position inside the box
+         sys.stdout.write("\033[4A")  # Move up 4 lines to the input line (the middle interior line)
+         sys.stdout.write(f"\r{DIM}│{RESET} > ")  # Position at the prompt
+         sys.stdout.flush()
+
+         try:
+             # Get user input with custom character handling
+             user_input = self._get_protected_input(width)
+
+             # After _get_protected_input returns, the cursor is at the start of the
+             # bottom border line (due to the CRLFs written when Enter was pressed).
+             # Explicitly clear the entire input box region using relative moves.
+             # 1) Clear hint line (one line below the bottom border)
+             sys.stdout.write(f"{CURSOR_DOWN}\r{CLEAR_LINE}")
+             # 2) Clear bottom border
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")
+             # 3) Clear padding line
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")
+             # 4) Clear input line
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")
+             # 5) Clear padding line
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")
+             # 6) Clear top border
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")
+
+             # Position the cursor at the start of where the top border was and print
+             # the clean prompt that represents the submitted user message.
+             sys.stdout.write("\r> " + user_input.strip() + "\n")
+             sys.stdout.flush()
+
+             return user_input.strip()
+
+         except KeyboardInterrupt:
+             # Cleanup already done in _get_protected_input before raising
+             raise
+         except EOFError:
+             # Clean up the box on Ctrl+D by clearing the lines if possible.
+             # We are on the input line.
+             sys.stdout.write(f"\r{CLEAR_LINE}")                # input line
+             sys.stdout.write(f"{CURSOR_DOWN}\r{CLEAR_LINE}")   # padding line
+             sys.stdout.write(f"{CURSOR_DOWN}\r{CLEAR_LINE}")   # bottom border
+             sys.stdout.write(f"{CURSOR_DOWN}\r{CLEAR_LINE}")   # hint line
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")     # bottom border (again)
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")     # padding line (again)
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")     # input line (again)
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")     # padding line
+             sys.stdout.write(f"{CURSOR_UP}\r{CLEAR_LINE}")     # top border
+             sys.stdout.flush()
+             raise
+
+     def _get_protected_input(self, box_width: int) -> str:
+         """Get input with protection against deleting the prompt.
+
+         Reads input character by character and prevents the user from
+         backspacing past the beginning of their input.
+         """
+         DIM = "\033[2m"
+         RESET = "\033[0m"
+         CLEAR_TO_END = "\033[K"
+         SAVE_CURSOR = "\033[s"
+         RESTORE_CURSOR = "\033[u"
+
+         # Calculate the usable width for text:
+         # box_width - 2 (borders) - 4 (prompt "│ > ")
+         max_display_width = box_width - 6
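+         # e.g. in an 80-column terminal: 80 - 6 = 74 columns of editable text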
+
+         # Store terminal settings
+         old_settings = termios.tcgetattr(sys.stdin)
+
+         try:
+             # Put the terminal into a raw-like mode for character-by-character input.
+             # ISIG is disabled so Ctrl+C can be handled manually for a clean exit.
+             new_settings = termios.tcgetattr(sys.stdin)
+             new_settings[3] = new_settings[3] & ~termios.ICANON  # Disable canonical mode
+             new_settings[3] = new_settings[3] & ~termios.ECHO    # Disable echo
+             new_settings[3] = new_settings[3] & ~termios.ISIG    # Disable signals - Ctrl+C is handled manually
+             new_settings[6][termios.VMIN] = 1   # Read at least 1 character
+             new_settings[6][termios.VTIME] = 0  # No timeout
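+             # With VMIN=1 and VTIME=0, each sys.stdin.read(1) below blocks until
+             # exactly one byte is available.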
+             termios.tcsetattr(sys.stdin, termios.TCSADRAIN, new_settings)
+
+             input_buffer = []
+             cursor_pos = 0
+             view_offset = 0  # For horizontal scrolling when text exceeds the box width
+
+             def redraw_line():
+                 """Redraw the entire input line with proper boundaries."""
+                 nonlocal view_offset
+
+                 # Calculate what portion of the text to display
+                 if len(input_buffer) <= max_display_width:
+                     # Text fits within the box
+                     display_text = ''.join(input_buffer)
+                     display_cursor_pos = cursor_pos
+                 else:
+                     # Text needs scrolling.
+                     # Ensure the cursor is visible in the viewport.
+                     if cursor_pos < view_offset:
+                         # Cursor moved left out of view
+                         view_offset = cursor_pos
+                     elif cursor_pos >= view_offset + max_display_width:
+                         # Cursor moved right out of view
+                         view_offset = cursor_pos - max_display_width + 1
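+                         # Worked example: with max_display_width == 40 and
+                         # cursor_pos == 50, view_offset becomes 50 - 40 + 1 = 11, so
+                         # the visible slice is [11, 51) and the cursor sits in the
+                         # last visible column.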
+
+                     # Extract the visible portion
+                     display_text = ''.join(input_buffer[view_offset:view_offset + max_display_width])
+                     display_cursor_pos = cursor_pos - view_offset
+
+                 # Clear the line and redraw
+                 sys.stdout.write(f"\r{DIM}│{RESET} > {display_text}{CLEAR_TO_END}")
+
+                 # Draw the right border at the correct position.
+                 # box_width is the full width including borders, so the border sits in column box_width.
+                 sys.stdout.write(f"\033[{box_width}G")  # Move to the border column
+                 sys.stdout.write(f"{DIM}│{RESET}")
+
+                 # Position the cursor at the correct location
+                 cursor_column = 5 + display_cursor_pos  # "│ > " occupies columns 1-4, so text starts at column 5
+                 sys.stdout.write(f"\033[{cursor_column}G")
+                 sys.stdout.flush()
+
+             # Initial display
+             redraw_line()
+
+             while True:
+                 char = sys.stdin.read(1)
+
+                 # Handle special characters
+                 if char == '\r' or char == '\n':  # Enter key
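+                     # Two CRLFs move the cursor from the input line down to the start
+                     # of the bottom border, where get_input_from_box() expects it.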
+                     sys.stdout.write('\r\n')
+                     sys.stdout.write('\r\n')
+                     sys.stdout.flush()
+                     break
+
+                 elif char == '\x7f' or char == '\x08':  # Backspace (DEL or BS)
+                     # Only allow backspace if there are characters to delete
+                     if cursor_pos > 0:
+                         cursor_pos -= 1
+                         input_buffer.pop(cursor_pos)
+                         redraw_line()
+                     # If cursor_pos is 0, do nothing (the prompt cannot be deleted)
+
+                 elif char == '\x03':  # Ctrl+C
+                     # Clean up the display before raising KeyboardInterrupt.
+                     # We're on the input line and need to clear the entire box.
+                     sys.stdout.write("\r\033[2K")         # Clear current line
+                     sys.stdout.write("\033[1B\r\033[2K")  # Down 1, clear padding line
+                     sys.stdout.write("\033[1B\r\033[2K")  # Down 1, clear bottom border
+                     sys.stdout.write("\033[1B\r\033[2K")  # Down 1, clear model line
+                     sys.stdout.write("\033[4A\r\033[2K")  # Up 4 to padding line, clear
+                     sys.stdout.write("\033[1A\r\033[2K")  # Up 1 to top border, clear
+                     sys.stdout.write("\033[1A\r\033[2K")  # Up 1 to empty line, clear
+                     sys.stdout.write("\r")                # Position at start
+                     sys.stdout.flush()
+                     # Now raise the interrupt for a clean exit
+                     raise KeyboardInterrupt
+
+                 elif char == '\x04':  # Ctrl+D
+                     raise EOFError
+
+                 elif char == '\x1b':  # ESC sequence (arrow keys, etc.)
+                     # Read the rest of the escape sequence
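+                     # e.g. the Left-arrow key arrives as the three bytes ESC '[' 'D'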
+                     next1 = sys.stdin.read(1)
+                     if next1 == '[':
+                         next2 = sys.stdin.read(1)
+                         if next2 == 'D':  # Left arrow
+                             if cursor_pos > 0:
+                                 cursor_pos -= 1
+                                 redraw_line()
+                         elif next2 == 'C':  # Right arrow
+                             if cursor_pos < len(input_buffer):
+                                 cursor_pos += 1
+                                 redraw_line()
+                         elif next2 == 'H':  # Home
+                             cursor_pos = 0
+                             view_offset = 0
+                             redraw_line()
+                         elif next2 == 'F':  # End
+                             cursor_pos = len(input_buffer)
+                             redraw_line()
+                     # For other sequences, continue without action
+                     continue
+
+                 elif ord(char) >= 32:  # Printable character
+                     # Insert the character at the cursor position
+                     input_buffer.insert(cursor_pos, char)
+                     cursor_pos += 1
+                     redraw_line()
+
+             return ''.join(input_buffer)
+
+         finally:
+             # Restore terminal settings
+             termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
+
+     def run(self):
+         """Main REPL loop."""
+         self.print_welcome()
+         self.load_default_model()
+
+         # Start a new conversation
+         self.conversation_manager.new_conversation()
+
+         while self.running:
+             try:
+                 # Get input from the styled box
+                 user_input = self.get_input_from_box()
+
+                 if not user_input:
+                     continue
+
+                 # Check for exit commands
+                 if user_input.lower() in ['quit', 'exit']:
+                     break
+
+                 # Handle shortcuts
+                 if user_input == '?':
+                     self.show_shortcuts()
+                     # Don't increment the message count for shortcuts
+                     continue
+
+                 # Handle slash commands
+                 if user_input.startswith('/'):
+                     if not self.handle_command(user_input):
+                         break
+                     # Don't increment the message count for commands
+                     continue
+
+                 # Generate a response
+                 self.generate_response(user_input)
+
+             except EOFError:
+                 break
+             except KeyboardInterrupt:
+                 # Clean exit on Ctrl+C, same as /quit
+                 break
+             except Exception as e:
+                 print(f"\033[31m✗ Error:\033[0m {str(e)}", file=sys.stderr)
+
+         print("\n\033[2mGoodbye!\033[0m")
+
+
+ def main():
+     """Main entry point for the CLI."""
+     # Initialize components
+     config = Config()
+     gpu_validator = GPUValidator()
+
+     # Validate the GPU
+     is_valid, gpu_info, errors = gpu_validator.validate()
+     if not is_valid:
+         print("GPU validation failed. Cortex requires Apple Silicon with Metal support.")
+         for error in errors:
+             print(f"  - {error}")
+         sys.exit(1)
+
+     # Initialize managers
+     model_manager = ModelManager(config, gpu_validator)
+     inference_engine = InferenceEngine(config, model_manager)
+     conversation_manager = ConversationManager(config)
+
+     # Create and run the CLI
+     cli = CortexCLI(
+         config=config,
+         gpu_validator=gpu_validator,
+         model_manager=model_manager,
+         inference_engine=inference_engine,
+         conversation_manager=conversation_manager
+     )
+
+     cli.run()
+
+
+ if __name__ == "__main__":
+     main()