cortex-llm 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. cortex/__init__.py +73 -0
  2. cortex/__main__.py +83 -0
  3. cortex/config.py +329 -0
  4. cortex/conversation_manager.py +468 -0
  5. cortex/fine_tuning/__init__.py +8 -0
  6. cortex/fine_tuning/dataset.py +332 -0
  7. cortex/fine_tuning/mlx_lora_trainer.py +502 -0
  8. cortex/fine_tuning/trainer.py +957 -0
  9. cortex/fine_tuning/wizard.py +707 -0
  10. cortex/gpu_validator.py +467 -0
  11. cortex/inference_engine.py +727 -0
  12. cortex/metal/__init__.py +275 -0
  13. cortex/metal/gpu_validator.py +177 -0
  14. cortex/metal/memory_pool.py +886 -0
  15. cortex/metal/mlx_accelerator.py +678 -0
  16. cortex/metal/mlx_converter.py +638 -0
  17. cortex/metal/mps_optimizer.py +417 -0
  18. cortex/metal/optimizer.py +665 -0
  19. cortex/metal/performance_profiler.py +364 -0
  20. cortex/model_downloader.py +130 -0
  21. cortex/model_manager.py +2187 -0
  22. cortex/quantization/__init__.py +5 -0
  23. cortex/quantization/dynamic_quantizer.py +736 -0
  24. cortex/template_registry/__init__.py +15 -0
  25. cortex/template_registry/auto_detector.py +144 -0
  26. cortex/template_registry/config_manager.py +234 -0
  27. cortex/template_registry/interactive.py +260 -0
  28. cortex/template_registry/registry.py +347 -0
  29. cortex/template_registry/template_profiles/__init__.py +5 -0
  30. cortex/template_registry/template_profiles/base.py +142 -0
  31. cortex/template_registry/template_profiles/complex/__init__.py +5 -0
  32. cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
  33. cortex/template_registry/template_profiles/standard/__init__.py +9 -0
  34. cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
  35. cortex/template_registry/template_profiles/standard/chatml.py +82 -0
  36. cortex/template_registry/template_profiles/standard/gemma.py +103 -0
  37. cortex/template_registry/template_profiles/standard/llama.py +87 -0
  38. cortex/template_registry/template_profiles/standard/simple.py +65 -0
  39. cortex/ui/__init__.py +120 -0
  40. cortex/ui/cli.py +1685 -0
  41. cortex/ui/markdown_render.py +185 -0
  42. cortex/ui/terminal_app.py +534 -0
  43. cortex_llm-1.0.0.dist-info/METADATA +275 -0
  44. cortex_llm-1.0.0.dist-info/RECORD +48 -0
  45. cortex_llm-1.0.0.dist-info/WHEEL +5 -0
  46. cortex_llm-1.0.0.dist-info/entry_points.txt +2 -0
  47. cortex_llm-1.0.0.dist-info/licenses/LICENSE +21 -0
  48. cortex_llm-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,332 @@
1
+ """Dataset preparation utilities for fine-tuning."""
2
+
3
+ import json
4
+ import csv
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import List, Dict, Any, Tuple, Optional
8
+ import re
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class DatasetPreparer:
    """Prepare and validate datasets for fine-tuning.

    Accepts ``.jsonl``, ``.json``, ``.csv`` and ``.txt`` inputs. Every example
    is normalized to either ``{'prompt', 'response'}`` or ``{'text'}``;
    non-JSONL inputs are converted to a JSONL file the trainer can consume.
    """

    SUPPORTED_FORMATS = ['.jsonl', '.json', '.csv', '.txt']
    MIN_EXAMPLES = 5       # reject datasets smaller than this
    MAX_EXAMPLES = 10000   # truncate datasets larger than this (with a warning)

    # Column-name pairs recognized in CSV files, tried in order.
    _CSV_COLUMN_PAIRS = [
        ('prompt', 'response'),
        ('instruction', 'output'),
        ('question', 'answer'),
    ]

    # "Q: ... A: ..." question/answer pairs in plain text.
    _QA_PATTERN = re.compile(
        r'(?:Q:|Question:)\s*(.*?)\s*(?:A:|Answer:)\s*(.*?)(?=(?:Q:|Question:)|$)',
        re.DOTALL | re.IGNORECASE,
    )
    # "User: ... Assistant: ..." conversation turns in plain text.
    _CONV_PATTERN = re.compile(
        r'(?:User|Human):\s*(.*?)\s*(?:Assistant|AI|Bot):\s*(.*?)(?=(?:User|Human):|$)',
        re.DOTALL | re.IGNORECASE,
    )

    def validate_dataset(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
        """
        Validate and prepare a dataset file.

        Args:
            file_path: Path to the dataset file

        Returns:
            Tuple of (is_valid, message, processed_path)
        """
        try:
            if not file_path.exists():
                return False, f"File not found: {file_path}", None

            suffix = file_path.suffix.lower()
            if suffix not in self.SUPPORTED_FORMATS:
                return False, f"Unsupported format. Supported: {', '.join(self.SUPPORTED_FORMATS)}", None

            # Dispatch on extension; every supported suffix has a handler.
            validators = {
                '.jsonl': self._validate_jsonl,
                '.json': self._validate_json,
                '.csv': self._validate_csv,
                '.txt': self._validate_txt,
            }
            return validators[suffix](file_path)

        except Exception as e:
            logger.error(f"Error validating dataset: {e}")
            return False, f"Validation error: {str(e)}", None

    @staticmethod
    def _normalize_example(record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Map a raw record onto the standard schema, or return None.

        Records that already carry 'prompt'/'response' or 'text' are kept
        as-is (extra keys preserved); common alternative key pairs are
        converted to 'prompt'/'response'.
        """
        if 'prompt' in record and 'response' in record:
            return record
        if 'text' in record:
            return record
        if 'instruction' in record and 'output' in record:
            return {'prompt': record['instruction'], 'response': record['output']}
        if 'question' in record and 'answer' in record:
            return {'prompt': record['question'], 'response': record['answer']}
        return None

    def _check_count(
        self, examples: List[Dict[str, Any]]
    ) -> Tuple[Optional[str], List[Dict[str, Any]]]:
        """Enforce MIN_EXAMPLES / MAX_EXAMPLES on a parsed example list.

        Returns:
            Tuple of (error_message, examples). error_message is None when
            the count is acceptable; examples may be truncated to
            MAX_EXAMPLES.
        """
        if len(examples) < self.MIN_EXAMPLES:
            return f"Too few examples ({len(examples)}). Minimum: {self.MIN_EXAMPLES}", examples
        if len(examples) > self.MAX_EXAMPLES:
            logger.warning(f"Dataset has {len(examples)} examples. Will use first {self.MAX_EXAMPLES}")
            examples = examples[:self.MAX_EXAMPLES]
        return None, examples

    @staticmethod
    def _write_jsonl(examples: List[Dict[str, Any]], path: Path) -> None:
        """Write examples to *path*, one JSON object per line."""
        with open(path, 'w', encoding='utf-8') as f:
            for example in examples:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

    def _validate_jsonl(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
        """Validate a JSONL dataset; every non-blank line must be a usable record."""
        try:
            examples = []
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue

                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError as e:
                        return False, f"Invalid JSON at line {line_num}: {e}", None

                    example = self._normalize_example(data)
                    if example is None:
                        return False, f"Line {line_num}: Missing required fields (need 'prompt'/'response' or 'text')", None
                    examples.append(example)

            original_count = len(examples)
            error, examples = self._check_count(examples)
            if error:
                return False, error, None

            # If the dataset was truncated, the processed copy must be saved
            # even when every example already matches the standard schema.
            processed_path = self._save_processed_dataset(
                examples, file_path, truncated=len(examples) < original_count
            )
            return True, f"Valid dataset with {len(examples)} examples", processed_path

        except Exception as e:
            return False, f"Error reading JSONL file: {e}", None

    def _validate_json(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
        """Validate a JSON dataset and convert it to JSONL."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Accept a bare list of records, a dict wrapping one under
            # 'examples'/'data', or a single record dict.
            if isinstance(data, list):
                candidates = data
            elif isinstance(data, dict):
                if isinstance(data.get('examples'), list):
                    candidates = data['examples']
                elif isinstance(data.get('data'), list):
                    candidates = data['data']
                else:
                    candidates = [data]
            else:
                candidates = []

            # Every candidate is validated/normalized, including those pulled
            # out of nested 'examples'/'data' wrappers.
            examples = []
            for item in candidates:
                if isinstance(item, dict):
                    example = self._normalize_example(item)
                    if example is not None:
                        examples.append(example)

            if not examples:
                return False, "No valid examples found in JSON file", None

            error, examples = self._check_count(examples)
            if error:
                return False, error, None

            processed_path = file_path.with_suffix('.jsonl')
            self._write_jsonl(examples, processed_path)
            return True, f"Converted JSON to JSONL with {len(examples)} examples", processed_path

        except Exception as e:
            return False, f"Error reading JSON file: {e}", None

    def _validate_csv(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
        """Validate a CSV dataset and convert it to JSONL."""
        try:
            examples = []
            # newline='' is required by the csv module for correct quoting.
            with open(file_path, 'r', encoding='utf-8', newline='') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    for prompt_col, response_col in self._CSV_COLUMN_PAIRS:
                        if prompt_col in row and response_col in row:
                            examples.append({
                                'prompt': row[prompt_col],
                                'response': row[response_col],
                            })
                            break
                    else:
                        if 'text' in row:
                            examples.append({'text': row['text']})

            if not examples:
                return False, "No valid columns found (need 'prompt'/'response' or similar)", None

            error, examples = self._check_count(examples)
            if error:
                return False, error, None

            processed_path = file_path.with_suffix('.jsonl')
            self._write_jsonl(examples, processed_path)
            return True, f"Converted CSV to JSONL with {len(examples)} examples", processed_path

        except Exception as e:
            return False, f"Error reading CSV file: {e}", None

    def _validate_txt(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
        """Validate a plain-text dataset by parsing it into examples.

        Tries Q&A pairs first, then conversation turns, then falls back to
        splitting the text into chunks. No minimum-size check is applied:
        a single free-text block is a legitimate input here.
        """
        try:
            content = file_path.read_text(encoding='utf-8')

            pairs = self._QA_PATTERN.findall(content)
            if not pairs:
                pairs = self._CONV_PATTERN.findall(content)

            if pairs:
                examples = [
                    {'prompt': prompt.strip(), 'response': response.strip()}
                    for prompt, response in pairs[:self.MAX_EXAMPLES]
                ]
            else:
                # Treat as a single text block, chunked to keep examples short.
                chunks = self._split_text_into_chunks(content, max_length=500)
                examples = [{'text': chunk} for chunk in chunks[:self.MAX_EXAMPLES]]

            if not examples:
                return False, "Could not parse text file into examples", None

            processed_path = file_path.with_suffix('.jsonl')
            self._write_jsonl(examples, processed_path)
            return True, f"Parsed text file into {len(examples)} examples", processed_path

        except Exception as e:
            return False, f"Error reading text file: {e}", None

    def _split_text_into_chunks(self, text: str, max_length: int = 500) -> List[str]:
        """Split text into paragraph-aligned chunks of roughly max_length chars.

        Paragraphs are never split; a paragraph longer than max_length becomes
        its own chunk.
        """
        paragraphs = text.split('\n\n')

        chunks: List[str] = []
        current_chunk = ""

        for para in paragraphs:
            para = para.strip()
            if not para:
                continue

            if len(current_chunk) + len(para) + 1 < max_length:
                current_chunk = f"{current_chunk}\n\n{para}" if current_chunk else para
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = para

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def _save_processed_dataset(
        self, examples: List[Dict], original_path: Path, truncated: bool = False
    ) -> Path:
        """Save a processed dataset copy if modifications were made.

        Args:
            examples: Normalized (and possibly truncated) examples.
            original_path: The user's input file.
            truncated: True when examples were cut to MAX_EXAMPLES, in which
                case the original file no longer matches and a copy must be
                written.

        Returns:
            The original path when it can be used verbatim, otherwise the
            path of a processed copy under ~/.cortex/processed_datasets.
        """
        if not truncated and all('prompt' in ex and 'response' in ex for ex in examples):
            return original_path

        processed_dir = Path.home() / ".cortex" / "processed_datasets"
        processed_dir.mkdir(parents=True, exist_ok=True)

        processed_path = processed_dir / f"{original_path.stem}_processed.jsonl"
        self._write_jsonl(examples, processed_path)
        return processed_path

    def create_sample_dataset(self, domain: str = "general") -> Path:
        """Create a small sample dataset for testing.

        Args:
            domain: One of 'general', 'coding', 'creative'. Unknown values
                fall back to 'general'.

        Returns:
            Path to the written sample JSONL file under
            ~/.cortex/sample_datasets.
        """
        samples = {
            "general": [
                {"prompt": "What is machine learning?",
                 "response": "Machine learning is a type of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed."},
                {"prompt": "Explain neural networks",
                 "response": "Neural networks are computing systems inspired by biological neural networks, consisting of interconnected nodes that process information."},
                {"prompt": "What is deep learning?",
                 "response": "Deep learning is a subset of machine learning that uses neural networks with multiple layers to learn from large amounts of data."},
                {"prompt": "Define natural language processing",
                 "response": "Natural language processing (NLP) is a branch of AI that helps computers understand, interpret, and generate human language."},
                {"prompt": "What is computer vision?",
                 "response": "Computer vision is a field of AI that trains computers to interpret and understand visual information from the world."},
            ],
            "coding": [
                {"prompt": "How do I reverse a string in Python?",
                 "response": "You can reverse a string in Python using slicing: `reversed_string = original_string[::-1]`"},
                {"prompt": "What is a list comprehension?",
                 "response": "A list comprehension is a concise way to create lists in Python: `[expression for item in iterable if condition]`"},
                {"prompt": "Explain recursion",
                 "response": "Recursion is a programming technique where a function calls itself to solve smaller instances of the same problem."},
                {"prompt": "What is object-oriented programming?",
                 "response": "Object-oriented programming (OOP) is a programming paradigm that organizes code into objects containing data and methods."},
                {"prompt": "How do I handle exceptions in Python?",
                 "response": "Use try-except blocks: `try: risky_code() except Exception as e: handle_error(e)`"},
            ],
            "creative": [
                {"prompt": "Write a haiku about coding",
                 "response": "Lines of logic flow,\nBugs emerge, then disappear,\nCode compiles at last."},
                {"prompt": "Create a metaphor for machine learning",
                 "response": "Machine learning is like teaching a child to recognize patterns - showing many examples until they can identify new ones on their own."},
                {"prompt": "Describe a sunset poetically",
                 "response": "The sun paints the sky in hues of amber and rose, a masterpiece that fades into the embrace of twilight."},
                {"prompt": "Write a short story opening",
                 "response": "The old lighthouse keeper had seen many storms, but none quite like the one approaching that November evening."},
                {"prompt": "Create a motivational quote",
                 "response": "Every line of code you write today is a step toward the solution you'll celebrate tomorrow."},
            ]
        }

        # Unknown domains fall back to the general-purpose samples.
        domain_samples = samples.get(domain, samples["general"])

        sample_dir = Path.home() / ".cortex" / "sample_datasets"
        sample_dir.mkdir(parents=True, exist_ok=True)

        sample_path = sample_dir / f"sample_{domain}.jsonl"
        self._write_jsonl(domain_samples, sample_path)
        return sample_path