cortex_llm-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortex/__init__.py +73 -0
- cortex/__main__.py +83 -0
- cortex/config.py +329 -0
- cortex/conversation_manager.py +468 -0
- cortex/fine_tuning/__init__.py +8 -0
- cortex/fine_tuning/dataset.py +332 -0
- cortex/fine_tuning/mlx_lora_trainer.py +502 -0
- cortex/fine_tuning/trainer.py +957 -0
- cortex/fine_tuning/wizard.py +707 -0
- cortex/gpu_validator.py +467 -0
- cortex/inference_engine.py +727 -0
- cortex/metal/__init__.py +275 -0
- cortex/metal/gpu_validator.py +177 -0
- cortex/metal/memory_pool.py +886 -0
- cortex/metal/mlx_accelerator.py +678 -0
- cortex/metal/mlx_converter.py +638 -0
- cortex/metal/mps_optimizer.py +417 -0
- cortex/metal/optimizer.py +665 -0
- cortex/metal/performance_profiler.py +364 -0
- cortex/model_downloader.py +130 -0
- cortex/model_manager.py +2187 -0
- cortex/quantization/__init__.py +5 -0
- cortex/quantization/dynamic_quantizer.py +736 -0
- cortex/template_registry/__init__.py +15 -0
- cortex/template_registry/auto_detector.py +144 -0
- cortex/template_registry/config_manager.py +234 -0
- cortex/template_registry/interactive.py +260 -0
- cortex/template_registry/registry.py +347 -0
- cortex/template_registry/template_profiles/__init__.py +5 -0
- cortex/template_registry/template_profiles/base.py +142 -0
- cortex/template_registry/template_profiles/complex/__init__.py +5 -0
- cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
- cortex/template_registry/template_profiles/standard/__init__.py +9 -0
- cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
- cortex/template_registry/template_profiles/standard/chatml.py +82 -0
- cortex/template_registry/template_profiles/standard/gemma.py +103 -0
- cortex/template_registry/template_profiles/standard/llama.py +87 -0
- cortex/template_registry/template_profiles/standard/simple.py +65 -0
- cortex/ui/__init__.py +120 -0
- cortex/ui/cli.py +1685 -0
- cortex/ui/markdown_render.py +185 -0
- cortex/ui/terminal_app.py +534 -0
- cortex_llm-1.0.0.dist-info/METADATA +275 -0
- cortex_llm-1.0.0.dist-info/RECORD +48 -0
- cortex_llm-1.0.0.dist-info/WHEEL +5 -0
- cortex_llm-1.0.0.dist-info/entry_points.txt +2 -0
- cortex_llm-1.0.0.dist-info/licenses/LICENSE +21 -0
- cortex_llm-1.0.0.dist-info/top_level.txt +1 -0

cortex/fine_tuning/dataset.py
@@ -0,0 +1,332 @@
+"""Dataset preparation utilities for fine-tuning."""
+
+import json
+import csv
+import logging
+from pathlib import Path
+from typing import List, Dict, Any, Tuple, Optional
+import re
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetPreparer:
+    """Prepare and validate datasets for fine-tuning."""
+
+    SUPPORTED_FORMATS = ['.jsonl', '.json', '.csv', '.txt']
+    MIN_EXAMPLES = 5
+    MAX_EXAMPLES = 10000
+
+    def validate_dataset(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
+        """
+        Validate and prepare a dataset file.
+
+        Args:
+            file_path: Path to the dataset file
+
+        Returns:
+            Tuple of (is_valid, message, processed_path)
+        """
+        try:
+            # Check file exists
+            if not file_path.exists():
+                return False, f"File not found: {file_path}", None
+
+            # Check file extension
+            if file_path.suffix.lower() not in self.SUPPORTED_FORMATS:
+                return False, f"Unsupported format. Supported: {', '.join(self.SUPPORTED_FORMATS)}", None
+
+            # Process based on format
+            if file_path.suffix.lower() == '.jsonl':
+                return self._validate_jsonl(file_path)
+            elif file_path.suffix.lower() == '.json':
+                return self._validate_json(file_path)
+            elif file_path.suffix.lower() == '.csv':
+                return self._validate_csv(file_path)
+            elif file_path.suffix.lower() == '.txt':
+                return self._validate_txt(file_path)
+            else:
+                return False, "Unsupported format", None
+
+        except Exception as e:
+            logger.error(f"Error validating dataset: {e}")
+            return False, f"Validation error: {str(e)}", None
+
+    def _validate_jsonl(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
+        """Validate JSONL format dataset."""
+        try:
+            examples = []
+            with open(file_path, 'r', encoding='utf-8') as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    try:
+                        data = json.loads(line)
+                    except json.JSONDecodeError as e:
+                        return False, f"Invalid JSON at line {line_num}: {e}", None
+
+                    # Check for required fields
+                    if 'prompt' in data and 'response' in data:
+                        examples.append(data)
+                    elif 'text' in data:
+                        examples.append(data)
+                    elif 'instruction' in data and 'output' in data:
+                        # Convert to standard format
+                        examples.append({
+                            'prompt': data['instruction'],
+                            'response': data['output']
+                        })
+                    else:
+                        return False, f"Line {line_num}: Missing required fields (need 'prompt'/'response' or 'text')", None
+
+            # Check example count
+            if len(examples) < self.MIN_EXAMPLES:
+                return False, f"Too few examples ({len(examples)}). Minimum: {self.MIN_EXAMPLES}", None
+
+            if len(examples) > self.MAX_EXAMPLES:
+                logger.warning(f"Dataset has {len(examples)} examples. Will use first {self.MAX_EXAMPLES}")
+                examples = examples[:self.MAX_EXAMPLES]
+
+            # Save processed dataset if needed
+            processed_path = self._save_processed_dataset(examples, file_path)
+
+            return True, f"Valid dataset with {len(examples)} examples", processed_path
+
+        except Exception as e:
+            return False, f"Error reading JSONL file: {e}", None
+
+    def _validate_json(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
+        """Validate JSON format dataset."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            examples = []
+
+            # Handle different JSON structures
+            if isinstance(data, list):
+                # Array of examples
+                for item in data:
+                    if isinstance(item, dict):
+                        if 'prompt' in item and 'response' in item:
+                            examples.append(item)
+                        elif 'text' in item:
+                            examples.append(item)
+            elif isinstance(data, dict):
+                # Single example or nested structure
+                if 'examples' in data:
+                    examples = data['examples']
+                elif 'data' in data:
+                    examples = data['data']
+                elif 'prompt' in data and 'response' in data:
+                    examples = [data]
+
+            if not examples:
+                return False, "No valid examples found in JSON file", None
+
+            # Convert to JSONL format
+            processed_path = file_path.with_suffix('.jsonl')
+            with open(processed_path, 'w', encoding='utf-8') as f:
+                for example in examples[:self.MAX_EXAMPLES]:
+                    f.write(json.dumps(example) + '\n')
+
+            return True, f"Converted JSON to JSONL with {len(examples)} examples", processed_path
+
+        except Exception as e:
+            return False, f"Error reading JSON file: {e}", None
+
+    def _validate_csv(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
+        """Validate CSV format dataset."""
+        try:
+            examples = []
+
+            with open(file_path, 'r', encoding='utf-8') as f:
+                reader = csv.DictReader(f)
+
+                for row in reader:
+                    # Look for prompt/response columns
+                    if 'prompt' in row and 'response' in row:
+                        examples.append({
+                            'prompt': row['prompt'],
+                            'response': row['response']
+                        })
+                    elif 'instruction' in row and 'output' in row:
+                        examples.append({
+                            'prompt': row['instruction'],
+                            'response': row['output']
+                        })
+                    elif 'question' in row and 'answer' in row:
+                        examples.append({
+                            'prompt': row['question'],
+                            'response': row['answer']
+                        })
+                    elif 'text' in row:
+                        examples.append({'text': row['text']})
+
+            if not examples:
+                return False, "No valid columns found (need 'prompt'/'response' or similar)", None
+
+            # Convert to JSONL
+            processed_path = file_path.with_suffix('.jsonl')
+            with open(processed_path, 'w', encoding='utf-8') as f:
+                for example in examples[:self.MAX_EXAMPLES]:
+                    f.write(json.dumps(example) + '\n')
+
+            return True, f"Converted CSV to JSONL with {len(examples)} examples", processed_path
+
+        except Exception as e:
+            return False, f"Error reading CSV file: {e}", None
+
+    def _validate_txt(self, file_path: Path) -> Tuple[bool, str, Optional[Path]]:
+        """Validate plain text format dataset."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            examples = []
+
+            # Try to parse as Q&A format
+            qa_pattern = r'(?:Q:|Question:)\s*(.*?)\s*(?:A:|Answer:)\s*(.*?)(?=(?:Q:|Question:)|$)'
+            qa_matches = re.findall(qa_pattern, content, re.DOTALL | re.IGNORECASE)
+
+            if qa_matches:
+                for question, answer in qa_matches:
+                    examples.append({
+                        'prompt': question.strip(),
+                        'response': answer.strip()
+                    })
+            else:
+                # Try to parse as conversation format
+                conv_pattern = r'(?:User|Human):\s*(.*?)\s*(?:Assistant|AI|Bot):\s*(.*?)(?=(?:User|Human):|$)'
+                conv_matches = re.findall(conv_pattern, content, re.DOTALL | re.IGNORECASE)
+
+                if conv_matches:
+                    for user_msg, assistant_msg in conv_matches:
+                        examples.append({
+                            'prompt': user_msg.strip(),
+                            'response': assistant_msg.strip()
+                        })
+                else:
+                    # Treat as single text block
+                    # Split into chunks if too long
+                    chunks = self._split_text_into_chunks(content, max_length=500)
+                    for chunk in chunks[:self.MAX_EXAMPLES]:
+                        examples.append({'text': chunk})
+
+            if not examples:
+                return False, "Could not parse text file into examples", None
+
+            # Convert to JSONL
+            processed_path = file_path.with_suffix('.jsonl')
+            with open(processed_path, 'w', encoding='utf-8') as f:
+                for example in examples:
+                    f.write(json.dumps(example) + '\n')
+
+            return True, f"Parsed text file into {len(examples)} examples", processed_path
+
+        except Exception as e:
+            return False, f"Error reading text file: {e}", None
+
+    def _split_text_into_chunks(self, text: str, max_length: int = 500) -> List[str]:
+        """Split text into chunks of approximately max_length."""
+        # Split by paragraphs first
+        paragraphs = text.split('\n\n')
+
+        chunks = []
+        current_chunk = ""
+
+        for para in paragraphs:
+            para = para.strip()
+            if not para:
+                continue
+
+            if len(current_chunk) + len(para) + 1 < max_length:
+                if current_chunk:
+                    current_chunk += "\n\n" + para
+                else:
+                    current_chunk = para
+            else:
+                if current_chunk:
+                    chunks.append(current_chunk)
+                current_chunk = para
+
+        if current_chunk:
+            chunks.append(current_chunk)
+
+        return chunks
+
+    def _save_processed_dataset(self, examples: List[Dict], original_path: Path) -> Path:
+        """Save processed dataset if modifications were made."""
+        # If already in correct format, return original path
+        if all('prompt' in ex and 'response' in ex for ex in examples):
+            return original_path
+
+        # Otherwise, save processed version
+        processed_dir = Path.home() / ".cortex" / "processed_datasets"
+        processed_dir.mkdir(parents=True, exist_ok=True)
+
+        processed_path = processed_dir / f"{original_path.stem}_processed.jsonl"
+
+        with open(processed_path, 'w', encoding='utf-8') as f:
+            for example in examples:
+                f.write(json.dumps(example) + '\n')
+
+        return processed_path
+
+    def create_sample_dataset(self, domain: str = "general") -> Path:
+        """Create a sample dataset for testing."""
+        samples = {
+            "general": [
+                {"prompt": "What is machine learning?",
+                 "response": "Machine learning is a type of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed."},
+                {"prompt": "Explain neural networks",
+                 "response": "Neural networks are computing systems inspired by biological neural networks, consisting of interconnected nodes that process information."},
+                {"prompt": "What is deep learning?",
+                 "response": "Deep learning is a subset of machine learning that uses neural networks with multiple layers to learn from large amounts of data."},
+                {"prompt": "Define natural language processing",
+                 "response": "Natural language processing (NLP) is a branch of AI that helps computers understand, interpret, and generate human language."},
+                {"prompt": "What is computer vision?",
+                 "response": "Computer vision is a field of AI that trains computers to interpret and understand visual information from the world."},
+            ],
+            "coding": [
+                {"prompt": "How do I reverse a string in Python?",
+                 "response": "You can reverse a string in Python using slicing: `reversed_string = original_string[::-1]`"},
+                {"prompt": "What is a list comprehension?",
+                 "response": "A list comprehension is a concise way to create lists in Python: `[expression for item in iterable if condition]`"},
+                {"prompt": "Explain recursion",
+                 "response": "Recursion is a programming technique where a function calls itself to solve smaller instances of the same problem."},
+                {"prompt": "What is object-oriented programming?",
+                 "response": "Object-oriented programming (OOP) is a programming paradigm that organizes code into objects containing data and methods."},
+                {"prompt": "How do I handle exceptions in Python?",
+                 "response": "Use try-except blocks: `try: risky_code() except Exception as e: handle_error(e)`"},
+            ],
+            "creative": [
+                {"prompt": "Write a haiku about coding",
+                 "response": "Lines of logic flow,\nBugs emerge, then disappear,\nCode compiles at last."},
+                {"prompt": "Create a metaphor for machine learning",
+                 "response": "Machine learning is like teaching a child to recognize patterns - showing many examples until they can identify new ones on their own."},
+                {"prompt": "Describe a sunset poetically",
+                 "response": "The sun paints the sky in hues of amber and rose, a masterpiece that fades into the embrace of twilight."},
+                {"prompt": "Write a short story opening",
+                 "response": "The old lighthouse keeper had seen many storms, but none quite like the one approaching that November evening."},
+                {"prompt": "Create a motivational quote",
+                 "response": "Every line of code you write today is a step toward the solution you'll celebrate tomorrow."},
+            ]
+        }
+
+        # Get samples for the specified domain
+        domain_samples = samples.get(domain, samples["general"])
+
+        # Save to file
+        sample_dir = Path.home() / ".cortex" / "sample_datasets"
+        sample_dir.mkdir(parents=True, exist_ok=True)
+
+        sample_path = sample_dir / f"sample_{domain}.jsonl"
+
+        with open(sample_path, 'w', encoding='utf-8') as f:
+            for sample in domain_samples:
+                f.write(json.dumps(sample) + '\n')
+
+        return sample_path
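
The hunk above is the new 332-line module cortex/fine_tuning/dataset.py from the listing. For orientation, the sketch below exercises the DatasetPreparer API as it appears in the diff; the import path cortex.fine_tuning.dataset is assumed from the wheel's file layout rather than confirmed by this diff, and the snippet is illustrative, not part of the package.

# Usage sketch (assumption: DatasetPreparer is importable from cortex.fine_tuning.dataset).
from cortex.fine_tuning.dataset import DatasetPreparer

preparer = DatasetPreparer()

# Writes a built-in five-example set to ~/.cortex/sample_datasets/sample_coding.jsonl.
sample_path = preparer.create_sample_dataset(domain="coding")

# Dispatches to the JSONL validator; returns (is_valid, message, processed_path).
is_valid, message, processed_path = preparer.validate_dataset(sample_path)
print(is_valid, message, processed_path)

Per the code, non-JSONL inputs (.json, .csv, .txt) are rewritten as a .jsonl file next to the source, plain text with no recognizable Q&A or chat markers is chunked into roughly 500-character text records, and JSONL datasets that needed field conversion are saved under ~/.cortex/processed_datasets/.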