clarity-ai 6.3.1 → 6.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/clarity_flash_14b.ipynb +245 -0
- package/clarity_heavy_20b_moe.ipynb +270 -0
- package/package.json +1 -1
- package/src/components/AppRoot.js +4 -3
- package/src/components/CommandPicker.js +17 -19
- package/src/components/Composer.js +30 -58
- package/src/components/LoadingIndicator.js +4 -7
- package/src/components/MessageList.js +52 -74
- package/src/components/ModelPicker.js +18 -28
- package/src/components/StatusBar.js +9 -9
- package/src/components/ThinkingBlock.js +10 -9
- package/src/components/ToolCard.js +19 -30
- package/src/config/theme.js +21 -75
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"metadata": {},
|
|
6
|
+
"source": [
|
|
7
|
+
"# CLARITY Flash 14B — TPU Fine-Tuning\n",
|
|
8
|
+
"Trains a 14B parameter model on agent CoT + tool-calling data.\n",
|
|
9
|
+
"Target: Google Colab TPU v2-8 (free tier)\n",
|
|
10
|
+
"HF token: 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"
|
|
11
|
+
]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"cell_type": "code",
|
|
15
|
+
"metadata": {},
|
|
16
|
+
"source": [
|
|
17
|
+
"# === Install ===\n",
|
|
18
|
+
"!pip install -q torch torch-xla torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu\n",
|
|
19
|
+
"!pip install -q transformers datasets accelerate peft bitsandbytes sentencepiece huggingface_hub"
|
|
20
|
+
],
|
|
21
|
+
"execution_count": null,
|
|
22
|
+
"outputs": []
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"cell_type": "code",
|
|
26
|
+
"metadata": {},
|
|
27
|
+
"source": [
|
|
28
|
+
"# === HF Auth ===\n",
|
|
29
|
+
"from huggingface_hub import login, HfApi, create_repo\n",
|
|
30
|
+
"HF_TOKEN = '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'\n",
|
|
31
|
+
"login(token=HF_TOKEN, add_to_git_credential=True)\n",
|
|
32
|
+
"api = HfApi(token=HF_TOKEN)"
|
|
33
|
+
],
|
|
34
|
+
"execution_count": null,
|
|
35
|
+
"outputs": []
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"cell_type": "code",
|
|
39
|
+
"metadata": {},
|
|
40
|
+
"source": [
|
|
41
|
+
"# === TPU Setup ===\n",
|
|
42
|
+
"import torch\n",
|
|
43
|
+
"import torch_xla\n",
|
|
44
|
+
"import torch_xla.core.xla_model as xm\n",
|
|
45
|
+
"device = xm.xla_device()\n",
|
|
46
|
+
"print('Device:', device)\n",
|
|
47
|
+
"print('TPU cores:', torch_xla._XLAC._xla_get_num_devices())"
|
|
48
|
+
],
|
|
49
|
+
"execution_count": null,
|
|
50
|
+
"outputs": []
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"cell_type": "code",
|
|
54
|
+
"metadata": {},
|
|
55
|
+
"source": [
|
|
56
|
+
"# === Data Loading ===\n",
|
|
57
|
+
"import requests\n",
|
|
58
|
+
"import json\n",
|
|
59
|
+
"from datasets import Dataset\n",
|
|
60
|
+
"\n",
|
|
61
|
+
"DATA_URLS = [\n",
|
|
62
|
+
" 'https://huggingface.co/spaces/Universal-618/Clarity-main/main-data',\n",
|
|
63
|
+
" 'https://huggingface.co/spaces/Universal-618/Clarity-2/main-data',\n",
|
|
64
|
+
" 'https://huggingface.co/spaces/Universal-618/Clarity-3/main-data',\n",
|
|
65
|
+
"]\n",
|
|
66
|
+
"\n",
|
|
67
|
+
"all_samples = []\n",
|
|
68
|
+
"for url in DATA_URLS:\n",
|
|
69
|
+
" try:\n",
|
|
70
|
+
" r = requests.get(url, headers={'Authorization': f'Bearer {HF_TOKEN}'}, timeout=60)\n",
|
|
71
|
+
" if r.status_code == 200:\n",
|
|
72
|
+
" data = r.json()\n",
|
|
73
|
+
" samples = data if isinstance(data, list) else data.get('data', [])\n",
|
|
74
|
+
" all_samples.extend(samples)\n",
|
|
75
|
+
" print(f'Loaded {len(samples)} from {url}')\n",
|
|
76
|
+
" except Exception as e:\n",
|
|
77
|
+
" print(f'Skipped {url}: {e}')\n",
|
|
78
|
+
"\n",
|
|
79
|
+
"# Fallback: synthetic CoT samples if no data\n",
|
|
80
|
+
"if len(all_samples) < 10:\n",
|
|
81
|
+
" print('No remote data found — using synthetic samples')\n",
|
|
82
|
+
" all_samples = [\n",
|
|
83
|
+
" {'instruction': 'List files in current directory', 'response': 'I will run the ls command.\\n<tool>bash</tool><cmd>ls -la</cmd>', 'tools': 'bash'},\n",
|
|
84
|
+
" {'instruction': 'Read the file config.json', 'response': 'Let me read that file.\\n<tool>read_file</tool><path>config.json</path>', 'tools': 'read_file'},\n",
|
|
85
|
+
" {'instruction': 'Write hello world script', 'response': 'I will create the file.\\n<tool>write_file</tool><path>hello.py</path><content>print(\"hello\")</content>', 'tools': 'write_file'},\n",
|
|
86
|
+
" ]\n",
|
|
87
|
+
"\n",
|
|
88
|
+
"print(f'Total training samples: {len(all_samples)}')"
|
|
89
|
+
],
|
|
90
|
+
"execution_count": null,
|
|
91
|
+
"outputs": []
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"cell_type": "code",
|
|
95
|
+
"metadata": {},
|
|
96
|
+
"source": [
|
|
97
|
+
"# === Format for Training ===\n",
|
|
98
|
+
"def format_chat(sample):\n",
|
|
99
|
+
" inst = sample.get('instruction', sample.get('prompt', sample.get('input', '')))\n",
|
|
100
|
+
" resp = sample.get('response', sample.get('completion', sample.get('output', '')))\n",
|
|
101
|
+
" return {\n",
|
|
102
|
+
" 'text': f'<|im_start|>user\\n{inst}<|im_end|>\\n<|im_start|>assistant\\n{resp}<|im_end|>'\n",
|
|
103
|
+
" }\n",
|
|
104
|
+
"\n",
|
|
105
|
+
"dataset = Dataset.from_list([format_chat(s) for s in all_samples])\n",
|
|
106
|
+
"dataset = dataset.train_test_split(test_size=0.05, seed=42)\n",
|
|
107
|
+
"print(f'Train: {len(dataset[\"train\"])}, Test: {len(dataset[\"test\"])}')"
|
|
108
|
+
],
|
|
109
|
+
"execution_count": null,
|
|
110
|
+
"outputs": []
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
"cell_type": "code",
|
|
114
|
+
"metadata": {},
|
|
115
|
+
"source": [
|
|
116
|
+
"# === Load Model ===\n",
|
|
117
|
+
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
|
|
118
|
+
"import torch\n",
|
|
119
|
+
"\n",
|
|
120
|
+
"MODEL_ID = 'Qwen/Qwen2.5-14B-Instruct'\n",
|
|
121
|
+
"\n",
|
|
122
|
+
"bnb = BitsAndBytesConfig(\n",
|
|
123
|
+
" load_in_4bit=True,\n",
|
|
124
|
+
" bnb_4bit_use_double_quant=True,\n",
|
|
125
|
+
" bnb_4bit_quant_type='nf4',\n",
|
|
126
|
+
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
|
|
127
|
+
")\n",
|
|
128
|
+
"\n",
|
|
129
|
+
"tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)\n",
|
|
130
|
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
|
131
|
+
"\n",
|
|
132
|
+
"model = AutoModelForCausalLM.from_pretrained(\n",
|
|
133
|
+
" MODEL_ID,\n",
|
|
134
|
+
" quantization_config=bnb,\n",
|
|
135
|
+
" device_map='auto',\n",
|
|
136
|
+
" torch_dtype=torch.bfloat16,\n",
|
|
137
|
+
" token=HF_TOKEN,\n",
|
|
138
|
+
" trust_remote_code=True,\n",
|
|
139
|
+
")\n",
|
|
140
|
+
"print(f'Model loaded: {MODEL_ID}')"
|
|
141
|
+
],
|
|
142
|
+
"execution_count": null,
|
|
143
|
+
"outputs": []
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"cell_type": "code",
|
|
147
|
+
"metadata": {},
|
|
148
|
+
"source": [
|
|
149
|
+
"# === LoRA Config ===\n",
|
|
150
|
+
"from peft import LoraConfig, get_peft_model, TaskType\n",
|
|
151
|
+
"\n",
|
|
152
|
+
"lora_config = LoraConfig(\n",
|
|
153
|
+
" task_type=TaskType.CAUSAL_LM,\n",
|
|
154
|
+
" r=16,\n",
|
|
155
|
+
" lora_alpha=32,\n",
|
|
156
|
+
" lora_dropout=0.05,\n",
|
|
157
|
+
" target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],\n",
|
|
158
|
+
" bias='none',\n",
|
|
159
|
+
")\n",
|
|
160
|
+
"model = get_peft_model(model, lora_config)\n",
|
|
161
|
+
"model.print_trainable_parameters()"
|
|
162
|
+
],
|
|
163
|
+
"execution_count": null,
|
|
164
|
+
"outputs": []
|
|
165
|
+
},
|
|
166
|
+
{
|
|
167
|
+
"cell_type": "code",
|
|
168
|
+
"metadata": {},
|
|
169
|
+
"source": [
|
|
170
|
+
"# === Training ===\n",
|
|
171
|
+
"from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq\n",
|
|
172
|
+
"import numpy as np\n",
|
|
173
|
+
"\n",
|
|
174
|
+
"def tokenize_fn(examples):\n",
|
|
175
|
+
" tok = tokenizer(examples['text'], truncation=True, max_length=2048, padding=False)\n",
|
|
176
|
+
" tok['labels'] = tok['input_ids'].copy()\n",
|
|
177
|
+
" return tok\n",
|
|
178
|
+
"\n",
|
|
179
|
+
"tokenized = dataset.map(tokenize_fn, remove_columns=['text'], batched=True)\n",
|
|
180
|
+
"\n",
|
|
181
|
+
"args = TrainingArguments(\n",
|
|
182
|
+
" output_dir='./clarity-flash-14b',\n",
|
|
183
|
+
" per_device_train_batch_size=1,\n",
|
|
184
|
+
" gradient_accumulation_steps=16,\n",
|
|
185
|
+
" num_train_epochs=3,\n",
|
|
186
|
+
" learning_rate=2e-4,\n",
|
|
187
|
+
" bf16=True,\n",
|
|
188
|
+
" logging_steps=10,\n",
|
|
189
|
+
" save_steps=200,\n",
|
|
190
|
+
" save_total_limit=2,\n",
|
|
191
|
+
" optim='adamw_8bit',\n",
|
|
192
|
+
" report_to='none',\n",
|
|
193
|
+
" dataloader_drop_last=False,\n",
|
|
194
|
+
")\n",
|
|
195
|
+
"\n",
|
|
196
|
+
"trainer = Trainer(\n",
|
|
197
|
+
" model=model,\n",
|
|
198
|
+
" args=args,\n",
|
|
199
|
+
" train_dataset=tokenized['train'],\n",
|
|
200
|
+
" eval_dataset=tokenized['test'],\n",
|
|
201
|
+
" data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True),\n",
|
|
202
|
+
")\n",
|
|
203
|
+
"\n",
|
|
204
|
+
"trainer.train()"
|
|
205
|
+
],
|
|
206
|
+
"execution_count": null,
|
|
207
|
+
"outputs": []
|
|
208
|
+
},
|
|
209
|
+
{
|
|
210
|
+
"cell_type": "code",
|
|
211
|
+
"metadata": {},
|
|
212
|
+
"source": [
|
|
213
|
+
"# === Push Weights to HF ===\n",
|
|
214
|
+
"WEIGHTS_REPO = 'Universal-618/Clarity-flash-weights'\n",
|
|
215
|
+
"try:\n",
|
|
216
|
+
" create_repo(WEIGHTS_REPO, repo_type='dataset', exist_ok=True, token=HF_TOKEN)\n",
|
|
217
|
+
" print(f'Repo {WEIGHTS_REPO} ready')\n",
|
|
218
|
+
"except Exception as e:\n",
|
|
219
|
+
" print(f'Repo exists or error: {e}')\n",
|
|
220
|
+
"\n",
|
|
221
|
+
"model.push_to_hub(WEIGHTS_REPO, token=HF_TOKEN, use_temp_dir=True)\n",
|
|
222
|
+
"tokenizer.push_to_hub(WEIGHTS_REPO, token=HF_TOKEN)\n",
|
|
223
|
+
"print(f'Weights pushed to {WEIGHTS_REPO}')"
|
|
224
|
+
],
|
|
225
|
+
"execution_count": null,
|
|
226
|
+
"outputs": []
|
|
227
|
+
}
|
|
228
|
+
],
|
|
229
|
+
"metadata": {
|
|
230
|
+
"accelerator": "TPU",
|
|
231
|
+
"colab": {
|
|
232
|
+
"provenance": []
|
|
233
|
+
},
|
|
234
|
+
"kernelspec": {
|
|
235
|
+
"display_name": "Python 3",
|
|
236
|
+
"name": "python3"
|
|
237
|
+
},
|
|
238
|
+
"language_info": {
|
|
239
|
+
"name": "python",
|
|
240
|
+
"version": "3.10.0"
|
|
241
|
+
}
|
|
242
|
+
},
|
|
243
|
+
"nbformat": 4,
|
|
244
|
+
"nbformat_minor": 4
|
|
245
|
+
}
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "markdown",
|
|
5
|
+
"metadata": {},
|
|
6
|
+
"source": [
|
|
7
|
+
"# CLARITY Heavy 20B MoE — Multi-GPU Fine-Tuning\n",
|
|
8
|
+
"Trains a 20B Mixture-of-Experts model on deep CoT + recursive tool execution data.\n",
|
|
9
|
+
"Target: Kaggle dual T4 (2x 16GB) with 4-bit quantization + FSDP.\n",
|
|
10
|
+
"HF token: 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"
|
|
11
|
+
]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"cell_type": "code",
|
|
15
|
+
"metadata": {},
|
|
16
|
+
"source": [
|
|
17
|
+
"# === Install ===\n",
|
|
18
|
+
"!pip install -q torch transformers datasets accelerate peft bitsandbytes\n",
|
|
19
|
+
"!pip install -q deepspeed sentencepiece huggingface_hub"
|
|
20
|
+
],
|
|
21
|
+
"execution_count": null,
|
|
22
|
+
"outputs": []
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"cell_type": "code",
|
|
26
|
+
"metadata": {},
|
|
27
|
+
"source": [
|
|
28
|
+
"# === Check GPUs ===\n",
|
|
29
|
+
"import torch\n",
|
|
30
|
+
"n_gpus = torch.cuda.device_count()\n",
|
|
31
|
+
"for i in range(n_gpus):\n",
|
|
32
|
+
" print(f'GPU {i}: {torch.cuda.get_device_name(i)} — {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB')\n",
|
|
33
|
+
"assert n_gpus >= 2, 'Need at least 2 GPUs'"
|
|
34
|
+
],
|
|
35
|
+
"execution_count": null,
|
|
36
|
+
"outputs": []
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"cell_type": "code",
|
|
40
|
+
"metadata": {},
|
|
41
|
+
"source": [
|
|
42
|
+
"# === HF Auth ===\n",
|
|
43
|
+
"from huggingface_hub import login, HfApi, create_repo\n",
|
|
44
|
+
"HF_TOKEN = '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'\n",
|
|
45
|
+
"login(token=HF_TOKEN, add_to_git_credential=True)\n",
|
|
46
|
+
"api = HfApi(token=HF_TOKEN)"
|
|
47
|
+
],
|
|
48
|
+
"execution_count": null,
|
|
49
|
+
"outputs": []
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"cell_type": "code",
|
|
53
|
+
"metadata": {},
|
|
54
|
+
"source": [
|
|
55
|
+
"# === Data Loading ===\n",
|
|
56
|
+
"import requests\n",
|
|
57
|
+
"import json\n",
|
|
58
|
+
"from datasets import Dataset, concatenate_datasets\n",
|
|
59
|
+
"\n",
|
|
60
|
+
"DATA_URLS = [\n",
|
|
61
|
+
" 'https://huggingface.co/spaces/Universal-618/Clarity-4/main-data',\n",
|
|
62
|
+
" 'https://huggingface.co/spaces/Universal-618/Clarity-5/main-data',\n",
|
|
63
|
+
" 'https://huggingface.co/spaces/Universal-618/Clarity-6/main-data',\n",
|
|
64
|
+
" 'https://huggingface.co/spaces/Universal-618/Clarity-main/main-data',\n",
|
|
65
|
+
"]\n",
|
|
66
|
+
"\n",
|
|
67
|
+
"all_samples = []\n",
|
|
68
|
+
"for url in DATA_URLS:\n",
|
|
69
|
+
" try:\n",
|
|
70
|
+
" r = requests.get(url, headers={'Authorization': f'Bearer {HF_TOKEN}'}, timeout=120)\n",
|
|
71
|
+
" if r.status_code == 200:\n",
|
|
72
|
+
" data = r.json()\n",
|
|
73
|
+
" samples = data if isinstance(data, list) else data.get('data', [])\n",
|
|
74
|
+
" all_samples.extend(samples)\n",
|
|
75
|
+
" print(f'Loaded {len(samples)} from {url}')\n",
|
|
76
|
+
" except Exception as e:\n",
|
|
77
|
+
" print(f'Skipped {url}: {e}')\n",
|
|
78
|
+
"\n",
|
|
79
|
+
"if len(all_samples) < 10:\n",
|
|
80
|
+
" print('No remote data — generating synthetic deep CoT samples')\n",
|
|
81
|
+
" import random\n",
|
|
82
|
+
" code_snippets = [\n",
|
|
83
|
+
" 'def fib(n): return n if n < 2 else fib(n-1) + fib(n-2)',\n",
|
|
84
|
+
" 'for i in range(10): print(i**2)',\n",
|
|
85
|
+
" 'with open(\"data.txt\") as f: content = f.read()',\n",
|
|
86
|
+
" ]\n",
|
|
87
|
+
" for _ in range(50):\n",
|
|
88
|
+
" cs = random.choice(code_snippets)\n",
|
|
89
|
+
" all_samples.append({\n",
|
|
90
|
+
" 'instruction': f'Write and test a function',\n",
|
|
91
|
+
" 'thinking': f'I need to think step by step. First, I will analyze what the user wants. Then I will write the code. Let me reason through this carefully.',\n",
|
|
92
|
+
" 'response': f'I will write the code now.\\n<tool>bash</tool><cmd>cat > /tmp/test.py << \\'EOF\\'\\n{cs}\\nEOF\\npython3 /tmp/test.py</cmd>',\n",
|
|
93
|
+
" 'tools': 'bash,write_file',\n",
|
|
94
|
+
" })\n",
|
|
95
|
+
"\n",
|
|
96
|
+
"print(f'Total training samples: {len(all_samples)}')"
|
|
97
|
+
],
|
|
98
|
+
"execution_count": null,
|
|
99
|
+
"outputs": []
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"cell_type": "code",
|
|
103
|
+
"metadata": {},
|
|
104
|
+
"source": [
|
|
105
|
+
"# === Format ===\n",
|
|
106
|
+
"def format_deep_cot(sample):\n",
|
|
107
|
+
" inst = sample.get('instruction', sample.get('prompt', ''))\n",
|
|
108
|
+
" thinking = sample.get('thinking', '')\n",
|
|
109
|
+
" resp = sample.get('response', sample.get('completion', ''))\n",
|
|
110
|
+
" thinking_block = f'<|thinking_start|>{thinking}<|thinking_end|>' if thinking else ''\n",
|
|
111
|
+
" return {\n",
|
|
112
|
+
" 'text': f'<|im_start|>user\\n{inst}<|im_end|>\\n<|im_start|>assistant\\n{thinking_block}{resp}<|im_end|>'\n",
|
|
113
|
+
" }\n",
|
|
114
|
+
"\n",
|
|
115
|
+
"dataset = Dataset.from_list([format_deep_cot(s) for s in all_samples])\n",
|
|
116
|
+
"split = dataset.train_test_split(test_size=0.05, seed=42)\n",
|
|
117
|
+
"print(f'Train: {len(split[\"train\"])}, Test: {len(split[\"test\"])}')"
|
|
118
|
+
],
|
|
119
|
+
"execution_count": null,
|
|
120
|
+
"outputs": []
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
"cell_type": "code",
|
|
124
|
+
"metadata": {},
|
|
125
|
+
"source": [
|
|
126
|
+
"# === Load MoE Model (4-bit) ===\n",
|
|
127
|
+
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
|
|
128
|
+
"\n",
|
|
129
|
+
"MODEL_ID = 'deepseek-ai/DeepSeek-MoE-16B-Chat'\n",
|
|
130
|
+
"\n",
|
|
131
|
+
"bnb = BitsAndBytesConfig(\n",
|
|
132
|
+
" load_in_4bit=True,\n",
|
|
133
|
+
" bnb_4bit_use_double_quant=True,\n",
|
|
134
|
+
" bnb_4bit_quant_type='nf4',\n",
|
|
135
|
+
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
|
|
136
|
+
")\n",
|
|
137
|
+
"\n",
|
|
138
|
+
"tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)\n",
|
|
139
|
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
|
140
|
+
"\n",
|
|
141
|
+
"model = AutoModelForCausalLM.from_pretrained(\n",
|
|
142
|
+
" MODEL_ID,\n",
|
|
143
|
+
" quantization_config=bnb,\n",
|
|
144
|
+
" device_map='auto',\n",
|
|
145
|
+
" torch_dtype=torch.bfloat16,\n",
|
|
146
|
+
" token=HF_TOKEN,\n",
|
|
147
|
+
" trust_remote_code=True,\n",
|
|
148
|
+
")\n",
|
|
149
|
+
"print(f'MoE model loaded: {MODEL_ID}')\n",
|
|
150
|
+
"print(f'Params: {model.num_parameters():,.0f}')"
|
|
151
|
+
],
|
|
152
|
+
"execution_count": null,
|
|
153
|
+
"outputs": []
|
|
154
|
+
},
|
|
155
|
+
{
|
|
156
|
+
"cell_type": "code",
|
|
157
|
+
"metadata": {},
|
|
158
|
+
"source": [
|
|
159
|
+
"# === LoRA for MoE ===\n",
|
|
160
|
+
"from peft import LoraConfig, get_peft_model, TaskType\n",
|
|
161
|
+
"\n",
|
|
162
|
+
"lora_config = LoraConfig(\n",
|
|
163
|
+
" task_type=TaskType.CAUSAL_LM,\n",
|
|
164
|
+
" r=8,\n",
|
|
165
|
+
" lora_alpha=16,\n",
|
|
166
|
+
" lora_dropout=0.1,\n",
|
|
167
|
+
" target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'gate'],\n",
|
|
168
|
+
" bias='none',\n",
|
|
169
|
+
")\n",
|
|
170
|
+
"model = get_peft_model(model, lora_config)\n",
|
|
171
|
+
"model.print_trainable_parameters()"
|
|
172
|
+
],
|
|
173
|
+
"execution_count": null,
|
|
174
|
+
"outputs": []
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"cell_type": "code",
|
|
178
|
+
"metadata": {},
|
|
179
|
+
"source": [
|
|
180
|
+
"# === Gradient Checkpointing (prevents OOM) ===\n",
|
|
181
|
+
"model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False})\n",
|
|
182
|
+
"model.config.use_cache = False\n",
|
|
183
|
+
"print('Gradient checkpointing enabled')"
|
|
184
|
+
],
|
|
185
|
+
"execution_count": null,
|
|
186
|
+
"outputs": []
|
|
187
|
+
},
|
|
188
|
+
{
|
|
189
|
+
"cell_type": "code",
|
|
190
|
+
"metadata": {},
|
|
191
|
+
"source": [
|
|
192
|
+
"# === Training ===\n",
|
|
193
|
+
"from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq\n",
|
|
194
|
+
"\n",
|
|
195
|
+
"def tokenize_fn(examples):\n",
|
|
196
|
+
" tok = tokenizer(examples['text'], truncation=True, max_length=2048, padding=False)\n",
|
|
197
|
+
" tok['labels'] = tok['input_ids'].copy()\n",
|
|
198
|
+
" return tok\n",
|
|
199
|
+
"\n",
|
|
200
|
+
"tokenized = split.map(tokenize_fn, remove_columns=['text'], batched=True)\n",
|
|
201
|
+
"\n",
|
|
202
|
+
"args = TrainingArguments(\n",
|
|
203
|
+
" output_dir='./clarity-heavy-20b-moe',\n",
|
|
204
|
+
" per_device_train_batch_size=1,\n",
|
|
205
|
+
" per_device_eval_batch_size=1,\n",
|
|
206
|
+
" gradient_accumulation_steps=8,\n",
|
|
207
|
+
" num_train_epochs=3,\n",
|
|
208
|
+
" learning_rate=1e-4,\n",
|
|
209
|
+
" bf16=True,\n",
|
|
210
|
+
" logging_steps=10,\n",
|
|
211
|
+
" save_steps=200,\n",
|
|
212
|
+
" save_total_limit=2,\n",
|
|
213
|
+
" optim='adamw_8bit',\n",
|
|
214
|
+
" gradient_checkpointing=True,\n",
|
|
215
|
+
" report_to='none',\n",
|
|
216
|
+
" ddp_find_unused_parameters=False,\n",
|
|
217
|
+
")\n",
|
|
218
|
+
"\n",
|
|
219
|
+
"trainer = Trainer(\n",
|
|
220
|
+
" model=model,\n",
|
|
221
|
+
" args=args,\n",
|
|
222
|
+
" train_dataset=tokenized['train'],\n",
|
|
223
|
+
" eval_dataset=tokenized['test'],\n",
|
|
224
|
+
" data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True, pad_to_multiple_of=8),\n",
|
|
225
|
+
")\n",
|
|
226
|
+
"\n",
|
|
227
|
+
"trainer.train()"
|
|
228
|
+
],
|
|
229
|
+
"execution_count": null,
|
|
230
|
+
"outputs": []
|
|
231
|
+
},
|
|
232
|
+
{
|
|
233
|
+
"cell_type": "code",
|
|
234
|
+
"metadata": {},
|
|
235
|
+
"source": [
|
|
236
|
+
"# === Push to HF ===\n",
|
|
237
|
+
"WEIGHTS_REPO = 'Universal-618/Clarity-heavy-weights'\n",
|
|
238
|
+
"try:\n",
|
|
239
|
+
" create_repo(WEIGHTS_REPO, repo_type='dataset', exist_ok=True, token=HF_TOKEN)\n",
|
|
240
|
+
" print(f'Repo {WEIGHTS_REPO} ready')\n",
|
|
241
|
+
"except Exception as e:\n",
|
|
242
|
+
" print(f'Repo notice: {e}')\n",
|
|
243
|
+
"\n",
|
|
244
|
+
"model.push_to_hub(WEIGHTS_REPO, token=HF_TOKEN, use_temp_dir=True)\n",
|
|
245
|
+
"tokenizer.push_to_hub(WEIGHTS_REPO, token=HF_TOKEN)\n",
|
|
246
|
+
"print(f'Weights pushed to https://huggingface.co/datasets/{WEIGHTS_REPO}')"
|
|
247
|
+
],
|
|
248
|
+
"execution_count": null,
|
|
249
|
+
"outputs": []
|
|
250
|
+
}
|
|
251
|
+
],
|
|
252
|
+
"metadata": {
|
|
253
|
+
"accelerator": "GPU",
|
|
254
|
+
"kaggle": {
|
|
255
|
+
"accelerator": "GPU",
|
|
256
|
+
"gpuModel": "T4",
|
|
257
|
+
"gpuCount": 2
|
|
258
|
+
},
|
|
259
|
+
"kernelspec": {
|
|
260
|
+
"display_name": "Python 3",
|
|
261
|
+
"name": "python3"
|
|
262
|
+
},
|
|
263
|
+
"language_info": {
|
|
264
|
+
"name": "python",
|
|
265
|
+
"version": "3.10.0"
|
|
266
|
+
}
|
|
267
|
+
},
|
|
268
|
+
"nbformat": 4,
|
|
269
|
+
"nbformat_minor": 4
|
|
270
|
+
}
|
package/package.json
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import React, { useState, useCallback, useRef
|
|
1
|
+
import React, { useState, useCallback, useRef } from 'react';
|
|
2
2
|
import { Box } from 'ink';
|
|
3
3
|
import { createChatState, handleSend, handleCommand } from '../chat.js';
|
|
4
4
|
import { hex } from '../config/theme.js';
|
|
5
5
|
import { Layout } from './Layout.js';
|
|
6
|
+
const { createElement: h } = React;
|
|
6
7
|
|
|
7
8
|
let abortController = null;
|
|
8
9
|
|
|
@@ -73,8 +74,8 @@ export function App({ config }) {
|
|
|
73
74
|
}));
|
|
74
75
|
}
|
|
75
76
|
|
|
76
|
-
return Box
|
|
77
|
-
Layout
|
|
77
|
+
return h(Box, { flexDirection: 'column', backgroundColor: hex.bg },
|
|
78
|
+
h(Layout, {
|
|
78
79
|
state,
|
|
79
80
|
streamContent,
|
|
80
81
|
model,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import React, { useState } from 'react';
|
|
2
2
|
import { Box, Text, useInput } from 'ink';
|
|
3
|
-
import { hex,
|
|
3
|
+
import { hex, sym } from '../config/theme.js';
|
|
4
|
+
import { getLayout } from '../config/layout.js';
|
|
4
5
|
const { createElement: h } = React;
|
|
5
6
|
|
|
6
7
|
const COMMANDS = [
|
|
@@ -8,7 +9,7 @@ const COMMANDS = [
|
|
|
8
9
|
{ name: '/model', desc: 'Switch model' },
|
|
9
10
|
{ name: '/provider', desc: 'Switch provider' },
|
|
10
11
|
{ name: '/agent', desc: 'Toggle agent mode' },
|
|
11
|
-
{ name: '/stop', desc: 'Cancel
|
|
12
|
+
{ name: '/stop', desc: 'Cancel running stream' },
|
|
12
13
|
{ name: '/clear', desc: 'Clear conversation' },
|
|
13
14
|
{ name: '/export', desc: 'Export conversation' },
|
|
14
15
|
{ name: '/help', desc: 'Show all commands' },
|
|
@@ -18,6 +19,7 @@ const COMMANDS = [
|
|
|
18
19
|
export function CommandPicker({ query, onSelect, onClose }) {
|
|
19
20
|
const [search, setSearch] = useState('');
|
|
20
21
|
const [idx, setIdx] = useState(0);
|
|
22
|
+
const { cols } = getLayout();
|
|
21
23
|
|
|
22
24
|
const filtered = COMMANDS.filter(c =>
|
|
23
25
|
c.name.includes(search) || c.desc.toLowerCase().includes(search.toLowerCase())
|
|
@@ -26,40 +28,36 @@ export function CommandPicker({ query, onSelect, onClose }) {
|
|
|
26
28
|
useInput((input, key) => {
|
|
27
29
|
if (key.upArrow) setIdx(i => Math.max(0, i - 1));
|
|
28
30
|
if (key.downArrow) setIdx(i => Math.min(filtered.length - 1, i + 1));
|
|
29
|
-
if (key.return) onSelect(filtered[idx]
|
|
31
|
+
if (key.return && filtered[idx]) onSelect(filtered[idx].name);
|
|
30
32
|
if (key.escape) onClose();
|
|
31
33
|
if (key.backspace) setSearch(s => s.slice(0, -1));
|
|
32
34
|
else if (input && !key.ctrl && !key.meta) setSearch(s => s + input);
|
|
33
35
|
});
|
|
34
36
|
|
|
35
|
-
const
|
|
36
|
-
const boxWidth = Math.min(tw - 4, 50);
|
|
37
|
+
const w = Math.min(cols - 4, 48);
|
|
37
38
|
|
|
38
|
-
return h(Box, { flexDirection: 'column', width:
|
|
39
|
-
h(Box, {
|
|
40
|
-
h(Text, { color: hex.textMuted },
|
|
41
|
-
h(Text, { color: search ? hex.text : hex.textMuted }, search || 'type to filter...'),
|
|
39
|
+
return h(Box, { flexDirection: 'column', backgroundColor: hex.surfaceAlt, width: w },
|
|
40
|
+
h(Box, { height: 1, backgroundColor: hex.surfaceAlt },
|
|
41
|
+
h(Text, { color: hex.textMuted, backgroundColor: hex.surfaceAlt }, ' ' + sym.star + ' ' + (search || 'filter commands...'))
|
|
42
42
|
),
|
|
43
43
|
filtered.map((cmd, i) =>
|
|
44
44
|
h(Box, {
|
|
45
|
-
key: cmd.name,
|
|
46
|
-
|
|
47
|
-
backgroundColor: i === idx ? hex.selectionBg : undefined,
|
|
48
|
-
width: boxWidth,
|
|
45
|
+
key: cmd.name, height: 1,
|
|
46
|
+
backgroundColor: i === idx ? hex.selectionBg : 'transparent',
|
|
49
47
|
},
|
|
50
48
|
h(Text, {
|
|
51
49
|
color: i === idx ? hex.selectionText : hex.text,
|
|
52
50
|
bold: i === idx,
|
|
53
|
-
backgroundColor: i === idx ? hex.selectionBg :
|
|
54
|
-
|
|
55
|
-
}, ' ' + cmd.name.padEnd(16)),
|
|
51
|
+
backgroundColor: i === idx ? hex.selectionBg : 'transparent',
|
|
52
|
+
}, ' ' + cmd.name + ' '),
|
|
56
53
|
h(Text, {
|
|
57
54
|
color: i === idx ? hex.selectionText : hex.textDim,
|
|
58
|
-
backgroundColor: i === idx ? hex.selectionBg :
|
|
59
|
-
wrap: 'truncate-end',
|
|
55
|
+
backgroundColor: i === idx ? hex.selectionBg : 'transparent',
|
|
60
56
|
}, cmd.desc)
|
|
61
57
|
)
|
|
62
58
|
),
|
|
63
|
-
h(
|
|
59
|
+
h(Box, { height: 1, backgroundColor: hex.surfaceAlt },
|
|
60
|
+
h(Text, { color: hex.textMuted, backgroundColor: hex.surfaceAlt }, ' ' + sym.arrowU + sym.arrowD + ' nav ' + sym.arrowR + ' select Esc close')
|
|
61
|
+
)
|
|
64
62
|
);
|
|
65
63
|
}
|