gitarsenal-cli 1.9.76 → 1.9.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.venv_status.json +1 -1
- package/bin/gitarsenal.js +276 -18
- package/gitingest-integration.js +274 -0
- package/kill_claude/prompts/claude-code-tool-prompts.md +11 -1
- package/kill_claude/tools/__pycache__/bash_output_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/bash_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/edit_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/exit_plan_mode_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/glob_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/grep_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/kill_bash_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/ls_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/multiedit_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/notebook_edit_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/read_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/task_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/todo_write_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/web_fetch_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/web_search_tool.cpython-312.pyc +0 -0
- package/kill_claude/tools/__pycache__/write_tool.cpython-312.pyc +0 -0
- package/package.json +1 -1
- package/python/__pycache__/analyze_repo_api_keys.cpython-312.pyc +0 -0
- package/python/__pycache__/credentials_manager.cpython-312.pyc +0 -0
- package/python/credentials_manager.py +0 -169
- package/python/gitarsenal_keys.py +8 -2
- package/python/test_modalSandboxScript.py +13 -43
- package/kill_claude/nanoGPT/.gitattributes +0 -3
- package/kill_claude/nanoGPT/LICENSE +0 -21
- package/kill_claude/nanoGPT/README.md +0 -227
- package/kill_claude/nanoGPT/assets/gpt2_124M_loss.png +0 -0
- package/kill_claude/nanoGPT/assets/nanogpt.jpg +0 -0
- package/kill_claude/nanoGPT/bench.py +0 -117
- package/kill_claude/nanoGPT/config/eval_gpt2.py +0 -8
- package/kill_claude/nanoGPT/config/eval_gpt2_large.py +0 -8
- package/kill_claude/nanoGPT/config/eval_gpt2_medium.py +0 -8
- package/kill_claude/nanoGPT/config/eval_gpt2_xl.py +0 -8
- package/kill_claude/nanoGPT/config/finetune_shakespeare.py +0 -25
- package/kill_claude/nanoGPT/config/train_gpt2.py +0 -25
- package/kill_claude/nanoGPT/config/train_shakespeare_char.py +0 -37
- package/kill_claude/nanoGPT/configurator.py +0 -47
- package/kill_claude/nanoGPT/data/openwebtext/prepare.py +0 -81
- package/kill_claude/nanoGPT/data/openwebtext/readme.md +0 -15
- package/kill_claude/nanoGPT/data/shakespeare/prepare.py +0 -33
- package/kill_claude/nanoGPT/data/shakespeare/readme.md +0 -9
- package/kill_claude/nanoGPT/data/shakespeare_char/prepare.py +0 -68
- package/kill_claude/nanoGPT/data/shakespeare_char/readme.md +0 -9
- package/kill_claude/nanoGPT/model.py +0 -330
- package/kill_claude/nanoGPT/sample.py +0 -89
- package/kill_claude/nanoGPT/scaling_laws.ipynb +0 -792
- package/kill_claude/nanoGPT/train.py +0 -336
- package/kill_claude/nanoGPT/transformer_sizing.ipynb +0 -402
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/package.json
CHANGED
|
Binary file
|
|
Binary file
|
|
@@ -113,176 +113,7 @@ class CredentialsManager:
|
|
|
113
113
|
except Exception as e:
|
|
114
114
|
print(f"❌ Error getting input: {e}")
|
|
115
115
|
return None
|
|
116
|
-
|
|
117
|
-
def get_openai_api_key(self):
|
|
118
|
-
"""Get OpenAI API key with validation - for user's repository execution"""
|
|
119
|
-
def validate_openai_key(key):
|
|
120
|
-
# Basic validation - OpenAI keys usually start with "sk-" and are 51 chars
|
|
121
|
-
return key.startswith("sk-") and len(key) > 40
|
|
122
|
-
|
|
123
|
-
# First check stored credentials (user's key)
|
|
124
|
-
credentials = self.load_credentials()
|
|
125
|
-
if "openai_api_key" in credentials:
|
|
126
|
-
stored_key = credentials["openai_api_key"]
|
|
127
|
-
if validate_openai_key(stored_key):
|
|
128
|
-
return stored_key
|
|
129
|
-
|
|
130
|
-
# Then check environment variable
|
|
131
|
-
env_key = os.environ.get("OPENAI_API_KEY")
|
|
132
|
-
if env_key and validate_openai_key(env_key):
|
|
133
|
-
return env_key
|
|
134
|
-
|
|
135
|
-
# For user's repository execution, prompt if no key found
|
|
136
|
-
prompt = "An OpenAI API key is needed to run this repository.\nYou can get your API key from: https://platform.openai.com/api-keys"
|
|
137
|
-
return self.get_credential("openai_api_key", prompt, is_password=True, validate_func=validate_openai_key)
|
|
138
|
-
|
|
139
|
-
def get_gitarsenal_openai_api_key(self):
|
|
140
|
-
"""Get GitArsenal's OpenAI API key for debugging - never prompts user"""
|
|
141
|
-
def validate_openai_key(key):
|
|
142
|
-
# Basic validation - OpenAI keys usually start with "sk-" and are 51 chars
|
|
143
|
-
return key.startswith("sk-") and len(key) > 40
|
|
144
|
-
|
|
145
|
-
# First try to fetch from server using fetch_modal_tokens (GitArsenal's key)
|
|
146
|
-
try:
|
|
147
|
-
from fetch_modal_tokens import get_tokens
|
|
148
|
-
_, _, api_key, _, _ = get_tokens()
|
|
149
|
-
if api_key and validate_openai_key(api_key):
|
|
150
|
-
# Set in environment for future use
|
|
151
|
-
os.environ["OPENAI_API_KEY"] = api_key
|
|
152
|
-
return api_key
|
|
153
|
-
except ImportError:
|
|
154
|
-
pass
|
|
155
|
-
except Exception as e:
|
|
156
|
-
print(f"⚠️ Error fetching GitArsenal API key from server: {e}")
|
|
157
|
-
|
|
158
|
-
# Then check environment variable (for development/testing)
|
|
159
|
-
env_key = os.environ.get("GITARSENAL_OPENAI_API_KEY")
|
|
160
|
-
if env_key and validate_openai_key(env_key):
|
|
161
|
-
return env_key
|
|
162
|
-
|
|
163
|
-
# Check for GitArsenal's key in credentials (for development)
|
|
164
|
-
credentials = self.load_credentials()
|
|
165
|
-
if "gitarsenal_openai_api_key" in credentials:
|
|
166
|
-
stored_key = credentials["gitarsenal_openai_api_key"]
|
|
167
|
-
if validate_openai_key(stored_key):
|
|
168
|
-
return stored_key
|
|
169
|
-
|
|
170
|
-
# If no GitArsenal key found, return None (don't prompt user)
|
|
171
|
-
print("⚠️ GitArsenal's OpenAI API key not available for debugging")
|
|
172
|
-
return None
|
|
173
|
-
|
|
174
|
-
def get_modal_token(self):
|
|
175
|
-
"""Get Modal token with basic validation"""
|
|
176
|
-
# First check if we have a built-in token from setup_modal_token.py
|
|
177
|
-
try:
|
|
178
|
-
from setup_modal_token import BUILT_IN_MODAL_TOKEN
|
|
179
|
-
return BUILT_IN_MODAL_TOKEN
|
|
180
|
-
except ImportError:
|
|
181
|
-
pass
|
|
182
|
-
|
|
183
|
-
# Fall back to credentials file if needed
|
|
184
|
-
credentials = self.load_credentials()
|
|
185
|
-
if "modal_token" in credentials:
|
|
186
|
-
return credentials["modal_token"]
|
|
187
|
-
|
|
188
|
-
# Return the built-in token as a last resort
|
|
189
|
-
return "mo-abcdef1234567890abcdef1234567890" # Same as in setup_modal_token.py
|
|
190
|
-
|
|
191
|
-
def get_huggingface_token(self):
|
|
192
|
-
"""Get Hugging Face token with basic validation"""
|
|
193
|
-
def validate_hf_token(token):
|
|
194
|
-
# HF tokens are typically non-empty strings
|
|
195
|
-
return bool(token) and len(token) > 8
|
|
196
|
-
|
|
197
|
-
# First check stored credentials
|
|
198
|
-
credentials = self.load_credentials()
|
|
199
|
-
if "huggingface_token" in credentials:
|
|
200
|
-
stored_token = credentials["huggingface_token"]
|
|
201
|
-
if validate_hf_token(stored_token):
|
|
202
|
-
return stored_token
|
|
203
|
-
|
|
204
|
-
prompt = "A Hugging Face token is required.\nYou can get your token from: https://huggingface.co/settings/tokens"
|
|
205
|
-
return self.get_credential("huggingface_token", prompt, is_password=True, validate_func=validate_hf_token)
|
|
206
|
-
|
|
207
|
-
def get_wandb_api_key(self):
|
|
208
|
-
"""Get Weights & Biases API key with validation"""
|
|
209
|
-
def validate_wandb_key(key):
|
|
210
|
-
# W&B API keys are typically 40 characters
|
|
211
|
-
return len(key) == 40
|
|
212
|
-
|
|
213
|
-
# First check stored credentials
|
|
214
|
-
credentials = self.load_credentials()
|
|
215
|
-
if "wandb_api_key" in credentials:
|
|
216
|
-
stored_key = credentials["wandb_api_key"]
|
|
217
|
-
if validate_wandb_key(stored_key):
|
|
218
|
-
return stored_key
|
|
219
|
-
|
|
220
|
-
prompt = "A Weights & Biases API key is required.\nYou can get your API key from: https://wandb.ai/authorize"
|
|
221
|
-
return self.get_credential("wandb_api_key", prompt, is_password=True, validate_func=validate_wandb_key)
|
|
222
|
-
|
|
223
|
-
def get_anthropic_api_key(self):
|
|
224
|
-
"""Get Anthropic API key with validation"""
|
|
225
|
-
def validate_anthropic_key(key):
|
|
226
|
-
# Anthropic keys usually start with "sk-ant-" and are typically 48+ characters
|
|
227
|
-
return key.startswith("sk-ant-") and len(key) > 40
|
|
228
|
-
|
|
229
|
-
# First check stored credentials
|
|
230
|
-
credentials = self.load_credentials()
|
|
231
|
-
if "anthropic_api_key" in credentials:
|
|
232
|
-
stored_key = credentials["anthropic_api_key"]
|
|
233
|
-
if validate_anthropic_key(stored_key):
|
|
234
|
-
return stored_key
|
|
235
|
-
|
|
236
|
-
# Then check environment variable
|
|
237
|
-
env_key = os.environ.get("ANTHROPIC_API_KEY")
|
|
238
|
-
if env_key and validate_anthropic_key(env_key):
|
|
239
|
-
return env_key
|
|
240
|
-
|
|
241
|
-
prompt = "An Anthropic API key is required.\nYou can get your API key from: https://console.anthropic.com/"
|
|
242
|
-
return self.get_credential("anthropic_api_key", prompt, is_password=True, validate_func=validate_anthropic_key)
|
|
243
116
|
|
|
244
|
-
def get_groq_api_key(self):
|
|
245
|
-
"""Get Groq API key with validation"""
|
|
246
|
-
def validate_groq_key(key):
|
|
247
|
-
# Groq keys are non-empty; basic length check
|
|
248
|
-
return bool(key) and len(key) > 20
|
|
249
|
-
|
|
250
|
-
# First check stored credentials
|
|
251
|
-
credentials = self.load_credentials()
|
|
252
|
-
if "groq_api_key" in credentials:
|
|
253
|
-
stored_key = credentials["groq_api_key"]
|
|
254
|
-
if validate_groq_key(stored_key):
|
|
255
|
-
return stored_key
|
|
256
|
-
|
|
257
|
-
# Then check environment variable
|
|
258
|
-
env_key = os.environ.get("GROQ_API_KEY")
|
|
259
|
-
if env_key and validate_groq_key(env_key):
|
|
260
|
-
return env_key
|
|
261
|
-
|
|
262
|
-
prompt = "A Groq API key is required for Groq models.\nYou can get your key from: https://console.groq.com/keys"
|
|
263
|
-
return self.get_credential("groq_api_key", prompt, is_password=True, validate_func=validate_groq_key)
|
|
264
|
-
|
|
265
|
-
def get_exa_api_key(self):
|
|
266
|
-
"""Get Exa API key with validation"""
|
|
267
|
-
def validate_exa_key(key):
|
|
268
|
-
# Exa API keys are typically 32+ characters
|
|
269
|
-
return len(key) >= 32
|
|
270
|
-
|
|
271
|
-
# First check stored credentials
|
|
272
|
-
credentials = self.load_credentials()
|
|
273
|
-
if "exa_api_key" in credentials:
|
|
274
|
-
stored_key = credentials["exa_api_key"]
|
|
275
|
-
if validate_exa_key(stored_key):
|
|
276
|
-
return stored_key
|
|
277
|
-
|
|
278
|
-
# Then check environment variable
|
|
279
|
-
env_key = os.environ.get("EXA_API_KEY")
|
|
280
|
-
if env_key and validate_exa_key(env_key):
|
|
281
|
-
return env_key
|
|
282
|
-
|
|
283
|
-
prompt = "An Exa API key is required for web search functionality.\nYou can get your API key from: https://exa.ai/"
|
|
284
|
-
return self.get_credential("exa_api_key", prompt, is_password=True, validate_func=validate_exa_key)
|
|
285
|
-
|
|
286
117
|
def clear_credential(self, key):
|
|
287
118
|
"""Remove a specific credential"""
|
|
288
119
|
credentials = self.load_credentials()
|
|
@@ -16,6 +16,7 @@ def main():
|
|
|
16
16
|
parser.add_argument('command', choices=['add', 'list', 'view', 'delete'], help='Command to execute')
|
|
17
17
|
parser.add_argument('--service', help='Service name (openai_api_key, WANDB_API_KEY, HUGGINGFACE_TOKEN, modal_token)')
|
|
18
18
|
parser.add_argument('--key', help='API key (for add command)')
|
|
19
|
+
parser.add_argument('--json', action='store_true', help='Output in JSON format (for list command)')
|
|
19
20
|
|
|
20
21
|
args = parser.parse_args()
|
|
21
22
|
|
|
@@ -25,7 +26,7 @@ def main():
|
|
|
25
26
|
if args.command == 'add':
|
|
26
27
|
handle_add(credentials_manager, args)
|
|
27
28
|
elif args.command == 'list':
|
|
28
|
-
handle_list(credentials_manager)
|
|
29
|
+
handle_list(credentials_manager, args)
|
|
29
30
|
elif args.command == 'view':
|
|
30
31
|
handle_view(credentials_manager, args)
|
|
31
32
|
elif args.command == 'delete':
|
|
@@ -94,10 +95,15 @@ def handle_add(credentials_manager, args):
|
|
|
94
95
|
print("❌ Failed to save API key")
|
|
95
96
|
sys.exit(1)
|
|
96
97
|
|
|
97
|
-
def handle_list(credentials_manager):
|
|
98
|
+
def handle_list(credentials_manager, args=None):
|
|
98
99
|
"""Handle listing all stored API keys"""
|
|
99
100
|
credentials = credentials_manager.load_credentials()
|
|
100
101
|
|
|
102
|
+
if args and args.json:
|
|
103
|
+
# Return JSON format for programmatic access
|
|
104
|
+
print(json.dumps(credentials))
|
|
105
|
+
return
|
|
106
|
+
|
|
101
107
|
if not credentials:
|
|
102
108
|
print("📭 No API keys stored")
|
|
103
109
|
return
|
|
@@ -46,21 +46,6 @@ def ssh_container_function(ssh_password=None, repo_url=None, repo_name=None, set
|
|
|
46
46
|
import os
|
|
47
47
|
import json
|
|
48
48
|
import sys
|
|
49
|
-
import modal
|
|
50
|
-
|
|
51
|
-
print("🐳 SSH Container Function Started!")
|
|
52
|
-
print(f"📋 Parameters received:")
|
|
53
|
-
print(f" - SSH Password: {'***' if ssh_password else 'None'}")
|
|
54
|
-
print(f" - Repo URL: {repo_url or 'None'}")
|
|
55
|
-
print(f" - Repo Name: {repo_name or 'None'}")
|
|
56
|
-
print(f" - Setup Commands: {len(setup_commands) if setup_commands else 0} commands")
|
|
57
|
-
print(f" - OpenAI API Key: {'Set' if openai_api_key else 'Not set'}")
|
|
58
|
-
print(f" - Anthropic API Key: {'Set' if anthropic_api_key else 'Not set'}")
|
|
59
|
-
print(f" - Stored Credentials: {len(stored_credentials) if stored_credentials else 0} items")
|
|
60
|
-
|
|
61
|
-
# Import only the modules we actually need (none currently for Agent-based approach)
|
|
62
|
-
# Note: CommandListManager and llm_debugging functions are not used in the Agent-based approach
|
|
63
|
-
print("✅ Container setup complete - using Agent-based repository setup")
|
|
64
49
|
|
|
65
50
|
# Set root password
|
|
66
51
|
subprocess.run(["bash", "-c", f"echo 'root:{ssh_password}' | chpasswd"], check=True)
|
|
@@ -113,7 +98,6 @@ def ssh_container_function(ssh_password=None, repo_url=None, repo_name=None, set
|
|
|
113
98
|
os.environ['ANTHROPIC_API_KEY'] = anthropic_api_key
|
|
114
99
|
|
|
115
100
|
# Set up Anthropic API key from stored credentials
|
|
116
|
-
anthropic_api_key = None
|
|
117
101
|
if stored_credentials:
|
|
118
102
|
# Look for Anthropic API key in various possible names
|
|
119
103
|
for key_name in ['ANTHROPIC_API_KEY', 'anthropic_api_key', 'anthropic-api-key']:
|
|
@@ -566,35 +550,21 @@ def create_modal_ssh_container(gpu_type, repo_url=None, repo_name=None, setup_co
|
|
|
566
550
|
try:
|
|
567
551
|
# Wait for the function to start and print connection info (with timeout)
|
|
568
552
|
print("⏳ Waiting for container to initialize...")
|
|
569
|
-
|
|
570
|
-
# Use a timeout to see if the container is starting properly
|
|
571
|
-
print("🔍 Checking container status with 30-second timeout...")
|
|
572
553
|
try:
|
|
573
|
-
|
|
554
|
+
print("\n⏳ Monitoring container (press Ctrl+C to stop monitoring)...")
|
|
555
|
+
result = function_call.get() # Wait indefinitely
|
|
574
556
|
print(f"🔚 Container function completed with result: {result}")
|
|
575
|
-
except
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
print(f"🔚 Container function completed with result: {result}")
|
|
587
|
-
except KeyboardInterrupt:
|
|
588
|
-
print("\n🛑 Stopped monitoring. Container is still running remotely.")
|
|
589
|
-
print("💡 Use Modal's web UI or CLI to stop the container when done.")
|
|
590
|
-
print("🔒 Keeping tokens active since container is still running.")
|
|
591
|
-
return {
|
|
592
|
-
"app_name": app_name,
|
|
593
|
-
"ssh_password": ssh_password,
|
|
594
|
-
"volume_name": volume_name,
|
|
595
|
-
"status": "monitoring_stopped",
|
|
596
|
-
"function_call_id": function_call.object_id
|
|
597
|
-
}
|
|
557
|
+
except KeyboardInterrupt:
|
|
558
|
+
print("\n🛑 Stopped monitoring. Container is still running remotely.")
|
|
559
|
+
print("💡 Use Modal's web UI or CLI to stop the container when done.")
|
|
560
|
+
print("🔒 Keeping tokens active since container is still running.")
|
|
561
|
+
return {
|
|
562
|
+
"app_name": app_name,
|
|
563
|
+
"ssh_password": ssh_password,
|
|
564
|
+
"volume_name": volume_name,
|
|
565
|
+
"status": "monitoring_stopped",
|
|
566
|
+
"function_call_id": function_call.object_id
|
|
567
|
+
}
|
|
598
568
|
|
|
599
569
|
except KeyboardInterrupt:
|
|
600
570
|
print("\n🛑 Interrupted by user. Container may still be running remotely.")
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2022 Andrej Karpathy
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
@@ -1,227 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# nanoGPT
|
|
3
|
-
|
|
4
|
-

|
|
5
|
-
|
|
6
|
-
The simplest, fastest repository for training/finetuning medium-sized GPTs. It is a rewrite of [minGPT](https://github.com/karpathy/minGPT) that prioritizes teeth over education. Still under active development, but currently the file `train.py` reproduces GPT-2 (124M) on OpenWebText, running on a single 8XA100 40GB node in about 4 days of training. The code itself is plain and readable: `train.py` is a ~300-line boilerplate training loop and `model.py` a ~300-line GPT model definition, which can optionally load the GPT-2 weights from OpenAI. That's it.
|
|
7
|
-
|
|
8
|
-

|
|
9
|
-
|
|
10
|
-
Because the code is so simple, it is very easy to hack to your needs, train new models from scratch, or finetune pretrained checkpoints (e.g. biggest one currently available as a starting point would be the GPT-2 1.3B model from OpenAI).
|
|
11
|
-
|
|
12
|
-
## install
|
|
13
|
-
|
|
14
|
-
```
|
|
15
|
-
pip install torch numpy transformers datasets tiktoken wandb tqdm
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
Dependencies:
|
|
19
|
-
|
|
20
|
-
- [pytorch](https://pytorch.org) <3
|
|
21
|
-
- [numpy](https://numpy.org/install/) <3
|
|
22
|
-
- `transformers` for huggingface transformers <3 (to load GPT-2 checkpoints)
|
|
23
|
-
- `datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText)
|
|
24
|
-
- `tiktoken` for OpenAI's fast BPE code <3
|
|
25
|
-
- `wandb` for optional logging <3
|
|
26
|
-
- `tqdm` for progress bars <3
|
|
27
|
-
|
|
28
|
-
## quick start
|
|
29
|
-
|
|
30
|
-
If you are not a deep learning professional and you just want to feel the magic and get your feet wet, the fastest way to get started is to train a character-level GPT on the works of Shakespeare. First, we download it as a single (1MB) file and turn it from raw text into one large stream of integers:
|
|
31
|
-
|
|
32
|
-
```sh
|
|
33
|
-
python data/shakespeare_char/prepare.py
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
This creates a `train.bin` and `val.bin` in that data directory. Now it is time to train your GPT. The size of it very much depends on the computational resources of your system:
|
|
37
|
-
|
|
38
|
-
**I have a GPU**. Great, we can quickly train a baby GPT with the settings provided in the [config/train_shakespeare_char.py](config/train_shakespeare_char.py) config file:
|
|
39
|
-
|
|
40
|
-
```sh
|
|
41
|
-
python train.py config/train_shakespeare_char.py
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
If you peek inside it, you'll see that we're training a GPT with a context size of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer with 6 heads in each layer. On one A100 GPU this training run takes about 3 minutes and the best validation loss is 1.4697. Based on the configuration, the model checkpoints are being written into the `--out_dir` directory `out-shakespeare-char`. So once the training finishes we can sample from the best model by pointing the sampling script at this directory:
|
|
45
|
-
|
|
46
|
-
```sh
|
|
47
|
-
python sample.py --out_dir=out-shakespeare-char
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
This generates a few samples, for example:
|
|
51
|
-
|
|
52
|
-
```
|
|
53
|
-
ANGELO:
|
|
54
|
-
And cowards it be strawn to my bed,
|
|
55
|
-
And thrust the gates of my threats,
|
|
56
|
-
Because he that ale away, and hang'd
|
|
57
|
-
An one with him.
|
|
58
|
-
|
|
59
|
-
DUKE VINCENTIO:
|
|
60
|
-
I thank your eyes against it.
|
|
61
|
-
|
|
62
|
-
DUKE VINCENTIO:
|
|
63
|
-
Then will answer him to save the malm:
|
|
64
|
-
And what have you tyrannous shall do this?
|
|
65
|
-
|
|
66
|
-
DUKE VINCENTIO:
|
|
67
|
-
If you have done evils of all disposition
|
|
68
|
-
To end his power, the day of thrust for a common men
|
|
69
|
-
That I leave, to fight with over-liking
|
|
70
|
-
Hasting in a roseman.
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
lol `¯\_(ツ)_/¯`. Not bad for a character-level model after 3 minutes of training on a GPU. Better results are quite likely obtainable by instead finetuning a pretrained GPT-2 model on this dataset (see finetuning section later).
|
|
74
|
-
|
|
75
|
-
**I only have a macbook** (or other cheap computer). No worries, we can still train a GPT but we want to dial things down a notch. I recommend getting the bleeding edge PyTorch nightly ([select it here](https://pytorch.org/get-started/locally/) when installing) as it is currently quite likely to make your code more efficient. But even without it, a simple train run could look as follows:
|
|
76
|
-
|
|
77
|
-
```sh
|
|
78
|
-
python train.py config/train_shakespeare_char.py --device=cpu --compile=False --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
Here, since we are running on CPU instead of GPU, we must set `--device=cpu` and also turn off PyTorch 2.0 compile with `--compile=False`. Then when we evaluate we get a slightly noisier but faster estimate (`--eval_iters=20`, down from 200), our context size is only 64 characters instead of 256, and the batch size is only 12 examples per iteration, not 64. We'll also use a much smaller Transformer (4 layers, 4 heads, 128 embedding size), and decrease the number of iterations to 2000 (and correspondingly decay the learning rate over about max_iters iterations with `--lr_decay_iters`). Because our network is so small we also ease down on regularization (`--dropout=0.0`). This still runs in about 3 minutes, but gets us a loss of only 1.88 and therefore also worse samples; it's still good fun:
|
|
82
|
-
|
|
83
|
-
```sh
|
|
84
|
-
python sample.py --out_dir=out-shakespeare-char --device=cpu
|
|
85
|
-
```
|
|
86
|
-
Generates samples like this:
|
|
87
|
-
|
|
88
|
-
```
|
|
89
|
-
GLEORKEN VINGHARD III:
|
|
90
|
-
Whell's the couse, the came light gacks,
|
|
91
|
-
And the for mought you in Aut fries the not high shee
|
|
92
|
-
bot thou the sought bechive in that to doth groan you,
|
|
93
|
-
No relving thee post mose the wear
|
|
94
|
-
```
|
|
95
|
-
|
|
96
|
-
Not bad for ~3 minutes on a CPU, for a hint of the right character gestalt. If you're willing to wait longer, feel free to tune the hyperparameters, increase the size of the network, the context length (`--block_size`), the length of training, etc.
|
|
97
|
-
|
|
98
|
-
Finally, on Apple Silicon Macbooks and with a recent PyTorch version make sure to add `--device=mps` (short for "Metal Performance Shaders"); PyTorch then uses the on-chip GPU that can *significantly* accelerate training (2-3X) and allow you to use larger networks. See [Issue 28](https://github.com/karpathy/nanoGPT/issues/28) for more.
|
|
99
|
-
|
|
100
|
-
## reproducing GPT-2
|
|
101
|
-
|
|
102
|
-
A more serious deep learning professional may be more interested in reproducing GPT-2 results. So here we go - we first tokenize the dataset, in this case the [OpenWebText](https://openwebtext2.readthedocs.io/en/latest/), an open reproduction of OpenAI's (private) WebText:
|
|
103
|
-
|
|
104
|
-
```sh
|
|
105
|
-
python data/openwebtext/prepare.py
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
This downloads and tokenizes the [OpenWebText](https://huggingface.co/datasets/openwebtext) dataset. It will create a `train.bin` and `val.bin` which holds the GPT2 BPE token ids in one sequence, stored as raw uint16 bytes. Then we're ready to kick off training. To reproduce GPT-2 (124M) you'll want at least an 8X A100 40GB node and run:
|
|
109
|
-
|
|
110
|
-
```sh
|
|
111
|
-
torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
This will run for about 4 days using PyTorch Distributed Data Parallel (DDP) and go down to a loss of ~2.85. Now, a GPT-2 model just evaluated on OWT gets a val loss of about 3.11, but if you finetune it, it will come down to ~2.85 territory (due to an apparent domain gap), making the two models ~match.
|
|
115
|
-
|
|
116
|
-
If you're in a cluster environment and you are blessed with multiple GPU nodes you can make GPU go brrrr e.g. across 2 nodes like:
|
|
117
|
-
|
|
118
|
-
```sh
|
|
119
|
-
# Run on the first (master) node with example IP 123.456.123.456:
|
|
120
|
-
torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
|
|
121
|
-
# Run on the worker node:
|
|
122
|
-
torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
It is a good idea to benchmark your interconnect (e.g. iperf3). In particular, if you don't have Infiniband then also prepend `NCCL_IB_DISABLE=1` to the above launches. Your multinode training will work, but most likely _crawl_. By default checkpoints are periodically written to the `--out_dir`. We can sample from the model by simply `python sample.py`.
|
|
126
|
-
|
|
127
|
-
Finally, to train on a single GPU simply run the `python train.py` script. Have a look at all of its args, the script tries to be very readable, hackable and transparent. You'll most likely want to tune a number of those variables depending on your needs.
|
|
128
|
-
|
|
129
|
-
## baselines
|
|
130
|
-
|
|
131
|
-
OpenAI GPT-2 checkpoints allow us to get some baselines in place for openwebtext. We can get the numbers as follows:
|
|
132
|
-
|
|
133
|
-
```sh
|
|
134
|
-
$ python train.py config/eval_gpt2.py
|
|
135
|
-
$ python train.py config/eval_gpt2_medium.py
|
|
136
|
-
$ python train.py config/eval_gpt2_large.py
|
|
137
|
-
$ python train.py config/eval_gpt2_xl.py
|
|
138
|
-
```
|
|
139
|
-
|
|
140
|
-
and observe the following losses on train and val:
|
|
141
|
-
|
|
142
|
-
| model | params | train loss | val loss |
|
|
143
|
-
| ------| ------ | ---------- | -------- |
|
|
144
|
-
| gpt2 | 124M | 3.11 | 3.12 |
|
|
145
|
-
| gpt2-medium | 350M | 2.85 | 2.84 |
|
|
146
|
-
| gpt2-large | 774M | 2.66 | 2.67 |
|
|
147
|
-
| gpt2-xl | 1558M | 2.56 | 2.54 |
|
|
148
|
-
|
|
149
|
-
However, we have to note that GPT-2 was trained on (closed, never released) WebText, while OpenWebText is just a best-effort open reproduction of this dataset. This means there is a dataset domain gap. Indeed, taking the GPT-2 (124M) checkpoint and finetuning it directly on OWT for a while brings the loss down to ~2.85. This then becomes the more appropriate baseline w.r.t. reproduction.
|
|
150
|
-
|
|
151
|
-
## finetuning
|
|
152
|
-
|
|
153
|
-
Finetuning is no different than training, we just make sure to initialize from a pretrained model and train with a smaller learning rate. For an example of how to finetune a GPT on new text go to `data/shakespeare` and run `prepare.py` to download the tiny shakespeare dataset and render it into a `train.bin` and `val.bin`, using the OpenAI BPE tokenizer from GPT-2. Unlike OpenWebText this will run in seconds. Finetuning can take very little time, e.g. on a single GPU just a few minutes. Run an example finetuning like:
|
|
154
|
-
|
|
155
|
-
```sh
|
|
156
|
-
python train.py config/finetune_shakespeare.py
|
|
157
|
-
```
|
|
158
|
-
|
|
159
|
-
This will load the config parameter overrides in `config/finetune_shakespeare.py` (I didn't tune them much though). Basically, we initialize from a GPT2 checkpoint with `init_from` and train as normal, except shorter and with a small learning rate. If you're running out of memory try decreasing the model size (they are `{'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}`) or possibly decreasing the `block_size` (context length). The best checkpoint (lowest validation loss) will be in the `out_dir` directory, e.g. in `out-shakespeare` by default, per the config file. You can then run the code in `sample.py --out_dir=out-shakespeare`:
|
|
160
|
-
|
|
161
|
-
```
|
|
162
|
-
THEODORE:
|
|
163
|
-
Thou shalt sell me to the highest bidder: if I die,
|
|
164
|
-
I sell thee to the first; if I go mad,
|
|
165
|
-
I sell thee to the second; if I
|
|
166
|
-
lie, I sell thee to the third; if I slay,
|
|
167
|
-
I sell thee to the fourth: so buy or sell,
|
|
168
|
-
I tell thee again, thou shalt not sell my
|
|
169
|
-
possession.
|
|
170
|
-
|
|
171
|
-
JULIET:
|
|
172
|
-
And if thou steal, thou shalt not sell thyself.
|
|
173
|
-
|
|
174
|
-
THEODORE:
|
|
175
|
-
I do not steal; I sell the stolen goods.
|
|
176
|
-
|
|
177
|
-
THEODORE:
|
|
178
|
-
Thou know'st not what thou sell'st; thou, a woman,
|
|
179
|
-
Thou art ever a victim, a thing of no worth:
|
|
180
|
-
Thou hast no right, no right, but to be sold.
|
|
181
|
-
```
|
|
182
|
-
|
|
183
|
-
Whoa there, GPT, entering some dark place over there. I didn't really tune the hyperparameters in the config too much, feel free to try!
|
|
184
|
-
|
|
185
|
-
## sampling / inference
|
|
186
|
-
|
|
187
|
-
Use the script `sample.py` to sample either from pre-trained GPT-2 models released by OpenAI, or from a model you trained yourself. For example, here is a way to sample from the largest available `gpt2-xl` model:
|
|
188
|
-
|
|
189
|
-
```sh
|
|
190
|
-
python sample.py \
|
|
191
|
-
--init_from=gpt2-xl \
|
|
192
|
-
--start="What is the answer to life, the universe, and everything?" \
|
|
193
|
-
--num_samples=5 --max_new_tokens=100
|
|
194
|
-
```
|
|
195
|
-
|
|
196
|
-
If you'd like to sample from a model you trained, use the `--out_dir` to point the code appropriately. You can also prompt the model with some text from a file, e.g. ```python sample.py --start=FILE:prompt.txt```.
|
|
197
|
-
|
|
198
|
-
## efficiency notes
|
|
199
|
-
|
|
200
|
-
For simple model benchmarking and profiling, `bench.py` might be useful. It's identical to what happens in the meat of the training loop of `train.py`, but omits much of the other complexities.
|
|
201
|
-
|
|
202
|
-
Note that the code by default uses [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). At the time of writing (Dec 29, 2022) this makes `torch.compile()` available in the nightly release. The improvement from the one line of code is noticeable, e.g. cutting down iteration time from ~250ms / iter to 135ms / iter. Nice work PyTorch team!
|
|
203
|
-
|
|
204
|
-
## todos
|
|
205
|
-
|
|
206
|
-
- Investigate and add FSDP instead of DDP
|
|
207
|
-
- Eval zero-shot perplexities on standard evals (e.g. LAMBADA? HELM? etc.)
|
|
208
|
-
- Finetune the finetuning script, I think the hyperparams are not great
|
|
209
|
-
- Schedule for linear batch size increase during training
|
|
210
|
-
- Incorporate other embeddings (rotary, alibi)
|
|
211
|
-
- Separate out the optim buffers from model params in checkpoints I think
|
|
212
|
-
- Additional logging around network health (e.g. gradient clip events, magnitudes)
|
|
213
|
-
- Few more investigations around better init etc.
|
|
214
|
-
|
|
215
|
-
## troubleshooting
|
|
216
|
-
|
|
217
|
-
Note that by default this repo uses PyTorch 2.0 (i.e. `torch.compile`). This is fairly new and experimental, and not yet available on all platforms (e.g. Windows). If you're running into related error messages try to disable this by adding `--compile=False` flag. This will slow down the code but at least it will run.
|
|
218
|
-
|
|
219
|
-
For some context on this repository, GPT, and language modeling it might be helpful to watch my [Zero To Hero series](https://karpathy.ai/zero-to-hero.html). Specifically, the [GPT video](https://www.youtube.com/watch?v=kCc8FmEb1nY) is popular if you have some prior language modeling context.
|
|
220
|
-
|
|
221
|
-
For more questions/discussions feel free to stop by **#nanoGPT** on Discord:
|
|
222
|
-
|
|
223
|
-
[Join the #nanoGPT Discord](https://discord.gg/3zy8kqD9Cp)
|
|
224
|
-
|
|
225
|
-
## acknowledgements
|
|
226
|
-
|
|
227
|
-
All nanoGPT experiments are powered by GPUs on [Lambda labs](https://lambdalabs.com), my favorite Cloud GPU provider. Thank you Lambda labs for sponsoring nanoGPT!
|
|
Binary file
|
|
Binary file
|