nanogpt 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data/openwebtext/prepare.rb +13 -4
- data/data/shakespeare/prepare.rb +6 -2
- data/data/shakespeare_char/prepare.rb +7 -3
- data/exe/nanogpt +43 -4
- data/lib/nano_gpt/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cf308fcec8ccec074200361b2327a2381a5412b92497391bb71ec5f154cd1283
|
|
4
|
+
data.tar.gz: 87dcc389df03af0ac59fc0e75bbef7be7071f00613a2662cecf365ba38bc853e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2b6ceeb10236b639c82398c94d3c1a876eff549a17289ff45abae489e480a3d6a1db45f95a4b2e56f1d1df6afde28159965b188342d1e38c2482359dfb11e061
|
|
7
|
+
data.tar.gz: 0fd4653c2719d1d3c339a904437fd0657f17e49fa0a9d4361a3a259806fab8dc798ed7f179722b41913d3471e8799abf78823f207988987d86cba680d7f90f03
|
data/data/openwebtext/prepare.rb
CHANGED
|
@@ -33,7 +33,10 @@ require "rubygems/package"
|
|
|
33
33
|
require "zlib"
|
|
34
34
|
|
|
35
35
|
SCRIPT_DIR = File.dirname(__FILE__)
|
|
36
|
-
|
|
36
|
+
OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
|
|
37
|
+
RAW_DIR = File.join(OUTPUT_DIR, "raw")
|
|
38
|
+
|
|
39
|
+
FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
|
|
37
40
|
DEFAULT_VAL_RATIO = 0.0005 # ~0.5% for validation
|
|
38
41
|
|
|
39
42
|
def parse_args
|
|
@@ -60,11 +63,17 @@ def parse_args
|
|
|
60
63
|
end
|
|
61
64
|
|
|
62
65
|
def find_data_files
|
|
63
|
-
# Look for various supported formats
|
|
66
|
+
# Look for various supported formats in both SCRIPT_DIR (gem) and OUTPUT_DIR (local)
|
|
64
67
|
patterns = [
|
|
65
68
|
File.join(RAW_DIR, "**", "*.parquet"), # Parquet (from Python export)
|
|
66
69
|
File.join(RAW_DIR, "**", "*.tar"), # Original tar files
|
|
67
70
|
File.join(RAW_DIR, "**", "*.txt"), # Plain text files
|
|
71
|
+
File.join(OUTPUT_DIR, "*.parquet"),
|
|
72
|
+
File.join(OUTPUT_DIR, "*.tar"),
|
|
73
|
+
File.join(OUTPUT_DIR, "*.txt"),
|
|
74
|
+
File.join(SCRIPT_DIR, "raw", "**", "*.parquet"),
|
|
75
|
+
File.join(SCRIPT_DIR, "raw", "**", "*.tar"),
|
|
76
|
+
File.join(SCRIPT_DIR, "raw", "**", "*.txt"),
|
|
68
77
|
File.join(SCRIPT_DIR, "*.parquet"),
|
|
69
78
|
File.join(SCRIPT_DIR, "*.tar"),
|
|
70
79
|
File.join(SCRIPT_DIR, "*.txt")
|
|
@@ -273,8 +282,8 @@ def main
|
|
|
273
282
|
|
|
274
283
|
# Write binary files
|
|
275
284
|
puts "Writing binary files..."
|
|
276
|
-
write_binary(train_tokens, File.join(
|
|
277
|
-
write_binary(val_tokens, File.join(
|
|
285
|
+
write_binary(train_tokens, File.join(OUTPUT_DIR, "train.bin"))
|
|
286
|
+
write_binary(val_tokens, File.join(OUTPUT_DIR, "val.bin"))
|
|
278
287
|
|
|
279
288
|
puts ""
|
|
280
289
|
puts "Done! OpenWebText dataset prepared."
|
data/data/shakespeare/prepare.rb
CHANGED
|
@@ -11,9 +11,13 @@ require "net/http"
|
|
|
11
11
|
require "openssl"
|
|
12
12
|
require "numo/narray"
|
|
13
13
|
require "tiktoken_ruby"
|
|
14
|
+
require "fileutils"
|
|
14
15
|
|
|
15
16
|
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
|
|
16
17
|
SCRIPT_DIR = File.dirname(__FILE__)
|
|
18
|
+
OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
|
|
19
|
+
|
|
20
|
+
FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
|
|
17
21
|
|
|
18
22
|
def download_file(url)
|
|
19
23
|
uri = URI(url)
|
|
@@ -53,8 +57,8 @@ puts "Val has #{val_ids.length} tokens"
|
|
|
53
57
|
# Export to binary files (uint16)
|
|
54
58
|
train_arr = Numo::UInt16.cast(train_ids)
|
|
55
59
|
val_arr = Numo::UInt16.cast(val_ids)
|
|
56
|
-
File.binwrite(File.join(
|
|
57
|
-
File.binwrite(File.join(
|
|
60
|
+
File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
|
|
61
|
+
File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
|
|
58
62
|
|
|
59
63
|
# No meta.json - indicates GPT-2 BPE tokenizer should be used
|
|
60
64
|
puts "Done! Created train.bin and val.bin"
|
data/data/shakespeare_char/prepare.rb
CHANGED
|
@@ -8,9 +8,13 @@ require "net/http"
|
|
|
8
8
|
require "openssl"
|
|
9
9
|
require "numo/narray"
|
|
10
10
|
require "json"
|
|
11
|
+
require "fileutils"
|
|
11
12
|
|
|
12
13
|
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
|
|
13
14
|
SCRIPT_DIR = File.dirname(__FILE__)
|
|
15
|
+
OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
|
|
16
|
+
|
|
17
|
+
FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
|
|
14
18
|
|
|
15
19
|
def download_file(url)
|
|
16
20
|
uri = URI(url)
|
|
@@ -59,8 +63,8 @@ puts "Val has #{val_ids.length} tokens"
|
|
|
59
63
|
# Export to binary files (uint16)
|
|
60
64
|
train_arr = Numo::UInt16.cast(train_ids)
|
|
61
65
|
val_arr = Numo::UInt16.cast(val_ids)
|
|
62
|
-
File.binwrite(File.join(
|
|
63
|
-
File.binwrite(File.join(
|
|
66
|
+
File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
|
|
67
|
+
File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
|
|
64
68
|
|
|
65
69
|
# Save meta information as JSON
|
|
66
70
|
meta = {
|
|
@@ -68,6 +72,6 @@ meta = {
|
|
|
68
72
|
"itos" => itos.transform_keys(&:to_s),
|
|
69
73
|
"stoi" => stoi
|
|
70
74
|
}
|
|
71
|
-
File.write(File.join(
|
|
75
|
+
File.write(File.join(OUTPUT_DIR, "meta.json"), JSON.pretty_generate(meta))
|
|
72
76
|
|
|
73
77
|
puts "Done! Created train.bin, val.bin, and meta.json"
|
data/exe/nanogpt
CHANGED
|
@@ -6,7 +6,7 @@ $stdout.sync = true
|
|
|
6
6
|
require "nano_gpt"
|
|
7
7
|
|
|
8
8
|
class NanoGPTCLI
|
|
9
|
-
COMMANDS = %w[train sample bench version help].freeze
|
|
9
|
+
COMMANDS = %w[prepare train sample bench version help].freeze
|
|
10
10
|
|
|
11
11
|
def initialize(args)
|
|
12
12
|
@command = args.shift
|
|
@@ -15,6 +15,8 @@ class NanoGPTCLI
|
|
|
15
15
|
|
|
16
16
|
def run
|
|
17
17
|
case @command
|
|
18
|
+
when "prepare"
|
|
19
|
+
prepare
|
|
18
20
|
when "train"
|
|
19
21
|
train
|
|
20
22
|
when "sample"
|
|
@@ -35,6 +37,42 @@ class NanoGPTCLI
|
|
|
35
37
|
|
|
36
38
|
private
|
|
37
39
|
|
|
40
|
+
def prepare
|
|
41
|
+
dataset = @args.first
|
|
42
|
+
|
|
43
|
+
# Find available datasets
|
|
44
|
+
data_dir = File.join(File.dirname(__FILE__), "..", "data")
|
|
45
|
+
available = Dir.glob(File.join(data_dir, "*", "prepare.rb")).map do |path|
|
|
46
|
+
File.basename(File.dirname(path))
|
|
47
|
+
end.sort
|
|
48
|
+
|
|
49
|
+
if dataset.nil?
|
|
50
|
+
puts "Usage: nanogpt prepare <dataset>"
|
|
51
|
+
puts ""
|
|
52
|
+
puts "Available datasets:"
|
|
53
|
+
available.each { |d| puts " #{d}" }
|
|
54
|
+
exit 1
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
prepare_script = File.join(data_dir, dataset, "prepare.rb")
|
|
58
|
+
|
|
59
|
+
unless File.exist?(prepare_script)
|
|
60
|
+
puts "Error: Unknown dataset '#{dataset}'"
|
|
61
|
+
puts ""
|
|
62
|
+
puts "Available datasets:"
|
|
63
|
+
available.each { |d| puts " #{d}" }
|
|
64
|
+
exit 1
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
# Set output directory to current working directory
|
|
68
|
+
output_dir = File.join(Dir.pwd, "data", dataset)
|
|
69
|
+
ENV["NANOGPT_DATA_DIR"] = output_dir
|
|
70
|
+
|
|
71
|
+
puts "Preparing dataset: #{dataset}"
|
|
72
|
+
puts "Output directory: #{output_dir}"
|
|
73
|
+
load prepare_script
|
|
74
|
+
end
|
|
75
|
+
|
|
38
76
|
def train
|
|
39
77
|
config = NanoGPT::TrainConfig.load(@args)
|
|
40
78
|
|
|
@@ -318,6 +356,7 @@ class NanoGPTCLI
|
|
|
318
356
|
Usage: nanogpt <command> [options]
|
|
319
357
|
|
|
320
358
|
Commands:
|
|
359
|
+
prepare Download and prepare a dataset
|
|
321
360
|
train Train a GPT model
|
|
322
361
|
sample Generate text from a trained model
|
|
323
362
|
bench Run performance benchmarks
|
|
@@ -325,9 +364,9 @@ class NanoGPTCLI
|
|
|
325
364
|
help Show this help message
|
|
326
365
|
|
|
327
366
|
Examples:
|
|
328
|
-
nanogpt
|
|
329
|
-
nanogpt train --dataset=shakespeare_char --
|
|
330
|
-
nanogpt sample --
|
|
367
|
+
nanogpt prepare shakespeare_char
|
|
368
|
+
nanogpt train --dataset=shakespeare_char --device=mps
|
|
369
|
+
nanogpt sample --dataset=shakespeare_char --num_samples=3
|
|
331
370
|
nanogpt bench --batch_size=8 --block_size=512
|
|
332
371
|
|
|
333
372
|
For more information, visit: https://github.com/khasinski/nanogpt-rb
|
data/lib/nano_gpt/version.rb
CHANGED