nanogpt 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data/openwebtext/prepare.rb +13 -4
- data/data/shakespeare/prepare.rb +6 -2
- data/data/shakespeare_char/prepare.rb +7 -3
- data/exe/nanogpt +5 -0
- data/lib/nano_gpt/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cf308fcec8ccec074200361b2327a2381a5412b92497391bb71ec5f154cd1283
|
|
4
|
+
data.tar.gz: 87dcc389df03af0ac59fc0e75bbef7be7071f00613a2662cecf365ba38bc853e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2b6ceeb10236b639c82398c94d3c1a876eff549a17289ff45abae489e480a3d6a1db45f95a4b2e56f1d1df6afde28159965b188342d1e38c2482359dfb11e061
|
|
7
|
+
data.tar.gz: 0fd4653c2719d1d3c339a904437fd0657f17e49fa0a9d4361a3a259806fab8dc798ed7f179722b41913d3471e8799abf78823f207988987d86cba680d7f90f03
|
data/data/openwebtext/prepare.rb
CHANGED
|
@@ -33,7 +33,10 @@ require "rubygems/package"
|
|
|
33
33
|
require "zlib"
|
|
34
34
|
|
|
35
35
|
SCRIPT_DIR = File.dirname(__FILE__)
|
|
36
|
-
|
|
36
|
+
OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
|
|
37
|
+
RAW_DIR = File.join(OUTPUT_DIR, "raw")
|
|
38
|
+
|
|
39
|
+
FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
|
|
37
40
|
DEFAULT_VAL_RATIO = 0.0005 # ~0.5% for validation
|
|
38
41
|
|
|
39
42
|
def parse_args
|
|
@@ -60,11 +63,17 @@ def parse_args
|
|
|
60
63
|
end
|
|
61
64
|
|
|
62
65
|
def find_data_files
|
|
63
|
-
# Look for various supported formats
|
|
66
|
+
# Look for various supported formats in both SCRIPT_DIR (gem) and OUTPUT_DIR (local)
|
|
64
67
|
patterns = [
|
|
65
68
|
File.join(RAW_DIR, "**", "*.parquet"), # Parquet (from Python export)
|
|
66
69
|
File.join(RAW_DIR, "**", "*.tar"), # Original tar files
|
|
67
70
|
File.join(RAW_DIR, "**", "*.txt"), # Plain text files
|
|
71
|
+
File.join(OUTPUT_DIR, "*.parquet"),
|
|
72
|
+
File.join(OUTPUT_DIR, "*.tar"),
|
|
73
|
+
File.join(OUTPUT_DIR, "*.txt"),
|
|
74
|
+
File.join(SCRIPT_DIR, "raw", "**", "*.parquet"),
|
|
75
|
+
File.join(SCRIPT_DIR, "raw", "**", "*.tar"),
|
|
76
|
+
File.join(SCRIPT_DIR, "raw", "**", "*.txt"),
|
|
68
77
|
File.join(SCRIPT_DIR, "*.parquet"),
|
|
69
78
|
File.join(SCRIPT_DIR, "*.tar"),
|
|
70
79
|
File.join(SCRIPT_DIR, "*.txt")
|
|
@@ -273,8 +282,8 @@ def main
|
|
|
273
282
|
|
|
274
283
|
# Write binary files
|
|
275
284
|
puts "Writing binary files..."
|
|
276
|
-
write_binary(train_tokens, File.join(
|
|
277
|
-
write_binary(val_tokens, File.join(
|
|
285
|
+
write_binary(train_tokens, File.join(OUTPUT_DIR, "train.bin"))
|
|
286
|
+
write_binary(val_tokens, File.join(OUTPUT_DIR, "val.bin"))
|
|
278
287
|
|
|
279
288
|
puts ""
|
|
280
289
|
puts "Done! OpenWebText dataset prepared."
|
data/data/shakespeare/prepare.rb
CHANGED
|
@@ -11,9 +11,13 @@ require "net/http"
|
|
|
11
11
|
require "openssl"
|
|
12
12
|
require "numo/narray"
|
|
13
13
|
require "tiktoken_ruby"
|
|
14
|
+
require "fileutils"
|
|
14
15
|
|
|
15
16
|
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
|
|
16
17
|
SCRIPT_DIR = File.dirname(__FILE__)
|
|
18
|
+
OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
|
|
19
|
+
|
|
20
|
+
FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
|
|
17
21
|
|
|
18
22
|
def download_file(url)
|
|
19
23
|
uri = URI(url)
|
|
@@ -53,8 +57,8 @@ puts "Val has #{val_ids.length} tokens"
|
|
|
53
57
|
# Export to binary files (uint16)
|
|
54
58
|
train_arr = Numo::UInt16.cast(train_ids)
|
|
55
59
|
val_arr = Numo::UInt16.cast(val_ids)
|
|
56
|
-
File.binwrite(File.join(
|
|
57
|
-
File.binwrite(File.join(
|
|
60
|
+
File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
|
|
61
|
+
File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
|
|
58
62
|
|
|
59
63
|
# No meta.json - indicates GPT-2 BPE tokenizer should be used
|
|
60
64
|
puts "Done! Created train.bin and val.bin"
|
data/data/shakespeare_char/prepare.rb
CHANGED
|
@@ -8,9 +8,13 @@ require "net/http"
|
|
|
8
8
|
require "openssl"
|
|
9
9
|
require "numo/narray"
|
|
10
10
|
require "json"
|
|
11
|
+
require "fileutils"
|
|
11
12
|
|
|
12
13
|
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
|
|
13
14
|
SCRIPT_DIR = File.dirname(__FILE__)
|
|
15
|
+
OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
|
|
16
|
+
|
|
17
|
+
FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
|
|
14
18
|
|
|
15
19
|
def download_file(url)
|
|
16
20
|
uri = URI(url)
|
|
@@ -59,8 +63,8 @@ puts "Val has #{val_ids.length} tokens"
|
|
|
59
63
|
# Export to binary files (uint16)
|
|
60
64
|
train_arr = Numo::UInt16.cast(train_ids)
|
|
61
65
|
val_arr = Numo::UInt16.cast(val_ids)
|
|
62
|
-
File.binwrite(File.join(
|
|
63
|
-
File.binwrite(File.join(
|
|
66
|
+
File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
|
|
67
|
+
File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
|
|
64
68
|
|
|
65
69
|
# Save meta information as JSON
|
|
66
70
|
meta = {
|
|
@@ -68,6 +72,6 @@ meta = {
|
|
|
68
72
|
"itos" => itos.transform_keys(&:to_s),
|
|
69
73
|
"stoi" => stoi
|
|
70
74
|
}
|
|
71
|
-
File.write(File.join(
|
|
75
|
+
File.write(File.join(OUTPUT_DIR, "meta.json"), JSON.pretty_generate(meta))
|
|
72
76
|
|
|
73
77
|
puts "Done! Created train.bin, val.bin, and meta.json"
|
data/exe/nanogpt
CHANGED
|
@@ -64,7 +64,12 @@ class NanoGPTCLI
|
|
|
64
64
|
exit 1
|
|
65
65
|
end
|
|
66
66
|
|
|
67
|
+
# Set output directory to current working directory
|
|
68
|
+
output_dir = File.join(Dir.pwd, "data", dataset)
|
|
69
|
+
ENV["NANOGPT_DATA_DIR"] = output_dir
|
|
70
|
+
|
|
67
71
|
puts "Preparing dataset: #{dataset}"
|
|
72
|
+
puts "Output directory: #{output_dir}"
|
|
68
73
|
load prepare_script
|
|
69
74
|
end
|
|
70
75
|
|
data/lib/nano_gpt/version.rb
CHANGED