nanogpt 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +42 -0
- data/README.md +102 -0
- data/bin/bench +210 -0
- data/bin/sample +76 -0
- data/bin/train +82 -0
- data/config/train_gpt2.json +19 -0
- data/config/train_shakespeare.json +14 -0
- data/config/train_shakespeare_char.json +14 -0
- data/data/openwebtext/prepare.rb +287 -0
- data/data/shakespeare/prepare.rb +61 -0
- data/data/shakespeare_char/input.txt +40000 -0
- data/data/shakespeare_char/prepare.rb +73 -0
- data/exe/nanogpt +338 -0
- data/lib/nano_gpt/config.rb +42 -0
- data/lib/nano_gpt/data_loader.rb +74 -0
- data/lib/nano_gpt/device.rb +56 -0
- data/lib/nano_gpt/layers/block.rb +25 -0
- data/lib/nano_gpt/layers/causal_self_attention.rb +73 -0
- data/lib/nano_gpt/layers/layer_norm.rb +21 -0
- data/lib/nano_gpt/layers/mlp.rb +23 -0
- data/lib/nano_gpt/lr_scheduler.rb +42 -0
- data/lib/nano_gpt/model.rb +218 -0
- data/lib/nano_gpt/tokenizer.rb +106 -0
- data/lib/nano_gpt/train_config.rb +259 -0
- data/lib/nano_gpt/trainer.rb +221 -0
- data/lib/nano_gpt/version.rb +5 -0
- data/lib/nano_gpt.rb +18 -0
- data/nanogpt.gemspec +37 -0
- metadata +133 -0
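
Read together, the prepare scripts, configs, and executables in this release suggest the usual nanoGPT-style flow: tokenize a dataset into train.bin/val.bin, then point the trainer at it. The commands below are quoted from the usage notes and final output of the prepare scripts shown in this diff; the flags for bin/sample and bin/bench are not visible here, so this is only a plausible sketch of the workflow, not documentation of the gem.

  bundle exec ruby data/shakespeare/prepare.rb
  bundle exec ruby data/openwebtext/prepare.rb --max_docs=10000
  bundle exec ruby bin/train --dataset=openwebtext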
@@ -0,0 +1,287 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$stdout.sync = true
+
+# Prepare the OpenWebText dataset for GPT-2 style language modeling.
+# Uses GPT-2 BPE tokenization via tiktoken (vocab_size=50257).
+#
+# The full dataset is ~9B tokens (~54GB for train.bin).
+#
+# Usage:
+#   # Download dataset first (requires Python datasets library):
+#   pip install datasets
+#   python -c "from datasets import load_dataset; ds = load_dataset('Skylion007/openwebtext', split='train'); ds.to_parquet('data/openwebtext/raw/train.parquet')"
+#
+#   # Or download individual tar files from HuggingFace:
+#   # https://huggingface.co/datasets/Skylion007/openwebtext/tree/main/subsets
+#
+#   # Then process:
+#   bundle exec ruby data/openwebtext/prepare.rb
+#
+#   # Process only first N documents (for testing):
+#   bundle exec ruby data/openwebtext/prepare.rb --max_docs=10000
+#
+# Options:
+#   --max_docs=N    Only process first N documents (for testing)
+#   --val_ratio=F   Validation split ratio (default: 0.0005)
+
+require "numo/narray"
+require "tiktoken_ruby"
+require "fileutils"
+require "rubygems/package"
+require "zlib"
+
+SCRIPT_DIR = File.dirname(__FILE__)
+RAW_DIR = File.join(SCRIPT_DIR, "raw")
+DEFAULT_VAL_RATIO = 0.0005 # ~0.05% for validation
+
+def parse_args
+  config = {
+    max_docs: nil,
+    val_ratio: DEFAULT_VAL_RATIO
+  }
+
+  ARGV.each do |arg|
+    next unless arg.start_with?("--") && arg.include?("=")
+
+    key, val = arg[2..].split("=", 2)
+    key = key.to_sym
+
+    case key
+    when :max_docs
+      config[:max_docs] = val.to_i
+    when :val_ratio
+      config[:val_ratio] = val.to_f
+    end
+  end
+
+  config
+end
+
+def find_data_files
+  # Look for various supported formats
+  patterns = [
+    File.join(RAW_DIR, "**", "*.parquet"), # Parquet (from Python export)
+    File.join(RAW_DIR, "**", "*.tar"),     # Original tar files
+    File.join(RAW_DIR, "**", "*.txt"),     # Plain text files
+    File.join(SCRIPT_DIR, "*.parquet"),
+    File.join(SCRIPT_DIR, "*.tar"),
+    File.join(SCRIPT_DIR, "*.txt")
+  ]
+
+  files = patterns.flat_map { |p| Dir.glob(p) }.uniq.sort
+  files
+end
+
+def process_parquet_files(files, enc, eot, max_docs)
+  require "parquet"
+
+  tokens = []
+  doc_count = 0
+
+  files.select { |f| f.end_with?(".parquet") }.each do |path|
+    puts "Processing parquet: #{File.basename(path)}..."
+
+    Parquet.each_row(path) do |row|
+      break if max_docs && doc_count >= max_docs
+
+      text = row["text"]
+      next if text.nil? || text.empty?
+
+      doc_tokens = enc.encode(text)
+      doc_tokens << eot
+      tokens.concat(doc_tokens)
+      doc_count += 1
+
+      if doc_count % 10_000 == 0
+        print "\r Processed #{doc_count} documents, #{tokens.length} tokens..."
+      end
+    end
+
+    puts "\r Processed #{doc_count} documents, #{tokens.length} tokens "
+    break if max_docs && doc_count >= max_docs
+  end
+
+  [tokens, doc_count]
+end
+
+def process_tar_files(files, enc, eot, max_docs)
+  tokens = []
+  doc_count = 0
+
+  files.select { |f| f.end_with?(".tar") }.each do |tar_path|
+    puts "Processing tar: #{File.basename(tar_path)}..."
+
+    File.open(tar_path, "rb") do |file|
+      Gem::Package::TarReader.new(file) do |tar|
+        tar.each do |entry|
+          break if max_docs && doc_count >= max_docs
+          next unless entry.file? && entry.full_name.end_with?(".txt", ".xz")
+
+          # Read content (decompress if .xz)
+          content = entry.read
+          if entry.full_name.end_with?(".xz")
+            # Skip xz files - would need xz gem
+            next
+          end
+
+          text = content.force_encoding("UTF-8")
+          next if text.empty?
+
+          doc_tokens = enc.encode(text)
+          doc_tokens << eot
+          tokens.concat(doc_tokens)
+          doc_count += 1
+
+          if doc_count % 1_000 == 0
+            print "\r Processed #{doc_count} documents, #{tokens.length} tokens..."
+          end
+        end
+      end
+    end
+
+    puts "\r Processed #{doc_count} documents, #{tokens.length} tokens "
+    break if max_docs && doc_count >= max_docs
+  end
+
+  [tokens, doc_count]
+end
+
+def process_txt_files(files, enc, eot, max_docs)
+  tokens = []
+  doc_count = 0
+
+  files.select { |f| f.end_with?(".txt") }.each do |path|
+    break if max_docs && doc_count >= max_docs
+
+    puts "Processing text: #{File.basename(path)}..."
+    text = File.read(path, encoding: "UTF-8")
+    next if text.empty?
+
+    doc_tokens = enc.encode(text)
+    doc_tokens << eot
+    tokens.concat(doc_tokens)
+    doc_count += 1
+  end
+
+  [tokens, doc_count]
+end
+
+def write_binary(tokens, path)
+  arr = Numo::UInt16.cast(tokens)
+  File.binwrite(path, arr.to_binary)
+  size_mb = File.size(path) / 1_000_000.0
+  puts " Wrote #{path} (#{tokens.length} tokens, #{size_mb.round(1)}MB)"
+end
+
+def main
+  config = parse_args
+  puts "OpenWebText Data Preparation"
+  puts "=" * 50
+  puts "Config: #{config}"
+  puts ""
+
+  # Initialize tokenizer
+  enc = Tiktoken.get_encoding(:r50k_base)
+  eot = enc.encode("<|endoftext|>").first
+  puts "Using GPT-2 BPE tokenizer (vocab_size=50257, EOT=#{eot})"
+  puts ""
+
+  # Find data files
+  files = find_data_files
+
+  if files.empty?
+    puts "No data files found!"
+    puts ""
+    puts "Please download the OpenWebText dataset first:"
+    puts ""
+    puts "Option 1: Using Python datasets (recommended):"
+    puts " pip install datasets"
+    puts " python -c \"\""
+    puts " from datasets import load_dataset"
+    puts " ds = load_dataset('Skylion007/openwebtext', split='train')"
+    puts " ds.to_parquet('data/openwebtext/raw/train.parquet')"
+    puts " \"\""
+    puts ""
+    puts "Option 2: Download tar files from HuggingFace:"
+    puts " https://huggingface.co/datasets/Skylion007/openwebtext/tree/main/subsets"
+    puts " Place them in: data/openwebtext/raw/"
+    puts ""
+    puts "Option 3: Place plain text files in data/openwebtext/raw/"
+    exit 1
+  end
+
+  puts "Found #{files.length} data files:"
+  files.each { |f| puts " - #{File.basename(f)}" }
+  puts ""
+
+  # Process files by type
+  all_tokens = []
+  total_docs = 0
+
+  # Try parquet first (most efficient)
+  parquet_files = files.select { |f| f.end_with?(".parquet") }
+  if parquet_files.any?
+    tokens, docs = process_parquet_files(parquet_files, enc, eot, config[:max_docs])
+    all_tokens.concat(tokens)
+    total_docs += docs
+  end
+
+  # Then tar files
+  remaining = config[:max_docs] ? config[:max_docs] - total_docs : nil
+  if remaining.nil? || remaining > 0
+    tar_files = files.select { |f| f.end_with?(".tar") }
+    if tar_files.any?
+      tokens, docs = process_tar_files(tar_files, enc, eot, remaining)
+      all_tokens.concat(tokens)
+      total_docs += docs
+    end
+  end
+
+  # Finally plain text
+  remaining = config[:max_docs] ? config[:max_docs] - total_docs : nil
+  if remaining.nil? || remaining > 0
+    txt_files = files.select { |f| f.end_with?(".txt") }
+    if txt_files.any?
+      tokens, docs = process_txt_files(txt_files, enc, eot, remaining)
+      all_tokens.concat(tokens)
+      total_docs += docs
+    end
+  end
+
+  if all_tokens.empty?
+    puts "Error: No tokens extracted from data files"
+    exit 1
+  end
+
+  puts ""
+  puts "=" * 50
+  puts "Total: #{total_docs} documents, #{all_tokens.length} tokens"
+  puts ""
+
+  # Split into train/val
+  val_size = (all_tokens.length * config[:val_ratio]).to_i
+  val_size = [val_size, 1].max # At least 1 token for val
+
+  train_tokens = all_tokens[0...-val_size]
+  val_tokens = all_tokens[-val_size..]
+
+  puts "Train: #{train_tokens.length} tokens"
+  puts "Val: #{val_tokens.length} tokens"
+  puts ""
+
+  # Write binary files
+  puts "Writing binary files..."
+  write_binary(train_tokens, File.join(SCRIPT_DIR, "train.bin"))
+  write_binary(val_tokens, File.join(SCRIPT_DIR, "val.bin"))
+
+  puts ""
+  puts "Done! OpenWebText dataset prepared."
+  puts "Note: No meta.json (uses GPT-2 BPE tokenizer, vocab_size=50257)"
+  puts ""
+  puts "To train:"
+  puts " bundle exec ruby bin/train --dataset=openwebtext"
+end
+
+main
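
Both prepare scripts serialize token ids as raw uint16 with Numo's to_binary, with no header, so reading the bins back is a one-liner. The sketch below is illustrative only and not part of the gem: the path and block_size are assumptions, and the released loader in data/lib/nano_gpt/data_loader.rb (not shown in this hunk) may do this differently.

  # Sketch: load train.bin back into a Numo array and cut one (x, y) training pair.
  require "numo/narray"

  bin_path = File.join("data", "openwebtext", "train.bin")   # assumed path
  ids = Numo::UInt16.from_binary(File.binread(bin_path))     # raw uint16 -> NArray
  puts "train.bin holds #{ids.size} tokens"

  block_size = 1024                                          # example value only
  offset = rand(ids.size - block_size - 1)
  x = ids[offset...(offset + block_size)]                    # context window
  y = ids[(offset + 1)...(offset + block_size + 1)]          # next-token targets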
@@ -0,0 +1,61 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Prepare the Shakespeare dataset with GPT-2 BPE tokenization.
+# Downloads the tiny shakespeare dataset and creates train.bin and val.bin
+# using GPT-2's BPE tokenizer (vocab_size=50257).
+#
+# Usage: bundle exec ruby data/shakespeare/prepare.rb
+
+require "net/http"
+require "openssl"
+require "numo/narray"
+require "tiktoken_ruby"
+
+DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
+SCRIPT_DIR = File.dirname(__FILE__)
+
+def download_file(url)
+  uri = URI(url)
+  http = Net::HTTP.new(uri.host, uri.port)
+  http.use_ssl = true
+  http.verify_mode = OpenSSL::SSL::VERIFY_NONE # GitHub CDN has CRL issues
+  response = http.get(uri.request_uri)
+  response.body
+end
+
+# Download data if not exists
+input_path = File.join(SCRIPT_DIR, "input.txt")
+unless File.exist?(input_path)
+  puts "Downloading tiny shakespeare..."
+  data = download_file(DATA_URL)
+  File.write(input_path, data)
+end
+
+data = File.read(input_path)
+puts "Length of dataset in characters: #{data.length}"
+
+# Initialize GPT-2 BPE tokenizer
+enc = Tiktoken.get_encoding(:r50k_base)
+puts "Using GPT-2 BPE tokenizer (vocab_size=50257)"
+
+# Train/val split (90/10)
+n = data.length
+train_data = data[0...(n * 0.9).to_i]
+val_data = data[(n * 0.9).to_i..]
+
+# Encode to integers using BPE
+train_ids = enc.encode(train_data)
+val_ids = enc.encode(val_data)
+puts "Train has #{train_ids.length} tokens"
+puts "Val has #{val_ids.length} tokens"
+
+# Export to binary files (uint16)
+train_arr = Numo::UInt16.cast(train_ids)
+val_arr = Numo::UInt16.cast(val_ids)
+File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
+File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
+
+# No meta.json - indicates GPT-2 BPE tokenizer should be used
+puts "Done! Created train.bin and val.bin"
+puts "Note: No meta.json (uses GPT-2 BPE tokenizer, vocab_size=50257)"