nanogpt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/openwebtext/prepare.rb
@@ -0,0 +1,287 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ $stdout.sync = true
+
+ # Prepare the OpenWebText dataset for GPT-2 style language modeling.
+ # Uses GPT-2 BPE tokenization via tiktoken (vocab_size=50257).
+ #
+ # The full dataset is ~9B tokens (~18GB for train.bin at 2 bytes per token).
+ #
+ # Usage:
+ #   # Download dataset first (requires Python datasets library):
+ #   pip install datasets
+ #   python -c "from datasets import load_dataset; ds = load_dataset('Skylion007/openwebtext', split='train'); ds.to_parquet('data/openwebtext/raw/train.parquet')"
+ #
+ #   # Or download individual tar files from HuggingFace:
+ #   # https://huggingface.co/datasets/Skylion007/openwebtext/tree/main/subsets
+ #
+ #   # Then process:
+ #   bundle exec ruby data/openwebtext/prepare.rb
+ #
+ #   # Process only first N documents (for testing):
+ #   bundle exec ruby data/openwebtext/prepare.rb --max_docs=10000
+ #
+ # Options:
+ #   --max_docs=N    Only process first N documents (for testing)
+ #   --val_ratio=F   Validation split ratio (default: 0.0005)
+
+ require "numo/narray"
+ require "tiktoken_ruby"
+ require "fileutils"
+ require "rubygems/package"
+ require "zlib"
+
+ SCRIPT_DIR = File.dirname(__FILE__)
+ RAW_DIR = File.join(SCRIPT_DIR, "raw")
+ DEFAULT_VAL_RATIO = 0.0005 # 0.05% for validation
+
+ def parse_args
+   config = {
+     max_docs: nil,
+     val_ratio: DEFAULT_VAL_RATIO
+   }
+
+   ARGV.each do |arg|
+     next unless arg.start_with?("--") && arg.include?("=")
+
+     key, val = arg[2..].split("=", 2)
+     key = key.to_sym
+
+     case key
+     when :max_docs
+       config[:max_docs] = val.to_i
+     when :val_ratio
+       config[:val_ratio] = val.to_f
+     end
+   end
+
+   config
+ end
+
+ def find_data_files
+   # Look for various supported formats
+   patterns = [
+     File.join(RAW_DIR, "**", "*.parquet"), # Parquet (from Python export)
+     File.join(RAW_DIR, "**", "*.tar"),     # Original tar files
+     File.join(RAW_DIR, "**", "*.txt"),     # Plain text files
+     File.join(SCRIPT_DIR, "*.parquet"),
+     File.join(SCRIPT_DIR, "*.tar"),
+     File.join(SCRIPT_DIR, "*.txt")
+   ]
+
+   files = patterns.flat_map { |p| Dir.glob(p) }.uniq.sort
+   files
+ end
+
+ def process_parquet_files(files, enc, eot, max_docs)
+   require "parquet"
+
+   tokens = []
+   doc_count = 0
+
+   files.select { |f| f.end_with?(".parquet") }.each do |path|
+     puts "Processing parquet: #{File.basename(path)}..."
+
+     Parquet.each_row(path) do |row|
+       break if max_docs && doc_count >= max_docs
+
+       text = row["text"]
+       next if text.nil? || text.empty?
+
+       doc_tokens = enc.encode(text)
+       doc_tokens << eot
+       tokens.concat(doc_tokens)
+       doc_count += 1
+
+       if doc_count % 10_000 == 0
+         print "\r Processed #{doc_count} documents, #{tokens.length} tokens..."
+       end
+     end
+
+     puts "\r Processed #{doc_count} documents, #{tokens.length} tokens "
+     break if max_docs && doc_count >= max_docs
+   end
+
+   [tokens, doc_count]
+ end
+
+ def process_tar_files(files, enc, eot, max_docs)
+   tokens = []
+   doc_count = 0
+
+   files.select { |f| f.end_with?(".tar") }.each do |tar_path|
+     puts "Processing tar: #{File.basename(tar_path)}..."
+
+     File.open(tar_path, "rb") do |file|
+       Gem::Package::TarReader.new(file) do |tar|
+         tar.each do |entry|
+           break if max_docs && doc_count >= max_docs
+           next unless entry.file? && entry.full_name.end_with?(".txt", ".xz")
+
+           # Read content (.xz entries are skipped below)
+           content = entry.read
+           if entry.full_name.end_with?(".xz")
+             # Skip xz files - would need xz gem
+             next
+           end
+
+           text = content.force_encoding("UTF-8")
+           next if text.empty?
+
+           doc_tokens = enc.encode(text)
+           doc_tokens << eot
+           tokens.concat(doc_tokens)
+           doc_count += 1
+
+           if doc_count % 1_000 == 0
+             print "\r Processed #{doc_count} documents, #{tokens.length} tokens..."
+           end
+         end
+       end
+     end
+
+     puts "\r Processed #{doc_count} documents, #{tokens.length} tokens "
+     break if max_docs && doc_count >= max_docs
+   end
+
+   [tokens, doc_count]
+ end
+
+ def process_txt_files(files, enc, eot, max_docs)
+   tokens = []
+   doc_count = 0
+
+   files.select { |f| f.end_with?(".txt") }.each do |path|
+     break if max_docs && doc_count >= max_docs
+
+     puts "Processing text: #{File.basename(path)}..."
+     text = File.read(path, encoding: "UTF-8")
+     next if text.empty?
+
+     doc_tokens = enc.encode(text)
+     doc_tokens << eot
+     tokens.concat(doc_tokens)
+     doc_count += 1
+   end
+
+   [tokens, doc_count]
+ end
+
+ def write_binary(tokens, path)
+   arr = Numo::UInt16.cast(tokens)
+   File.binwrite(path, arr.to_binary)
+   size_mb = File.size(path) / 1_000_000.0
+   puts " Wrote #{path} (#{tokens.length} tokens, #{size_mb.round(1)}MB)"
+ end
+
+ def main
+   config = parse_args
+   puts "OpenWebText Data Preparation"
+   puts "=" * 50
+   puts "Config: #{config}"
+   puts ""
+
+   # Initialize tokenizer
+   enc = Tiktoken.get_encoding(:r50k_base)
+   eot = enc.encode("<|endoftext|>").first
+   puts "Using GPT-2 BPE tokenizer (vocab_size=50257, EOT=#{eot})"
+   puts ""
+
+   # Find data files
+   files = find_data_files
+
+   if files.empty?
+     puts "No data files found!"
+     puts ""
+     puts "Please download the OpenWebText dataset first:"
+     puts ""
+     puts "Option 1: Using Python datasets (recommended):"
+     puts " pip install datasets"
+     puts " python -c \"\""
+     puts " from datasets import load_dataset"
+     puts " ds = load_dataset('Skylion007/openwebtext', split='train')"
+     puts " ds.to_parquet('data/openwebtext/raw/train.parquet')"
+     puts " \"\""
+     puts ""
+     puts "Option 2: Download tar files from HuggingFace:"
+     puts " https://huggingface.co/datasets/Skylion007/openwebtext/tree/main/subsets"
+     puts " Place them in: data/openwebtext/raw/"
+     puts ""
+     puts "Option 3: Place plain text files in data/openwebtext/raw/"
+     exit 1
+   end
+
+   puts "Found #{files.length} data files:"
+   files.each { |f| puts " - #{File.basename(f)}" }
+   puts ""
+
+   # Process files by type
+   all_tokens = []
+   total_docs = 0
+
+   # Try parquet first (most efficient)
+   parquet_files = files.select { |f| f.end_with?(".parquet") }
+   if parquet_files.any?
+     tokens, docs = process_parquet_files(parquet_files, enc, eot, config[:max_docs])
+     all_tokens.concat(tokens)
+     total_docs += docs
+   end
+
+   # Then tar files
+   remaining = config[:max_docs] ? config[:max_docs] - total_docs : nil
+   if remaining.nil? || remaining > 0
+     tar_files = files.select { |f| f.end_with?(".tar") }
+     if tar_files.any?
+       tokens, docs = process_tar_files(tar_files, enc, eot, remaining)
+       all_tokens.concat(tokens)
+       total_docs += docs
+     end
+   end
+
+   # Finally plain text
+   remaining = config[:max_docs] ? config[:max_docs] - total_docs : nil
+   if remaining.nil? || remaining > 0
+     txt_files = files.select { |f| f.end_with?(".txt") }
+     if txt_files.any?
+       tokens, docs = process_txt_files(txt_files, enc, eot, remaining)
+       all_tokens.concat(tokens)
+       total_docs += docs
+     end
+   end
+
+   if all_tokens.empty?
+     puts "Error: No tokens extracted from data files"
+     exit 1
+   end
+
+   puts ""
+   puts "=" * 50
+   puts "Total: #{total_docs} documents, #{all_tokens.length} tokens"
+   puts ""
+
+   # Split into train/val
+   val_size = (all_tokens.length * config[:val_ratio]).to_i
+   val_size = [val_size, 1].max # At least 1 token for val
+
+   train_tokens = all_tokens[0...-val_size]
+   val_tokens = all_tokens[-val_size..]
+
+   puts "Train: #{train_tokens.length} tokens"
+   puts "Val: #{val_tokens.length} tokens"
+   puts ""
+
+   # Write binary files
+   puts "Writing binary files..."
+   write_binary(train_tokens, File.join(SCRIPT_DIR, "train.bin"))
+   write_binary(val_tokens, File.join(SCRIPT_DIR, "val.bin"))
+
+   puts ""
+   puts "Done! OpenWebText dataset prepared."
+   puts "Note: No meta.json (uses GPT-2 BPE tokenizer, vocab_size=50257)"
+   puts ""
+   puts "To train:"
+   puts " bundle exec ruby bin/train --dataset=openwebtext"
+ end
+
+ main
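
Note on the output format: the script above writes the token ids as a raw Numo::UInt16 stream via to_binary, with no header and in native byte order, which works because the GPT-2 vocabulary (50257) fits in 16 bits. With the default --val_ratio=0.0005, a ~9B-token run places roughly 4.5M tokens in val.bin. A minimal read-back sketch under those assumptions, using the numo-narray gem (the path below is illustrative, not part of the package):

  require "numo/narray"

  # Illustrative path - wherever prepare.rb wrote its output.
  raw = File.binread("data/openwebtext/train.bin")
  tokens = Numo::UInt16.from_binary(raw) # inverse of the to_binary call in write_binary
  puts "#{tokens.size} tokens, first 10: #{tokens[0...10].to_a.inspect}"
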
data/shakespeare/prepare.rb
@@ -0,0 +1,61 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ # Prepare the Shakespeare dataset with GPT-2 BPE tokenization.
+ # Downloads the tiny shakespeare dataset and creates train.bin and val.bin
+ # using GPT-2's BPE tokenizer (vocab_size=50257).
+ #
+ # Usage: bundle exec ruby data/shakespeare/prepare.rb
+
+ require "net/http"
+ require "openssl"
+ require "numo/narray"
+ require "tiktoken_ruby"
+
+ DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
+ SCRIPT_DIR = File.dirname(__FILE__)
+
+ def download_file(url)
+   uri = URI(url)
+   http = Net::HTTP.new(uri.host, uri.port)
+   http.use_ssl = true
+   http.verify_mode = OpenSSL::SSL::VERIFY_NONE # GitHub CDN has CRL issues
+   response = http.get(uri.request_uri)
+   response.body
+ end
+
+ # Download the dataset if it isn't already present
+ input_path = File.join(SCRIPT_DIR, "input.txt")
+ unless File.exist?(input_path)
+   puts "Downloading tiny shakespeare..."
+   data = download_file(DATA_URL)
+   File.write(input_path, data)
+ end
+
+ data = File.read(input_path)
+ puts "Length of dataset in characters: #{data.length}"
+
+ # Initialize GPT-2 BPE tokenizer
+ enc = Tiktoken.get_encoding(:r50k_base)
+ puts "Using GPT-2 BPE tokenizer (vocab_size=50257)"
+
+ # Train/val split (90/10)
+ n = data.length
+ train_data = data[0...(n * 0.9).to_i]
+ val_data = data[(n * 0.9).to_i..]
+
+ # Encode to integers using BPE
+ train_ids = enc.encode(train_data)
+ val_ids = enc.encode(val_data)
+ puts "Train has #{train_ids.length} tokens"
+ puts "Val has #{val_ids.length} tokens"
+
+ # Export to binary files (uint16)
+ train_arr = Numo::UInt16.cast(train_ids)
+ val_arr = Numo::UInt16.cast(val_ids)
+ File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
+ File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
+
+ # No meta.json - indicates GPT-2 BPE tokenizer should be used
+ puts "Done! Created train.bin and val.bin"
+ puts "Note: No meta.json (uses GPT-2 BPE tokenizer, vocab_size=50257)"