nanogpt 0.1.2 → 0.2.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +45 -0
- data/Gemfile.lock +2 -1
- data/README.md +51 -2
- data/exe/nanogpt +183 -0
- data/lib/nano_gpt/version.rb +1 -1
- metadata +4 -6
checksums.yaml
CHANGED

````diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c2d0853148f473bb23f4ffffaa5a7b9e45f3faff570cc0a85253d40b6b63b80a
+  data.tar.gz: 68f31e52460273d97d1a97de844fb79c9bfab22b939a897482b6def4c77bdecc
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 42a617a95e04d7727cc011a033ea039b8763a9103093dbec7ad614eb74a9a4f4ed818d82ffedeb8e627494dd73fd7950fdb30f97dd5ff63866123a0e892befc1
+  data.tar.gz: af2916e7179830cc91fc516be16b765e46a3c6f37cb732583900bf71591abf16d8bbdc250bbe07a0b83eac124a5ec5b7241684017b4c166ebe3fcb8611702592
````
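These digests can be checked against a locally fetched copy of the gem. A minimal verification sketch, assuming the package was downloaded with `gem fetch nanogpt --version 0.2.0` into the current directory (a `.gem` file is a plain tar archive whose `checksums.yaml.gz` entry records digests of the other entries):

```ruby
# Sketch: recompute the SHA256 of data.tar.gz inside the fetched gem
# and compare it with the value recorded in checksums.yaml above.
require "digest"
require "rubygems/package"

File.open("nanogpt-0.2.0.gem", "rb") do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless entry.full_name == "data.tar.gz"
    puts Digest::SHA256.hexdigest(entry.read)
  end
end
```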
data/CHANGELOG.md
ADDED

````diff
@@ -0,0 +1,45 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.2.0] - 2025-12-08
+
+### Added
+
+- **Custom text file training**: New `nanogpt prepare textfile <path>` command to train on any text file with character-level tokenization
+- Streams through large files without loading everything into memory
+- Auto-detects file encoding (UTF-8 or Windows-1252)
+- Configurable output directory name (`--output=NAME`)
+- Configurable train/validation split ratio (`--val_ratio=F`, default 0.1)
+- Updated README with documentation for training on custom text files
+
+## [0.1.2] - 2025-12-07
+
+### Fixed
+
+- Fixed `prepare` command to output files to current working directory
+
+## [0.1.1] - 2025-12-07
+
+### Fixed
+
+- Fixed typo in documentation
+
+## [0.1.0] - 2025-12-07
+
+### Added
+
+- Initial release
+- Full GPT-2 architecture implementation in Ruby
+- MPS (Metal) and CUDA GPU acceleration via torch.rb
+- Flash attention support when dropout=0
+- Character-level and GPT-2 BPE tokenizers
+- Cosine learning rate schedule with warmup
+- Gradient accumulation for larger effective batch sizes
+- Checkpointing and training resumption
+- Shakespeare character-level dataset
+- OpenWebText dataset support
+- CLI commands: `train`, `sample`, `bench`, `prepare`
````
data/Gemfile.lock
CHANGED
data/README.md
CHANGED

````diff
@@ -1,5 +1,7 @@
 # nanoGPT
 
+[](https://rubygems.org/gems/nanogpt)
+
 A Ruby port of Karpathy's [nanoGPT](https://github.com/karpathy/nanoGPT). Train GPT-2 style language models from scratch using [torch.rb](https://github.com/ankane/torch.rb).
 
 Built for Ruby developers who want to understand how LLMs work by building one.
@@ -13,7 +15,7 @@ gem install nanogpt
 nanogpt prepare shakespeare_char
 
 # Train (use MPS on Apple Silicon for 17x speedup)
-nanogpt train --dataset=shakespeare_char --device=mps
+nanogpt train --dataset=shakespeare_char --device=mps --max_iters=2000
 
 # Generate text
 nanogpt sample --dataset=shakespeare_char
@@ -30,7 +32,7 @@ bundle install
 bundle exec ruby data/shakespeare_char/prepare.rb
 
 # Train
-bundle exec exe/nanogpt train --dataset=shakespeare_char --device=mps
+bundle exec exe/nanogpt train --dataset=shakespeare_char --device=mps --max_iters=2000
 
 # Sample
 bundle exec exe/nanogpt sample --dataset=shakespeare_char
@@ -81,6 +83,53 @@ nanogpt bench [options]   # Run performance benchmarks
   --top_k=N               # Top-k sampling (default: 200)
 ```
 
+## Training on Your Own Text
+
+You can train on any text file using the `textfile` command:
+
+```bash
+# Prepare your text file (creates char-level tokenizer)
+nanogpt prepare textfile /path/to/mybook.txt --output=mybook
+
+# Train a model
+nanogpt train --dataset=mybook --device=mps --max_iters=2000
+
+# Generate text
+nanogpt sample --dataset=mybook --start="Once upon a time"
+```
+
+### Options
+
+```bash
+--output=NAME    # Output directory name (default: derived from filename)
+--val_ratio=F    # Validation split ratio (default: 0.1)
+```
+
+### Example: Training on a Novel
+
+```bash
+# Download a book
+curl -o lotr.txt "https://example.com/fellowship.txt"
+
+# Prepare (handles UTF-8 and Windows-1252 encodings)
+nanogpt prepare textfile lotr.txt --output=lotr
+
+# Train a larger model for better results
+nanogpt train --dataset=lotr --device=mps \
+  --max_iters=2000 \
+  --n_layer=6 --n_head=6 --n_embd=384 \
+  --block_size=256 --batch_size=32
+
+# Sample with a prompt
+nanogpt sample --dataset=lotr --start="Frodo" --max_new_tokens=500
+```
+
+The `textfile` command:
+- Streams through large files without loading everything into memory
+- Auto-detects encoding (UTF-8 or Windows-1252)
+- Creates a character-level vocabulary from your text
+- Splits into train/validation sets
+
 ## Features
 
 - Full GPT-2 architecture (attention, MLP, layer norm, embeddings)
````
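A quick way to sanity-check the prepared output is to decode a slice of `train.bin` back into text. A minimal sketch, assuming the `data/mybook` layout produced by the example above; per the implementation below, tokens are written as native-endian `Numo::UInt16` (two bytes each) and `meta.json` maps stringified token ids back to characters:

```ruby
# Sketch: read the first 200 tokens of a prepared dataset and decode them.
require "json"
require "numo/narray"

meta = JSON.parse(File.read("data/mybook/meta.json"))
itos = meta["itos"]                                    # token id (string) => character
raw  = File.binread("data/mybook/train.bin", 200 * 2)  # 2 bytes per UInt16 token
tokens = Numo::UInt16.from_binary(raw)
puts tokens.to_a.map { |t| itos[t.to_s] }.join
```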
data/exe/nanogpt
CHANGED

````diff
@@ -48,12 +48,25 @@ class NanoGPTCLI
 
     if dataset.nil?
       puts "Usage: nanogpt prepare <dataset>"
+      puts "       nanogpt prepare textfile <path> [options]"
       puts ""
       puts "Available datasets:"
       available.each { |d| puts "  #{d}" }
+      puts ""
+      puts "Custom text file:"
+      puts "  textfile         Prepare a custom text file with char-level tokenization"
+      puts ""
+      puts "Textfile options:"
+      puts "  --output=NAME    Output directory name (default: derived from filename)"
+      puts "  --val_ratio=F    Validation split ratio (default: 0.1)"
       exit 1
     end
 
+    if dataset == "textfile"
+      prepare_textfile
+      return
+    end
+
     prepare_script = File.join(data_dir, dataset, "prepare.rb")
 
     unless File.exist?(prepare_script)
@@ -73,6 +86,176 @@ class NanoGPTCLI
     load prepare_script
   end
 
+  def prepare_textfile
+    require "numo/narray"
+    require "json"
+    require "fileutils"
+
+    input_path = nil
+    output_name = nil
+    val_ratio = 0.1
+
+    @args[1..].each do |arg|
+      if arg.start_with?("--output=")
+        output_name = arg.split("=", 2).last
+      elsif arg.start_with?("--val_ratio=")
+        val_ratio = arg.split("=", 2).last.to_f
+      elsif !arg.start_with?("--")
+        input_path = arg
+      end
+    end
+
+    if input_path.nil?
+      puts "Error: No input file specified"
+      puts ""
+      puts "Usage: nanogpt prepare textfile <path> [options]"
+      puts ""
+      puts "Options:"
+      puts "  --output=NAME    Output directory name (default: derived from filename)"
+      puts "  --val_ratio=F    Validation split ratio (default: 0.1)"
+      exit 1
+    end
+
+    unless File.exist?(input_path)
+      puts "Error: File not found: #{input_path}"
+      exit 1
+    end
+
+    output_name ||= File.basename(input_path, ".*").gsub(/[^a-zA-Z0-9_-]/, "_")
+    output_dir = File.join(Dir.pwd, "data", output_name)
+    FileUtils.mkdir_p(output_dir)
+
+    file_size = File.size(input_path)
+    puts "Preparing text file: #{input_path}"
+    puts "File size: #{(file_size / 1_000_000.0).round(2)} MB"
+    puts "Output directory: #{output_dir}"
+    puts "Validation ratio: #{val_ratio}"
+    puts ""
+
+    # Phase 1: Build vocabulary by reading entire file
+    # For very large files, we read line by line to avoid memory issues
+    puts "Phase 1: Building vocabulary..."
+    char_set = Set.new
+    char_count = 0
+
+    # Detect encoding: check if file is valid UTF-8, otherwise assume Windows-1252
+    sample = File.binread(input_path, 100_000)
+    encoding = sample.force_encoding("UTF-8").valid_encoding? ? "UTF-8" : "Windows-1252:UTF-8"
+    puts "  Detected encoding: #{encoding.split(':').first}"
+
+    File.foreach(input_path, encoding: encoding) do |line|
+      line.each_char { |c| char_set.add(c) }
+      char_count += line.length
+      print "\r  Scanned #{char_count} characters, #{char_set.size} unique..." if (char_count % 100_000) < 1000
+    end
+    puts "\r  Scanned #{char_count} characters, #{char_set.size} unique..."
+
+    chars = char_set.to_a.sort
+    vocab_size = chars.size
+    puts "Vocabulary size: #{vocab_size}"
+
+    stoi = chars.each_with_index.to_h
+    itos = chars.each_with_index.map { |c, i| [i, c] }.to_h
+
+    # Phase 2: Calculate split point
+    total_chars = char_count
+    val_chars = (total_chars * val_ratio).to_i
+    train_chars = total_chars - val_chars
+    puts ""
+    puts "Train: #{train_chars} characters"
+    puts "Val: #{val_chars} characters"
+
+    # Phase 3: Encode and write train.bin (streaming line by line)
+    puts ""
+    puts "Phase 2: Encoding and writing train.bin..."
+    train_path = File.join(output_dir, "train.bin")
+    chars_written = 0
+    buffer = []
+    buffer_size = 100_000
+
+    File.open(train_path, "wb") do |output|
+      File.foreach(input_path, encoding: encoding) do |line|
+        line.each_char do |c|
+          break if chars_written >= train_chars
+
+          buffer << stoi[c]
+          chars_written += 1
+
+          if buffer.size >= buffer_size
+            arr = Numo::UInt16.cast(buffer)
+            output.write(arr.to_binary)
+            buffer.clear
+            print "\r  Written #{chars_written}/#{train_chars} characters..."
+          end
+        end
+        break if chars_written >= train_chars
+      end
+
+      unless buffer.empty?
+        arr = Numo::UInt16.cast(buffer)
+        output.write(arr.to_binary)
+        buffer.clear
+      end
+    end
+    puts ""
+
+    # Phase 4: Encode and write val.bin (streaming line by line)
+    puts "Phase 3: Encoding and writing val.bin..."
+    val_path = File.join(output_dir, "val.bin")
+    chars_written = 0
+    skipped = 0
+    buffer = []
+
+    File.open(val_path, "wb") do |output|
+      File.foreach(input_path, encoding: encoding) do |line|
+        line.each_char do |c|
+          if skipped < train_chars
+            skipped += 1
+            next
+          end
+
+          buffer << stoi[c]
+          chars_written += 1
+
+          if buffer.size >= buffer_size
+            arr = Numo::UInt16.cast(buffer)
+            output.write(arr.to_binary)
+            buffer.clear
+            print "\r  Written #{chars_written}/#{val_chars} characters..."
+          end
+        end
+      end
+
+      unless buffer.empty?
+        arr = Numo::UInt16.cast(buffer)
+        output.write(arr.to_binary)
+        buffer.clear
+      end
+    end
+    puts ""
+
+    # Phase 5: Save meta.json
+    puts "Phase 4: Saving meta.json..."
+    meta = {
+      "vocab_size" => vocab_size,
+      "itos" => itos.transform_keys(&:to_s),
+      "stoi" => stoi
+    }
+    File.write(File.join(output_dir, "meta.json"), JSON.pretty_generate(meta))
+
+    train_size_mb = File.size(train_path) / 1_000_000.0
+    val_size_mb = File.size(val_path) / 1_000_000.0
+
+    puts ""
+    puts "Done!"
+    puts "  train.bin: #{train_chars} tokens (#{train_size_mb.round(2)} MB)"
+    puts "  val.bin: #{val_chars} tokens (#{val_size_mb.round(2)} MB)"
+    puts "  meta.json: vocab_size=#{vocab_size}"
+    puts ""
+    puts "To train:"
+    puts "  nanogpt train --dataset=#{output_name}"
+  end
+
   def train
     config = NanoGPT::TrainConfig.load(@args)
 
````
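One limit implied by the implementation above: because tokens are serialized with `Numo::UInt16`, the character vocabulary can hold at most 65,536 entries. Ordinary prose stays far below this, but a hypothetical guard (not part of the released code, shown only to make the constraint explicit) could look like:

```ruby
# Hypothetical guard, not in the diff: UInt16 token ids span 0..65_535,
# so a vocabulary larger than 65_536 characters cannot be encoded.
if vocab_size > 65_536
  abort "Vocabulary too large for UInt16 storage: #{vocab_size} unique characters"
end
```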
data/lib/nano_gpt/version.rb
CHANGED
metadata
CHANGED

````diff
@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: nanogpt
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Chris Hasiński
-autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: torch-rb
@@ -75,6 +74,7 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- CHANGELOG.md
 - Gemfile
 - Gemfile.lock
 - README.md
@@ -111,7 +111,6 @@ metadata:
   homepage_uri: https://github.com/khasinski/nanogpt-rb
   source_code_uri: https://github.com/khasinski/nanogpt-rb
   changelog_uri: https://github.com/khasinski/nanogpt-rb/blob/main/CHANGELOG.md
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -126,8 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
 summary: A Ruby port of Karpathy's nanoGPT
 test_files: []
````