nanogpt 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b10e62747a63be10b519fdd8eb1eaabfc63214b01ab553d4230af865a0a8f327
4
- data.tar.gz: 69db126e3d02c897045e543981223d1d29d72f7625bd381e3ba4badee88b6d20
3
+ metadata.gz: cf308fcec8ccec074200361b2327a2381a5412b92497391bb71ec5f154cd1283
4
+ data.tar.gz: 87dcc389df03af0ac59fc0e75bbef7be7071f00613a2662cecf365ba38bc853e
5
5
  SHA512:
6
- metadata.gz: 97f5846373224b889f22b80302a05271c7ab0276037e519c19c58cc0954aa982a4d3ac9992766aa73f6d1eec1dfe2b96b8a750414dd02818529e37789671cd87
7
- data.tar.gz: accd2112564f004cc763dbaac418f35ce24426d4d86e53935a32bdf6411659463ed6cb8e9415146136ee78fa79748e39a1e506b5c44848a7b6e503dfee86e310
6
+ metadata.gz: 2b6ceeb10236b639c82398c94d3c1a876eff549a17289ff45abae489e480a3d6a1db45f95a4b2e56f1d1df6afde28159965b188342d1e38c2482359dfb11e061
7
+ data.tar.gz: 0fd4653c2719d1d3c339a904437fd0657f17e49fa0a9d4361a3a259806fab8dc798ed7f179722b41913d3471e8799abf78823f207988987d86cba680d7f90f03
@@ -33,7 +33,10 @@ require "rubygems/package"
33
33
  require "zlib"
34
34
 
35
35
  SCRIPT_DIR = File.dirname(__FILE__)
36
- RAW_DIR = File.join(SCRIPT_DIR, "raw")
36
+ OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
37
+ RAW_DIR = File.join(OUTPUT_DIR, "raw")
38
+
39
+ FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
37
40
  DEFAULT_VAL_RATIO = 0.0005 # ~0.5% for validation
38
41
 
39
42
  def parse_args
@@ -60,11 +63,17 @@ def parse_args
60
63
  end
61
64
 
62
65
  def find_data_files
63
- # Look for various supported formats
66
+ # Look for various supported formats in both SCRIPT_DIR (gem) and OUTPUT_DIR (local)
64
67
  patterns = [
65
68
  File.join(RAW_DIR, "**", "*.parquet"), # Parquet (from Python export)
66
69
  File.join(RAW_DIR, "**", "*.tar"), # Original tar files
67
70
  File.join(RAW_DIR, "**", "*.txt"), # Plain text files
71
+ File.join(OUTPUT_DIR, "*.parquet"),
72
+ File.join(OUTPUT_DIR, "*.tar"),
73
+ File.join(OUTPUT_DIR, "*.txt"),
74
+ File.join(SCRIPT_DIR, "raw", "**", "*.parquet"),
75
+ File.join(SCRIPT_DIR, "raw", "**", "*.tar"),
76
+ File.join(SCRIPT_DIR, "raw", "**", "*.txt"),
68
77
  File.join(SCRIPT_DIR, "*.parquet"),
69
78
  File.join(SCRIPT_DIR, "*.tar"),
70
79
  File.join(SCRIPT_DIR, "*.txt")
@@ -273,8 +282,8 @@ def main
273
282
 
274
283
  # Write binary files
275
284
  puts "Writing binary files..."
276
- write_binary(train_tokens, File.join(SCRIPT_DIR, "train.bin"))
277
- write_binary(val_tokens, File.join(SCRIPT_DIR, "val.bin"))
285
+ write_binary(train_tokens, File.join(OUTPUT_DIR, "train.bin"))
286
+ write_binary(val_tokens, File.join(OUTPUT_DIR, "val.bin"))
278
287
 
279
288
  puts ""
280
289
  puts "Done! OpenWebText dataset prepared."
@@ -11,9 +11,13 @@ require "net/http"
11
11
  require "openssl"
12
12
  require "numo/narray"
13
13
  require "tiktoken_ruby"
14
+ require "fileutils"
14
15
 
15
16
  DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
16
17
  SCRIPT_DIR = File.dirname(__FILE__)
18
+ OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
19
+
20
+ FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
17
21
 
18
22
  def download_file(url)
19
23
  uri = URI(url)
@@ -53,8 +57,8 @@ puts "Val has #{val_ids.length} tokens"
53
57
  # Export to binary files (uint16)
54
58
  train_arr = Numo::UInt16.cast(train_ids)
55
59
  val_arr = Numo::UInt16.cast(val_ids)
56
- File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
57
- File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
60
+ File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
61
+ File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
58
62
 
59
63
  # No meta.json - indicates GPT-2 BPE tokenizer should be used
60
64
  puts "Done! Created train.bin and val.bin"
@@ -8,9 +8,13 @@ require "net/http"
8
8
  require "openssl"
9
9
  require "numo/narray"
10
10
  require "json"
11
+ require "fileutils"
11
12
 
12
13
  DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
13
14
  SCRIPT_DIR = File.dirname(__FILE__)
15
+ OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
16
+
17
+ FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
14
18
 
15
19
  def download_file(url)
16
20
  uri = URI(url)
@@ -59,8 +63,8 @@ puts "Val has #{val_ids.length} tokens"
59
63
  # Export to binary files (uint16)
60
64
  train_arr = Numo::UInt16.cast(train_ids)
61
65
  val_arr = Numo::UInt16.cast(val_ids)
62
- File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
63
- File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
66
+ File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
67
+ File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
64
68
 
65
69
  # Save meta information as JSON
66
70
  meta = {
@@ -68,6 +72,6 @@ meta = {
68
72
  "itos" => itos.transform_keys(&:to_s),
69
73
  "stoi" => stoi
70
74
  }
71
- File.write(File.join(SCRIPT_DIR, "meta.json"), JSON.pretty_generate(meta))
75
+ File.write(File.join(OUTPUT_DIR, "meta.json"), JSON.pretty_generate(meta))
72
76
 
73
77
  puts "Done! Created train.bin, val.bin, and meta.json"
data/exe/nanogpt CHANGED
@@ -6,7 +6,7 @@ $stdout.sync = true
6
6
  require "nano_gpt"
7
7
 
8
8
  class NanoGPTCLI
9
- COMMANDS = %w[train sample bench version help].freeze
9
+ COMMANDS = %w[prepare train sample bench version help].freeze
10
10
 
11
11
  def initialize(args)
12
12
  @command = args.shift
@@ -15,6 +15,8 @@ class NanoGPTCLI
15
15
 
16
16
  def run
17
17
  case @command
18
+ when "prepare"
19
+ prepare
18
20
  when "train"
19
21
  train
20
22
  when "sample"
@@ -35,6 +37,42 @@ class NanoGPTCLI
35
37
 
36
38
  private
37
39
 
40
+ def prepare
41
+ dataset = @args.first
42
+
43
+ # Find available datasets
44
+ data_dir = File.join(File.dirname(__FILE__), "..", "data")
45
+ available = Dir.glob(File.join(data_dir, "*", "prepare.rb")).map do |path|
46
+ File.basename(File.dirname(path))
47
+ end.sort
48
+
49
+ if dataset.nil?
50
+ puts "Usage: nanogpt prepare <dataset>"
51
+ puts ""
52
+ puts "Available datasets:"
53
+ available.each { |d| puts " #{d}" }
54
+ exit 1
55
+ end
56
+
57
+ prepare_script = File.join(data_dir, dataset, "prepare.rb")
58
+
59
+ unless File.exist?(prepare_script)
60
+ puts "Error: Unknown dataset '#{dataset}'"
61
+ puts ""
62
+ puts "Available datasets:"
63
+ available.each { |d| puts " #{d}" }
64
+ exit 1
65
+ end
66
+
67
+ # Set output directory to current working directory
68
+ output_dir = File.join(Dir.pwd, "data", dataset)
69
+ ENV["NANOGPT_DATA_DIR"] = output_dir
70
+
71
+ puts "Preparing dataset: #{dataset}"
72
+ puts "Output directory: #{output_dir}"
73
+ load prepare_script
74
+ end
75
+
38
76
  def train
39
77
  config = NanoGPT::TrainConfig.load(@args)
40
78
 
@@ -318,6 +356,7 @@ class NanoGPTCLI
318
356
  Usage: nanogpt <command> [options]
319
357
 
320
358
  Commands:
359
+ prepare Download and prepare a dataset
321
360
  train Train a GPT model
322
361
  sample Generate text from a trained model
323
362
  bench Run performance benchmarks
@@ -325,9 +364,9 @@ class NanoGPTCLI
325
364
  help Show this help message
326
365
 
327
366
  Examples:
328
- nanogpt train --config=config/train_shakespeare_char.json
329
- nanogpt train --dataset=shakespeare_char --max_iters=1000
330
- nanogpt sample --out_dir=out-shakespeare-char --num_samples=3
367
+ nanogpt prepare shakespeare_char
368
+ nanogpt train --dataset=shakespeare_char --device=mps
369
+ nanogpt sample --dataset=shakespeare_char --num_samples=3
331
370
  nanogpt bench --batch_size=8 --block_size=512
332
371
 
333
372
  For more information, visit: https://github.com/khasinski/nanogpt-rb
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module NanoGPT
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nanogpt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Hasiński