nanogpt 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 17c3921412aa2d70e303ba68f979f5ccf9bb3e9b815ddfe3fbfbfcc62b7a3be6
4
- data.tar.gz: 1b0b97a5513a341ae93bd046a86df8237f3941d3cbe11dec480a6cb59c8ea597
3
+ metadata.gz: cf308fcec8ccec074200361b2327a2381a5412b92497391bb71ec5f154cd1283
4
+ data.tar.gz: 87dcc389df03af0ac59fc0e75bbef7be7071f00613a2662cecf365ba38bc853e
5
5
  SHA512:
6
- metadata.gz: b3b7d07a8016bb7817f20bb22476d1d85bdaad42600234f8cf6a763316b6b128e473bccf17eef5135486c5b67051c3d1f10278214a28ec72cd26da18a4f68c8c
7
- data.tar.gz: 90fe296ae42f058d9a918ba630781edca983ef22314797ef44a82e15c257bf06903c10b0442611e78a9ff654c08978720e1ece4bb3bb13b52f5d3cffe2331122
6
+ metadata.gz: 2b6ceeb10236b639c82398c94d3c1a876eff549a17289ff45abae489e480a3d6a1db45f95a4b2e56f1d1df6afde28159965b188342d1e38c2482359dfb11e061
7
+ data.tar.gz: 0fd4653c2719d1d3c339a904437fd0657f17e49fa0a9d4361a3a259806fab8dc798ed7f179722b41913d3471e8799abf78823f207988987d86cba680d7f90f03
@@ -33,7 +33,10 @@ require "rubygems/package"
33
33
  require "zlib"
34
34
 
35
35
  SCRIPT_DIR = File.dirname(__FILE__)
36
- RAW_DIR = File.join(SCRIPT_DIR, "raw")
36
+ OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
37
+ RAW_DIR = File.join(OUTPUT_DIR, "raw")
38
+
39
+ FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
37
40
  DEFAULT_VAL_RATIO = 0.0005 # ~0.5% for validation
38
41
 
39
42
  def parse_args
@@ -60,11 +63,17 @@ def parse_args
60
63
  end
61
64
 
62
65
  def find_data_files
63
- # Look for various supported formats
66
+ # Look for various supported formats in both SCRIPT_DIR (gem) and OUTPUT_DIR (local)
64
67
  patterns = [
65
68
  File.join(RAW_DIR, "**", "*.parquet"), # Parquet (from Python export)
66
69
  File.join(RAW_DIR, "**", "*.tar"), # Original tar files
67
70
  File.join(RAW_DIR, "**", "*.txt"), # Plain text files
71
+ File.join(OUTPUT_DIR, "*.parquet"),
72
+ File.join(OUTPUT_DIR, "*.tar"),
73
+ File.join(OUTPUT_DIR, "*.txt"),
74
+ File.join(SCRIPT_DIR, "raw", "**", "*.parquet"),
75
+ File.join(SCRIPT_DIR, "raw", "**", "*.tar"),
76
+ File.join(SCRIPT_DIR, "raw", "**", "*.txt"),
68
77
  File.join(SCRIPT_DIR, "*.parquet"),
69
78
  File.join(SCRIPT_DIR, "*.tar"),
70
79
  File.join(SCRIPT_DIR, "*.txt")
@@ -273,8 +282,8 @@ def main
273
282
 
274
283
  # Write binary files
275
284
  puts "Writing binary files..."
276
- write_binary(train_tokens, File.join(SCRIPT_DIR, "train.bin"))
277
- write_binary(val_tokens, File.join(SCRIPT_DIR, "val.bin"))
285
+ write_binary(train_tokens, File.join(OUTPUT_DIR, "train.bin"))
286
+ write_binary(val_tokens, File.join(OUTPUT_DIR, "val.bin"))
278
287
 
279
288
  puts ""
280
289
  puts "Done! OpenWebText dataset prepared."
@@ -11,9 +11,13 @@ require "net/http"
11
11
  require "openssl"
12
12
  require "numo/narray"
13
13
  require "tiktoken_ruby"
14
+ require "fileutils"
14
15
 
15
16
  DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
16
17
  SCRIPT_DIR = File.dirname(__FILE__)
18
+ OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
19
+
20
+ FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
17
21
 
18
22
  def download_file(url)
19
23
  uri = URI(url)
@@ -53,8 +57,8 @@ puts "Val has #{val_ids.length} tokens"
53
57
  # Export to binary files (uint16)
54
58
  train_arr = Numo::UInt16.cast(train_ids)
55
59
  val_arr = Numo::UInt16.cast(val_ids)
56
- File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
57
- File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
60
+ File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
61
+ File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
58
62
 
59
63
  # No meta.json - indicates GPT-2 BPE tokenizer should be used
60
64
  puts "Done! Created train.bin and val.bin"
@@ -8,9 +8,13 @@ require "net/http"
8
8
  require "openssl"
9
9
  require "numo/narray"
10
10
  require "json"
11
+ require "fileutils"
11
12
 
12
13
  DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
13
14
  SCRIPT_DIR = File.dirname(__FILE__)
15
+ OUTPUT_DIR = ENV["NANOGPT_DATA_DIR"] || SCRIPT_DIR
16
+
17
+ FileUtils.mkdir_p(OUTPUT_DIR) if ENV["NANOGPT_DATA_DIR"]
14
18
 
15
19
  def download_file(url)
16
20
  uri = URI(url)
@@ -59,8 +63,8 @@ puts "Val has #{val_ids.length} tokens"
59
63
  # Export to binary files (uint16)
60
64
  train_arr = Numo::UInt16.cast(train_ids)
61
65
  val_arr = Numo::UInt16.cast(val_ids)
62
- File.binwrite(File.join(SCRIPT_DIR, "train.bin"), train_arr.to_binary)
63
- File.binwrite(File.join(SCRIPT_DIR, "val.bin"), val_arr.to_binary)
66
+ File.binwrite(File.join(OUTPUT_DIR, "train.bin"), train_arr.to_binary)
67
+ File.binwrite(File.join(OUTPUT_DIR, "val.bin"), val_arr.to_binary)
64
68
 
65
69
  # Save meta information as JSON
66
70
  meta = {
@@ -68,6 +72,6 @@ meta = {
68
72
  "itos" => itos.transform_keys(&:to_s),
69
73
  "stoi" => stoi
70
74
  }
71
- File.write(File.join(SCRIPT_DIR, "meta.json"), JSON.pretty_generate(meta))
75
+ File.write(File.join(OUTPUT_DIR, "meta.json"), JSON.pretty_generate(meta))
72
76
 
73
77
  puts "Done! Created train.bin, val.bin, and meta.json"
data/exe/nanogpt CHANGED
@@ -64,7 +64,12 @@ class NanoGPTCLI
64
64
  exit 1
65
65
  end
66
66
 
67
+ # Set output directory to current working directory
68
+ output_dir = File.join(Dir.pwd, "data", dataset)
69
+ ENV["NANOGPT_DATA_DIR"] = output_dir
70
+
67
71
  puts "Preparing dataset: #{dataset}"
72
+ puts "Output directory: #{output_dir}"
68
73
  load prepare_script
69
74
  end
70
75
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module NanoGPT
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nanogpt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Hasiński