red-datasets 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1b52a97ab0ea10ea2d5ca5b873adab2e57aa8f8a71093e9e94c13da55bc8f774
4
- data.tar.gz: 5f5525fc4cda3a9a57c08f6855fedd4da3c4be8b72fb7b35cee0e99133492c1f
3
+ metadata.gz: 3b96f5bf8fb7d8d7280451086dda394b65c42023b15ae077167e2d320c4361c1
4
+ data.tar.gz: 96f7936d62d70749f92d3bdd1d7ef2d79cfff3091e7dae8221d6a0537dbd6d7b
5
5
  SHA512:
6
- metadata.gz: ae45fd3d9a6128ddca38c2b37a3a0c8fa89c831bdb1e14c1fdda2183be29385f74df6259ec4ae36934b8c0db57c9d2434208beeeaed854ea254ec6c327f21d64
7
- data.tar.gz: 45209c6e106d78d008e73ab4414a933bc8f7333addf107c773b78f74359ef1fec262d5a22eeb9818db1512e1d0aab30f1baa00944bdd49eb7d8717f02454064b
6
+ metadata.gz: 859196aa39020d924fa7af4df6d96c110f41ac2b90a39dc89ed6935fc64e857b2bffb5776a366660ab61c55a96dd35b9bd6663ec23c7ee4249cae3103bc0a2aa
7
+ data.tar.gz: b07ec53917af58e737058c504685d283850e072f0794c457bd961d39b9815c85b2fc2a9bed4de2a643675dc0e0f7bb2077b4c41b2c28c9c94f948a532baae6bb
data/README.md CHANGED
@@ -18,7 +18,7 @@ You can use datasets easily because you can access each dataset with multiple wa
18
18
 
19
19
  ## Usage
20
20
 
21
- Here is an example to access iris dataset by `#each`:
21
+ Here is an example of accessing the [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) with `#each`, `Table#to_h`, or `Table#fetch_values`:
22
22
 
23
23
  ```ruby
24
24
  require "datasets"
@@ -32,12 +32,40 @@ iris.each do |record|
32
32
  record.petal_width,
33
33
  record.class,
34
34
  ]
35
- # [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
36
- # [7.0, 3.2, 4.7, 1.4, "Iris-versicolor"]
37
35
  end
36
+ # => [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
37
+ # => [4.9, 3.0, 1.4, 0.2, "Iris-setosa"]
38
+ # :
39
+ # => [7.0, 3.2, 4.7, 1.4, "Iris-versicolor"]
40
+
41
+
42
+ iris_hash = iris.to_table.to_h
43
+ p iris_hash[:sepal_length]
44
+ # => [5.1, 4.9, .. , 7.0, ..
45
+ p iris_hash[:sepal_width]
46
+ # => [3.5, 3.0, .. , 3.2, ..
47
+ p iris_hash[:petal_length]
48
+ # => [1.4, 1.4, .. , 4.7, ..
49
+ p iris_hash[:petal_width]
50
+ # => [0.2, 0.2, .. , 1.4, ..
51
+ p iris_hash[:class]
52
+ # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
53
+
54
+
55
+ iris_table = iris.to_table
56
+ p iris_table.fetch_values(:sepal_length, :sepal_width, :petal_length, :petal_width).transpose
57
+ # => [[5.1, 3.5, 1.4, 0.2],
58
+ #     [4.9, 3.0, 1.4, 0.2],
59
+ #     :
60
+ #     [7.0, 3.2, 4.7, 1.4],
61
+ #     :
62
+
63
+ p iris_table[:class]
64
+ # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
38
65
  ```
39
66
 
40
- Here is an example to access CIFAR dataset by `#each`:
67
+
68
+ Here is an example of accessing [the CIFAR-10/100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html) with `#each`:
41
69
 
42
70
  **CIFAR-10**
43
71
 
@@ -73,6 +101,24 @@ cifar.each do |record|
73
101
  end
74
102
  ```
75
103
 
104
+ **MNIST**
105
+
106
+ ```ruby
107
+ require "datasets"
108
+
109
+ mnist = Datasets::MNIST.new(type: :train)
110
+ mnist.metadata
111
+ #=> #<struct Datasets::Metadata id="mnist-train", name="MNIST: train", url="http://yann.lecun.com/exdb/mnist/", licenses=nil, description="a training set of 60,000 examples">
112
+
113
+ mnist.each do |record|
114
+ p record.pixels
115
+ # => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, .....]
116
+ p record.label
117
+ # => 5
118
+ end
119
+ ```
120
+
121
+
76
122
  ## License
77
123
 
78
124
  The MIT license. See `LICENSE.txt` for details.
data/doc/text/news.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # News
2
2
 
3
+ ## 0.0.6 - 2018-07-25
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::MNIST`: Added.
8
+
9
+ * `Datasets::PennTreebank`: Added.
10
+
3
11
  ## 0.0.5 - 2018-06-06
4
12
 
5
13
  ### Improvements
@@ -27,14 +27,17 @@ module Datasets
27
27
 
28
28
  def initialize(n_classes: 10, type: :train)
29
29
  unless [10, 100].include?(n_classes)
30
- raise 'Please set n_classes 10 or 100'
30
+ message = "Please set n_classes 10 or 100: #{n_classes.inspect}"
31
+ raise ArgumentError, message
31
32
  end
32
33
  unless [:train, :test].include?(type)
33
- raise 'Please set type :train or :test'
34
+ message = "Please set type :train or :test: #{type.inspect}"
35
+ raise ArgumentError, message
34
36
  end
35
37
 
36
38
  super()
37
39
 
40
+ @metadata.id = "cifar-#{n_classes}"
38
41
  @metadata.name = "CIFAR-#{n_classes}"
39
42
  @metadata.url = "https://www.cs.toronto.edu/~kriz/cifar.html"
40
43
  @metadata.description = "CIFAR-#{n_classes} is 32x32 image dataset"
@@ -27,7 +27,7 @@ module Datasets
27
27
  else
28
28
  base_dir = ENV["XDG_CACHE_HOME"] || "~/.cache"
29
29
  end
30
- Pathname(base_dir).expand_path + "red-datasets" + metadata.name
30
+ Pathname(base_dir).expand_path + "red-datasets" + metadata.id
31
31
  end
32
32
 
33
33
  def download(output_path, url)
data/lib/datasets/iris.rb CHANGED
@@ -12,7 +12,8 @@ module Datasets
12
12
 
13
13
  def initialize
14
14
  super()
15
- @metadata.name = "iris"
15
+ @metadata.id = "iris"
16
+ @metadata.name = "Iris"
16
17
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
17
18
  @metadata.description = lambda do
18
19
  read_names
@@ -1,5 +1,6 @@
1
1
  module Datasets
2
- class Metadata < Struct.new(:name,
2
+ class Metadata < Struct.new(:id,
3
+ :name,
3
4
  :url,
4
5
  :licenses,
5
6
  :description)
@@ -0,0 +1,108 @@
1
+ require 'zlib'
2
+
3
+ require_relative "dataset"
4
+
5
# NOTE(review): SetTypeError is not raised or rescued anywhere in the visible
# code. It is kept so that any external reference to the constant keeps
# resolving -- confirm whether it can be removed.
class SetTypeError < StandardError; end

module Datasets
  # Reader for the MNIST data files. Depending on +type+ it iterates over the
  # training or the test split; the gzip-compressed image and label files are
  # downloaded into the cache directory when they are not present there yet.
  class MNIST < Dataset
    # One sample: +data+ is the raw image byte string as read from the image
    # file, +label+ is the matching entry of the label file.
    class Record < Struct.new(:data, :label)
      # Returns the image bytes decoded as an Array of unsigned 8-bit integers.
      def pixels
        data.unpack("C*")
      end

      # Returns the struct members as a Hash with a decoded :pixels entry added.
      def to_h
        hash = super
        hash[:pixels] = pixels
        hash
      end
    end

    # Magic numbers expected in the first header field of each file kind.
    IMAGE_MAGIC_NUMBER = 2051
    LABEL_MAGIC_NUMBER = 2049

    # Remote/cache file names, keyed by dataset type and then by file kind.
    FILE_NAMES = {
      train: {
        image: "train-images-idx3-ubyte.gz",
        label: "train-labels-idx1-ubyte.gz",
      },
      test: {
        image: "t10k-images-idx3-ubyte.gz",
        label: "t10k-labels-idx1-ubyte.gz",
      },
    }.freeze

    # @param type [Symbol] :train or :test
    # @raise [ArgumentError] when +type+ is neither :train nor :test
    def initialize(type: :train)
      unless [:train, :test].include?(type)
        raise ArgumentError, "Please set type :train or :test: #{type.inspect}"
      end

      super()

      @metadata.id = "mnist-#{type}"
      @metadata.name = "MNIST: #{type}"
      @metadata.url = "http://yann.lecun.com/exdb/mnist/"
      @type = type

      case type
      when :train
        @metadata.description = "a training set of 60,000 examples"
      when :test
        @metadata.description = "a test set of 10,000 examples"
      end
    end

    # Yields one Record per image. Without a block an Enumerator is returned.
    # Files missing from the cache directory are downloaded first.
    def each(&block)
      return to_enum(__method__) unless block_given?

      image_path = cache_dir_path + target_file(:image)
      label_path = cache_dir_path + target_file(:label)
      base_url = "http://yann.lecun.com/exdb/mnist/"

      unless image_path.exist?
        download(image_path, base_url + target_file(:image))
      end

      unless label_path.exist?
        download(label_path, base_url + target_file(:label))
      end

      open_data(image_path, label_path, &block)
    end

    private
    # Reads all labels, then streams the image file and yields a Record for
    # every image together with the label at the same index.
    #
    # @raise [RuntimeError] when the image file header is missing, truncated
    #   or does not start with the image magic number
    def open_data(image_path, label_path, &block)
      labels = parse_labels(label_path)

      Zlib::GzipReader.open(image_path) do |f|
        # Image header fields: magic number, image count, rows, columns.
        magic, n_images, n_rows, n_cols =
          read_header(f, 4, 'This is not MNIST image file')
        raise 'This is not MNIST image file' if magic != IMAGE_MAGIC_NUMBER
        image_size = n_rows * n_cols
        n_images.times do |i|
          data = f.read(image_size)
          yield Record.new(data, labels[i])
        end
      end
    end

    # Returns the file name for +data+ (:image or :label) of the current
    # type, or nil when there is no entry for the combination.
    def target_file(data)
      names = FILE_NAMES[@type]
      names && names[data]
    end

    # Returns the labels stored in the gzip file at +file_path+ as an Array
    # of Integers.
    #
    # @raise [RuntimeError] when the header is missing, truncated or does not
    #   start with the label magic number
    def parse_labels(file_path)
      Zlib::GzipReader.open(file_path) do |f|
        # Label header fields: magic number, label count.
        magic, n_labels = read_header(f, 2, 'This is not MNIST label file')
        raise 'This is not MNIST label file' if magic != LABEL_MAGIC_NUMBER
        f.read(n_labels).unpack('C*')
      end
    end

    # Reads +n_uint32s+ 32-bit big-endian unsigned integers from +io+.
    # Raises +error_message+ when fewer bytes are available, so that an empty
    # or truncated file is reported as an invalid file instead of failing
    # later with NoMethodError on nil.
    def read_header(io, n_uint32s, error_message)
      n_bytes = n_uint32s * 4
      header = io.read(n_bytes)
      raise error_message if header.nil? || header.bytesize < n_bytes
      header.unpack("N#{n_uint32s}")
    end
  end
end
@@ -0,0 +1,65 @@
1
+ require_relative "dataset"
2
+
3
module Datasets
  # Word-level reader for the Penn Treebank text files. Every whitespace
  # separated token of the selected split is yielded together with an integer
  # id that is assigned in order of first appearance.
  class PennTreebank < Dataset
    # A token and the id assigned to that word.
    Record = Struct.new(:word, :id)

    DESCRIPTION = <<~DESC
      `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
      corpus of English sentences with linguistic structure annotations. This
      function uses a variant distributed at
      `https://github.com/wojzaremba/lstm <https://github.com/wojzaremba/lstm>`_,
      which omits the annotation and splits the dataset into three parts:
      training, validation, and test.
    DESC

    # @param type [Symbol] :train, :test or :valid
    # @raise [ArgumentError] when +type+ is none of the accepted values
    def initialize(type: :train)
      known_types = [:train, :test, :valid]
      unless known_types.include?(type)
        listing = known_types.map(&:inspect).join(", ")
        raise ArgumentError,
              "Type must be one of [#{listing}]: #{type.inspect}"
      end
      @type = type

      super()

      @metadata.id = "penn-treebank-#{@type}"
      @metadata.name = "Penn Treebank: #{@type}"
      @metadata.url = "https://github.com/wojzaremba/lstm"
      @metadata.licenses = ["Apache-2.0"]
      @metadata.description = DESCRIPTION
    end

    # Yields one Record per token. Without a block an Enumerator is returned.
    # The text file is downloaded when it is not in the cache directory yet.
    def each(&block)
      return to_enum(__method__) unless block_given?

      file_name = "ptb.#{@type}.txt"
      local_path = cache_dir_path + file_name
      unless local_path.exist?
        remote_dir = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
        download(local_path, "#{remote_dir}/#{file_name}")
      end

      parse_data(local_path, &block)
    end

    private
    # Walks the file line by line and yields a Record for each token.
    def parse_data(data_path)
      ids = {}
      File.open(data_path) do |input|
        input.each_line do |line|
          line.split.each do |token|
            token = token.strip
            # A word seen for the first time gets the next free id, which is
            # the number of distinct words registered so far.
            ids[token] ||= ids.size
            yield(Record.new(token, ids[token]))
          end
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -30,14 +30,15 @@ module Datasets
30
30
  super()
31
31
  @language = language
32
32
  @type = type
33
- @metadata.name = "wikipedia-#{@language}-#{@type}"
33
+ @metadata.id = "wikipedia-#{@language}-#{@type}"
34
+ @metadata.name = "Wikipedia #{@type} (#{@language})"
34
35
  @metadata.url = "https://dumps.wikimedia.org/"
35
36
  @metadata.licenses = [
36
37
  "CC-BY-SA-3.0",
37
38
  "CC-BY-SA-4.0",
38
39
  "GFDL-1.3-or-later",
39
40
  ]
40
- @metadata.description = "Wikipedia #{@type} (#{@language})"
41
+ @metadata.description = "Wikipedia #{@type} in #{@language}"
41
42
  end
42
43
 
43
44
  def each(&block)
data/lib/datasets.rb CHANGED
@@ -2,4 +2,6 @@ require "datasets/version"
2
2
 
3
3
  require "datasets/cifar"
4
4
  require "datasets/iris"
5
+ require "datasets/mnist"
6
+ require "datasets/penn-treebank"
5
7
  require "datasets/wikipedia"
data/test/helper.rb CHANGED
@@ -13,6 +13,7 @@ module Helper
13
13
  end
14
14
 
15
15
  def teardown_sandbox
16
+ return unless defined?(@tmp_dir)
16
17
  FileUtils.rm_rf(@tmp_dir)
17
18
  end
18
19
  end
data/test/test-cifar.rb CHANGED
@@ -216,4 +216,14 @@ class CIFARTest < Test::Unit::TestCase
216
216
  end
217
217
  end
218
218
  end
219
+
220
+ sub_test_case("invalid") do
221
+ test("type") do
222
+ invalid_type = :invalid
223
+ message = "Please set type :train or :test: #{invalid_type.inspect}"
224
+ assert_raise(ArgumentError.new(message)) do
225
+ Datasets::CIFAR.new(type: invalid_type)
226
+ end
227
+ end
228
+ end
219
229
  end
@@ -0,0 +1,111 @@
1
# Tests Datasets::MNIST against small synthetic data files written into a
# sandbox directory, so that no network access is needed.
class MNISTTest < Test::Unit::TestCase
  include Helper::Sandbox

  sub_test_case("Normal") do
    # Redirects the dataset's cache directory to the sandbox and replaces
    # #download with a stub that writes a synthetic gzip file: 10 all-zero
    # 28x28 images, or 10 labels that are all 1.
    def setup_data
      setup_sandbox

      def @dataset.cache_dir_path
        @cache_dir_path
      end

      def @dataset.cache_dir_path=(path)
        @cache_dir_path = path
      end
      @dataset.cache_dir_path = @tmp_dir

      def @dataset.download(output_path, url)
        image_magic_number = 2051
        label_magic_number = 2049
        n_image, image_size_x, image_size_y, label = 10, 28, 28, 1

        Zlib::GzipWriter.open(output_path) do |gz|
          # The payload is binary, so it is written with #write: #puts would
          # append a "\n" byte that is not part of the file format.
          if output_path.basename.to_s.include?("-images-")
            image_data = ([image_magic_number, n_image]).pack('N2') +
                         ([image_size_x,image_size_y]).pack('N2') +
                         ([0] * image_size_x * image_size_y).pack("C*") * n_image
            gz.write(image_data)
          else
            label_data = ([label_magic_number, n_image]).pack('N2') +
                         ([label] * n_image).pack("C*")
            gz.write(label_data)
          end
        end
      end
    end

    def teardown
      teardown_sandbox
    end

    sub_test_case("train") do
      def setup
        @dataset = Datasets::MNIST.new(type: :train)
        setup_data()
      end

      test("#each") do
        raw_dataset = @dataset.collect do |record|
          {
            :label => record.label,
            :pixels => record.pixels
          }
        end

        assert_equal([
                       {
                         :label => 1,
                         :pixels => [0] * 28 * 28
                       }
                     ] * 10,
                     raw_dataset)
      end

      test("#to_table") do
        table_data = @dataset.to_table
        assert_equal([[0] * 28 * 28] * 10,
                     table_data[:pixels])
      end
    end

    sub_test_case("test") do
      def setup
        @dataset = Datasets::MNIST.new(type: :test)
        setup_data()
      end

      test("#each") do
        raw_dataset = @dataset.collect do |record|
          {
            :label => record.label,
            :pixels => record.pixels
          }
        end

        assert_equal([
                       {
                         :label => 1,
                         :pixels => [0] * 28 * 28
                       }
                     ] * 10,
                     raw_dataset)
      end

      test("#to_table") do
        table_data = @dataset.to_table
        assert_equal([[0] * 28 * 28] * 10,
                     table_data[:pixels])
      end
    end
  end

  sub_test_case("Abnormal") do
    # An unknown type must be rejected by the constructor.
    test("invalid type") do
      invalid_type = :invalid
      message = "Please set type :train or :test: #{invalid_type.inspect}"
      assert_raise(ArgumentError.new(message)) do
        Datasets::MNIST.new(type: invalid_type)
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
# Checks, for each dataset type, the number of records plus the first and
# the last record, and that an unknown type is rejected.
class PennTreebankTest < Test::Unit::TestCase
  # Shorthand for building an expected record.
  def record(*args)
    Datasets::PennTreebank::Record.new(*args)
  end

  sub_test_case("type") do
    test("train") do
      records = Datasets::PennTreebank.new(type: :train).to_a
      assert_equal([887521, record("aer", 0), record("<unk>", 25)],
                   [records.size, records[0], records[-1]])
    end

    test("test") do
      records = Datasets::PennTreebank.new(type: :test).to_a
      assert_equal([78669, record("no", 0), record("us", 953)],
                   [records.size, records[0], records[-1]])
    end

    test("valid") do
      records = Datasets::PennTreebank.new(type: :valid).to_a
      assert_equal([70390, record("consumers", 0), record("N", 28)],
                   [records.size, records[0], records[-1]])
    end

    test("invalid") do
      expected = ArgumentError.new(
        "Type must be one of [:train, :test, :valid]: :invalid"
      )
      assert_raise(expected) do
        Datasets::PennTreebank.new(type: :invalid)
      end
    end
  end
end
@@ -83,13 +83,18 @@ class WikipediaTest < Test::Unit::TestCase
83
83
  end
84
84
 
85
85
  sub_test_case("#metadata") do
86
- test("#name") do
86
+ test("#id") do
87
87
  assert_equal("wikipedia-ja-articles",
88
+ @dataset.metadata.id)
89
+ end
90
+
91
+ test("#name") do
92
+ assert_equal("Wikipedia articles (ja)",
88
93
  @dataset.metadata.name)
89
94
  end
90
95
 
91
96
  test("#description") do
92
- assert_equal("Wikipedia articles (ja)",
97
+ assert_equal("Wikipedia articles in ja",
93
98
  @dataset.metadata.description)
94
99
  end
95
100
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-06-06 00:00:00.000000000 Z
12
+ date: 2018-07-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -104,6 +104,8 @@ files:
104
104
  - lib/datasets/downloader.rb
105
105
  - lib/datasets/iris.rb
106
106
  - lib/datasets/metadata.rb
107
+ - lib/datasets/mnist.rb
108
+ - lib/datasets/penn-treebank.rb
107
109
  - lib/datasets/table.rb
108
110
  - lib/datasets/version.rb
109
111
  - lib/datasets/wikipedia.rb
@@ -112,6 +114,8 @@ files:
112
114
  - test/run-test.rb
113
115
  - test/test-cifar.rb
114
116
  - test/test-iris.rb
117
+ - test/test-mnist.rb
118
+ - test/test-penn-treebank.rb
115
119
  - test/test-table.rb
116
120
  - test/test-wikipedia.rb
117
121
  homepage: https://github.com/red-data-tools/red-datasets
@@ -139,9 +143,11 @@ signing_key:
139
143
  specification_version: 4
140
144
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
141
145
  test_files:
142
- - test/test-iris.rb
146
+ - test/run-test.rb
147
+ - test/test-cifar.rb
143
148
  - test/test-wikipedia.rb
149
+ - test/test-iris.rb
144
150
  - test/helper.rb
145
- - test/run-test.rb
151
+ - test/test-mnist.rb
146
152
  - test/test-table.rb
147
- - test/test-cifar.rb
153
+ - test/test-penn-treebank.rb