red-datasets 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3b96f5bf8fb7d8d7280451086dda394b65c42023b15ae077167e2d320c4361c1
4
- data.tar.gz: 96f7936d62d70749f92d3bdd1d7ef2d79cfff3091e7dae8221d6a0537dbd6d7b
3
+ metadata.gz: 222271b814e3a5ce23b5e0dd1d2578bffb84afdab10110b0869985c6056bfd3b
4
+ data.tar.gz: ac30931b3317ab04afd394b28a45a9206c784d78b3bcaf98fc3a2a48227c7930
5
5
  SHA512:
6
- metadata.gz: 859196aa39020d924fa7af4df6d96c110f41ac2b90a39dc89ed6935fc64e857b2bffb5776a366660ab61c55a96dd35b9bd6663ec23c7ee4249cae3103bc0a2aa
7
- data.tar.gz: b07ec53917af58e737058c504685d283850e072f0794c457bd961d39b9815c85b2fc2a9bed4de2a643675dc0e0f7bb2077b4c41b2c28c9c94f948a532baae6bb
6
+ metadata.gz: 8a94a3d66baaed4948904e97dc53100d73ae96c528c09b02252caabd05b8545587abf6fbcba3a578725812327a9a2c8827bbb7e283ccd3d7e66753bf30035e2e
7
+ data.tar.gz: 2ab44b5aa3ee5da0ac8e8307546c71942938de4497bfec05fc929715a4e5ef6df1cb091bce0d5f12978582d2c9fa7eaffff9edd54be0d845627dccfce42a63dd
data/README.md CHANGED
@@ -30,7 +30,7 @@ iris.each do |record|
30
30
  record.sepal_width,
31
31
  record.petal_length,
32
32
  record.petal_width,
33
- record.class,
33
+ record.label,
34
34
  ]
35
35
  end
36
36
  # => [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
@@ -48,7 +48,7 @@ p iris_hash[:petal_length]
48
48
  # => [1.4, 1.4, .. , 4.7, ..
49
49
  p iris_hash[:petal_width]
50
50
  # => [0.2, 0.2, .. , 1.4, ..
51
- p iris_hash[:class]
51
+ p iris_hash[:label]
52
52
  # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
53
53
 
54
54
 
@@ -60,7 +60,7 @@ p iris_table.fetch_values(:sepal_length, :sepal_width, :petal_length, :petal_wid
60
60
  [7.0, 3.2, 4.7, 1.4],
61
61
  :
62
62
 
63
- p iris_table[:class]
63
+ p iris_table[:label]
64
64
  # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
65
65
  ```
66
66
 
data/doc/text/news.md CHANGED
@@ -1,5 +1,36 @@
1
1
  # News
2
2
 
3
+ ## 0.0.7 - 2018-11-21
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::Table#dictionary_encode`: Added.
8
+ [GitHub#22]
9
+
10
+ * `Datasets::Table#label_encode`: Added.
11
+
12
+ * `Datasets::Dictionary`: Added.
13
+
14
+ * `Datasets::Wine`: Added.
15
+ [GitHub#26][Patch by Ryuta Suzuki]
16
+
17
+ * `Datasets::FashionMNIST`: Added.
18
+ [GitHub#27][Patch by chimame]
19
+
20
+ * `Datasets::Iris::Record#label`: Renamed from `#class`. This is an
21
+ incompatible change.
22
+
23
+ * `Datasets::Adult`: Added.
24
+ [GitHub#30][Patch by Yasuo Honda]
25
+
26
+ ### Thanks
27
+
28
+ * Ryuta Suzuki
29
+
30
+ * chimame
31
+
32
+ * Yasuo Honda
33
+
3
34
  ## 0.0.6 - 2018-07-25
4
35
 
5
36
  ### Improvements
data/lib/datasets.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  require "datasets/version"
2
2
 
3
+ require "datasets/adult"
3
4
  require "datasets/cifar"
5
+ require "datasets/fashion-mnist"
4
6
  require "datasets/iris"
5
7
  require "datasets/mnist"
6
8
  require "datasets/penn-treebank"
7
9
  require "datasets/wikipedia"
10
+ require "datasets/wine"
@@ -0,0 +1,83 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
module Datasets
  # The "Adult" dataset from the UCI Machine Learning Repository.
  #
  # The dataset is split into two files selected by +type+: +:train+
  # (+adult.data+) and +:test+ (+adult.test+). Files are downloaded into the
  # cache directory the first time they are needed and then read as CSV.
  class Adult < Dataset
    # One row of the dataset. Values are assigned positionally from each
    # CSV row (see #each), so the member order must match the column order
    # of the downloaded file.
    Record = Struct.new(
      :age,
      :work_class,
      :final_weight,
      :education,
      :n_education_years,
      :marital_status,
      :occupation,
      :relationship,
      :race,
      :sex,
      :capital_gain,
      :capital_loss,
      :hours_per_week,
      :native_country,
      :label
    )

    # @param type [Symbol] which part of the dataset to read:
    #   +:train+ or +:test+.
    # @raise [ArgumentError] if +type+ is neither +:train+ nor +:test+.
    def initialize(type: :train)
      unless [:train, :test].include?(type)
        # Report the rejected value, like the MNIST family of datasets does.
        raise ArgumentError,
              "Please set type :train or :test: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = "adult-#{@type}"
      @metadata.name = "Adult: #{@type}"
      @metadata.url = "http://archive.ics.uci.edu/ml/datasets/adult"
      # Kept as a lambda so that adult.names is only fetched when the
      # description is actually evaluated.
      @metadata.description = lambda do
        read_names
      end
    end

    # Yields one Record per data row.
    #
    # @yieldparam record [Record]
    # @return [Enumerator] when called without a block.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          # Rows whose first field is nil (e.g. blank lines) carry no data.
          next if row[0].nil?
          yield(Record.new(*row))
        end
      end
    end

    private
    # Downloads the data file for @type if it is not cached yet and yields
    # an open CSV reader for it.
    def open_data
      case @type
      when :train
        ext = "data"
      when :test
        ext = "test"
      end
      data_path = cache_dir_path + "adult-#{ext}.csv"
      unless data_path.exist?
        # Same scheme as the names file fetched from this host in
        # #read_names.
        data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
        download(data_path, data_url)
      end
      # Options are passed as keywords: a positional Hash would be taken as
      # the +mode+ argument of CSV.open on Ruby 3.
      #
      # * :numeric runs first; the strip converter then only sees the
      #   fields that are still Strings.
      # * Lines starting with "|" are skipped as comments.
      CSV.open(data_path,
               converters: [:numeric, lambda {|f| f.strip}],
               skip_lines: /\A\|/) do |csv|
        yield(csv)
      end
    end

    # Returns the content of adult.names, downloading it into the cache
    # directory first if needed.
    def read_names
      names_path = cache_dir_path + "adult.names"
      unless names_path.exist?
        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
        download(names_path, names_url)
      end
      names_path.read
    end
  end
end
@@ -0,0 +1,59 @@
1
module Datasets
  # A bidirectional mapping between distinct values and integer IDs.
  #
  # IDs are assigned from 0 in order of first appearance in the values given
  # to the constructor. Enumerating a dictionary yields +[id, value]+ pairs.
  class Dictionary
    include Enumerable

    # @param values [#each] values to register; duplicates are ignored.
    def initialize(values)
      build_dictionary(values)
    end

    # @return [Integer, nil] the ID of +value+, or +nil+ if it is unknown.
    def id(value)
      @value_to_id[value]
    end

    # @return [Object, nil] the value registered under +id+, or +nil+ if
    #   there is none.
    def value(id)
      @id_to_value[id]
    end

    # @return [Array<Integer>] all IDs in assignment order.
    def ids
      @id_to_value.keys
    end

    # @return [Array] all distinct values in ID order.
    def values
      @id_to_value.values
    end

    # Yields each +[id, value]+ pair in ID order.
    #
    # @return [self] when a block is given, so the internal Hash is not
    #   handed out to callers.
    # @return [Enumerator] when called without a block.
    def each(&block)
      return to_enum(__method__) { size } unless block

      @id_to_value.each(&block)
      self
    end

    # @return [Integer] the number of distinct values.
    def size
      @id_to_value.size
    end
    alias_method :length, :size

    # Maps each value to its ID; unknown values map to +nil+.
    #
    # @param values [#collect]
    # @return [Array<Integer, nil>]
    def encode(values)
      values.collect do |value|
        id(value)
      end
    end

    # Maps each ID back to its value; unknown IDs map to +nil+.
    #
    # @param ids [#collect]
    # @return [Array]
    def decode(ids)
      ids.collect do |id|
        value(id)
      end
    end

    private
    # Fills both lookup tables, giving each new value the next free ID.
    def build_dictionary(values)
      @id_to_value = {}
      @value_to_id = {}
      next_id = 0
      values.each do |value|
        next if @value_to_id.key?(value)
        @id_to_value[next_id] = value
        @value_to_id[value] = next_id
        next_id += 1
      end
    end
  end
end
@@ -3,7 +3,7 @@ begin
3
3
  require "io/console"
4
4
  rescue LoadError
5
5
  end
6
- require "open-uri"
6
+ require "net/http"
7
7
  require "pathname"
8
8
 
9
9
  module Datasets
@@ -15,84 +15,57 @@ module Datasets
15
15
  url = URI.parse(url)
16
16
  end
17
17
  @url = url
18
- @url.extend(CurrentBufferReadable)
18
+ unless @url.is_a?(URI::HTTP)
19
+ raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
20
+ end
19
21
  end
20
22
 
21
23
  def download(output_path)
22
24
  output_path.parent.mkpath
23
25
 
26
+ headers = {"User-Agent" => "Red Datasets/#{VERSION}"}
24
27
  start = nil
25
28
  partial_output_path = Pathname.new("#{output_path}.partial")
26
29
  if partial_output_path.exist?
27
30
  start = partial_output_path.size
31
+ headers["Range"] = "bytes=#{start}-"
28
32
  end
29
33
 
30
- progress_reporter = nil
31
- content_length_proc = lambda do |content_length|
32
- base_name = @url.path.split("/").last
33
- size_max = content_length
34
- size_max += start if start
35
- progress_reporter = ProgressReporter.new(base_name, size_max)
36
- end
37
- progress_proc = lambda do |size_current|
38
- size_current += start if start
39
- progress_reporter.report(size_current) if progress_reporter
40
- end
41
- options = {
42
- :content_length_proc => content_length_proc,
43
- :progress_proc => progress_proc,
44
- }
45
- if start
46
- options["Range"] = "bytes=#{start}-"
47
- end
34
+ Net::HTTP.start(@url.hostname,
35
+ @url.port,
36
+ :use_ssl => (@url.scheme == "https")) do |http|
37
+ request = Net::HTTP::Get.new(@url.path, headers)
38
+ http.request(request) do |response|
39
+ case response
40
+ when Net::HTTPPartialContent
41
+ mode = "ab"
42
+ when Net::HTTPSuccess
43
+ start = nil
44
+ mode = "wb"
45
+ else
46
+ break
47
+ end
48
48
 
49
- begin
50
- @url.open(options) do |input|
51
- copy_stream(input, partial_output_path)
52
- end
53
- rescue Interrupt, Net::ReadTimeout
54
- if @url.current_buffer
55
- input = @url.current_buffer.io
56
- input.rewind
57
- copy_stream(input, partial_output_path)
49
+ base_name = @url.path.split("/").last
50
+ size_current = 0
51
+ size_max = response.content_length
52
+ if start
53
+ size_current += start
54
+ size_max += start
55
+ end
56
+ progress_reporter = ProgressReporter.new(base_name, size_max)
57
+ partial_output_path.open(mode) do |output|
58
+ response.read_body do |chunk|
59
+ size_current += chunk.bytesize
60
+ progress_reporter.report(size_current)
61
+ output.write(chunk)
62
+ end
63
+ end
58
64
  end
59
- raise
60
65
  end
61
-
62
66
  FileUtils.mv(partial_output_path, output_path)
63
67
  end
64
68
 
65
- private
66
- def copy_stream(input, partial_output_path)
67
- if partial_output_path.exist?
68
- # TODO: It's better that we use "206 Partial Content" response
69
- # to detect partial response.
70
- partial_head = partial_output_path.open("rb") do |partial_output|
71
- partial_output.read(256)
72
- end
73
- input_head = input.read(partial_head.bytesize)
74
- input.rewind
75
- if partial_head == input_head
76
- mode = "wb"
77
- else
78
- mode = "ab"
79
- end
80
- else
81
- mode = "wb"
82
- end
83
- partial_output_path.open(mode) do |partial_output|
84
- IO.copy_stream(input, partial_output)
85
- end
86
- end
87
-
88
- module CurrentBufferReadable
89
- attr_reader :current_buffer
90
- def buffer_open(buffer, proxy, options)
91
- @current_buffer = buffer
92
- super
93
- end
94
- end
95
-
96
69
  class ProgressReporter
97
70
  def initialize(base_name, size_max)
98
71
  @base_name = base_name
@@ -0,0 +1,12 @@
1
+ require_relative 'mnist'
2
+
3
module Datasets
  # Fashion-MNIST dataset.
  #
  # Inherits all reading logic from MNIST and only overrides the base URL
  # the files are fetched from and the name used in metadata and error
  # messages.
  class FashionMNIST < MNIST
    BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"

    private

    # Name used by MNIST to build the metadata ID/name and error messages.
    def dataset_name
      "Fashion-MNIST"
    end
  end
end
data/lib/datasets/iris.rb CHANGED
@@ -8,7 +8,7 @@ module Datasets
8
8
  :sepal_width,
9
9
  :petal_length,
10
10
  :petal_width,
11
- :class)
11
+ :label)
12
12
 
13
13
  def initialize
14
14
  super()
@@ -6,6 +6,7 @@ class SetTypeError < StandardError; end
6
6
 
7
7
  module Datasets
8
8
  class MNIST < Dataset
9
+ BASE_URL = "http://yann.lecun.com/exdb/mnist/"
9
10
 
10
11
  class Record < Struct.new(:data, :label)
11
12
  def pixels
@@ -26,9 +27,9 @@ module Datasets
26
27
 
27
28
  super()
28
29
 
29
- @metadata.id = "mnist-#{type}"
30
- @metadata.name = "MNIST: #{type}"
31
- @metadata.url = "http://yann.lecun.com/exdb/mnist/"
30
+ @metadata.id = "#{dataset_name.downcase}-#{type}"
31
+ @metadata.name = "#{dataset_name}: #{type}"
32
+ @metadata.url = self.class::BASE_URL
32
33
  @type = type
33
34
 
34
35
  case type
@@ -44,7 +45,7 @@ module Datasets
44
45
 
45
46
  image_path = cache_dir_path + target_file(:image)
46
47
  label_path = cache_dir_path + target_file(:label)
47
- base_url = "http://yann.lecun.com/exdb/mnist/"
48
+ base_url = self.class::BASE_URL
48
49
 
49
50
  unless image_path.exist?
50
51
  download(image_path, base_url + target_file(:image))
@@ -66,7 +67,7 @@ module Datasets
66
67
  n_bytes = n_uint32s * 4
67
68
  mnist_magic_number = 2051
68
69
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
69
- raise 'This is not MNIST image file' if magic != mnist_magic_number
70
+ raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
70
71
  n_images.times do |i|
71
72
  data = f.read(n_rows * n_cols)
72
73
  label = labels[i]
@@ -100,9 +101,13 @@ module Datasets
100
101
  n_bytes = n_uint32s * 2
101
102
  mnist_magic_number = 2049
102
103
  magic, n_labels = f.read(n_bytes).unpack('N2')
103
- raise 'This is not MNIST label file' if magic != mnist_magic_number
104
+ raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
105
  f.read(n_labels).unpack('C*')
105
106
  end
106
107
  end
108
+
109
+ def dataset_name
110
+ "MNIST"
111
+ end
107
112
  end
108
113
  end
@@ -2,7 +2,7 @@ require_relative "dataset"
2
2
 
3
3
  module Datasets
4
4
  class PennTreebank < Dataset
5
- Record = Struct.new(:word, :id)
5
+ Record = Struct.new(:word)
6
6
 
7
7
  DESCRIPTION = <<~DESC
8
8
  `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
@@ -46,17 +46,10 @@ module Datasets
46
46
 
47
47
  private
48
48
  def parse_data(data_path)
49
- index = 0
50
- vocabulary = {}
51
49
  File.open(data_path) do |f|
52
50
  f.each_line do |line|
53
51
  line.split.each do |word|
54
- word = word.strip
55
- unless vocabulary.key?(word)
56
- vocabulary[word] = index
57
- index += 1
58
- end
59
- yield(Record.new(word, vocabulary[word]))
52
+ yield(Record.new(word.strip))
60
53
  end
61
54
  end
62
55
  end
@@ -1,9 +1,12 @@
1
+ require "datasets/dictionary"
2
+
1
3
  module Datasets
2
4
  class Table
3
5
  include Enumerable
4
6
 
5
7
  def initialize(dataset)
6
8
  @dataset = dataset
9
+ @dictionaries = {}
7
10
  end
8
11
 
9
12
  def each(&block)
@@ -11,7 +14,16 @@ module Datasets
11
14
  end
12
15
 
13
16
  def [](name)
14
- columner_data[name.to_sym]
17
+ columner_data[normalize_name(name)]
18
+ end
19
+
20
+ def dictionary_encode(name)
21
+ @dictionaries[normalize_name(name)] ||= Dictionary.new(self[name])
22
+ end
23
+
24
+ def label_encode(name)
25
+ dictionary = dictionary_encode(name)
26
+ dictionary.encode(self[name])
15
27
  end
16
28
 
17
29
  def fetch_values(*keys)
@@ -55,5 +67,9 @@ module Datasets
55
67
  def columner_data
56
68
  @columns ||= to_h
57
69
  end
70
+
71
+ def normalize_name(name)
72
+ name.to_sym
73
+ end
58
74
  end
59
75
  end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -0,0 +1,64 @@
1
+ require 'csv'
2
+
3
+ require_relative 'dataset'
4
+
5
module Datasets
  # The "Wine" dataset from the UCI Machine Learning Repository.
  #
  # Both the data file and its description are downloaded into the cache
  # directory on first use.
  class Wine < Dataset
    # One row of the dataset. Values are assigned positionally from each
    # CSV row (see #each), so the member order must match the column order
    # of wine.data.
    Record = Struct.new(
      :label,
      :alcohol,
      :malic_acid,
      :ash,
      :alcalinity_of_ash,
      :n_magnesiums,
      :total_phenols,
      :total_flavonoids,
      :total_nonflavanoid_phenols,
      :total_proanthocyanins,
      :color_intensity,
      :hue,
      :optical_nucleic_acid_concentration,
      :n_prolines
    )

    def initialize
      super()
      @metadata.id = "wine"
      @metadata.name = "Wine"
      @metadata.url = "http://archive.ics.uci.edu/ml/datasets/wine"
      # Kept as a lambda so that wine.names is only fetched when the
      # description is actually evaluated.
      @metadata.description = lambda do
        read_names
      end
    end

    # Yields one Record per data row.
    #
    # @yieldparam record [Record]
    # @return [Enumerator] when called without a block.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |fields|
          # Rows whose first field is nil (e.g. blank lines) carry no data.
          next if fields[0].nil?
          yield(Record.new(*fields))
        end
      end
    end

    private

    # Returns the content of wine.names, downloading it first if it is not
    # in the cache directory yet.
    def read_names
      names_path = cache_dir_path + "wine.names"
      unless names_path.exist?
        download(names_path,
                 "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names")
      end
      names_path.read
    end

    # Downloads wine.data if it is not cached yet and yields an open CSV
    # reader that converts numeric fields.
    def open_data
      data_path = cache_dir_path + "wine.data"
      unless data_path.exist?
        download(data_path,
                 "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data")
      end
      CSV.open(data_path, converters: [:numeric]) do |csv|
        yield(csv)
      end
    end
  end
end
data/test/helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "fileutils"
2
2
  require "pathname"
3
+ require "time"
3
4
 
4
5
  require "datasets"
5
6
 
@@ -0,0 +1,126 @@
1
# Tests for Datasets::Adult. Each "#each" test checks the total number of
# records plus the first and the last record of the selected part.
class AdultTest < Test::Unit::TestCase
  sub_test_case("train") do
    def setup
      @dataset = Datasets::Adult.new(type: :train)
    end

    test("#each") do
      records = @dataset.each.to_a
      assert_equal([
                     32561,
                     {
                       :age => 39,
                       :work_class => "State-gov",
                       :final_weight => 77516,
                       :education => "Bachelors",
                       :n_education_years => 13,
                       :marital_status => "Never-married",
                       :occupation => "Adm-clerical",
                       :relationship => "Not-in-family",
                       :race => "White",
                       :sex => "Male",
                       :capital_gain => 2174,
                       :capital_loss => 0,
                       :hours_per_week => 40,
                       :native_country => "United-States",
                       :label => "<=50K"
                     },
                     {
                       :age => 52,
                       :work_class => "Self-emp-inc",
                       :final_weight => 287927,
                       :education => "HS-grad",
                       :n_education_years => 9,
                       :marital_status => "Married-civ-spouse",
                       :occupation => "Exec-managerial",
                       :relationship => "Wife",
                       :race => "White",
                       :sex => "Female",
                       :capital_gain => 15024,
                       :capital_loss => 0,
                       :hours_per_week => 40,
                       :native_country => "United-States",
                       :label => ">50K"
                     }
                   ],
                   [
                     records.size,
                     records[0].to_h,
                     records[-1].to_h
                   ])
    end
  end

  sub_test_case("test") do
    def setup
      @dataset = Datasets::Adult.new(type: :test)
    end

    test("#each") do
      records = @dataset.each.to_a
      # Labels in the test part carry a trailing "." in the expected data.
      assert_equal([
                     16281,
                     {
                       :age => 25,
                       :work_class => "Private",
                       :final_weight => 226802,
                       :education => "11th",
                       :n_education_years => 7,
                       :marital_status => "Never-married",
                       :occupation => "Machine-op-inspct",
                       :relationship => "Own-child",
                       :race => "Black",
                       :sex => "Male",
                       :capital_gain => 0,
                       :capital_loss => 0,
                       :hours_per_week => 40,
                       :native_country => "United-States",
                       :label => "<=50K."
                     },
                     {
                       :age => 35,
                       :work_class => "Self-emp-inc",
                       :final_weight => 182148,
                       :education => "Bachelors",
                       :n_education_years => 13,
                       :marital_status => "Married-civ-spouse",
                       :occupation => "Exec-managerial",
                       :relationship => "Husband",
                       :race => "White",
                       :sex => "Male",
                       :capital_gain => 0,
                       :capital_loss => 0,
                       :hours_per_week => 60,
                       :native_country => "United-States",
                       :label => ">50K."
                     }
                   ],
                   [
                     records.size,
                     records[0].to_h,
                     records[-1].to_h
                   ])
    end
  end

  sub_test_case("#metadata") do
    def setup
      @dataset = Datasets::Adult.new(type: :train)
    end

    test("#description") do
      description = @dataset.metadata.description
      assert do
        description.start_with?("| This data was extracted from the census bureau database found at")
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
# Tests for Datasets::Dictionary, using the dictionary built from the
# :word column of the Penn Treebank test data.
class DictionaryTest < Test::Unit::TestCase
  def setup
    penn_treebank = Datasets::PennTreebank.new(type: :test)
    @dictionary = penn_treebank.to_table.dictionary_encode(:word)
  end

  test("#id") do
    assert_equal(95, @dictionary.id("<unk>"))
  end

  test("#value") do
    assert_equal("<unk>", @dictionary.value(95))
  end

  test("#ids") do
    assert_equal([0, 1, 2, 3, 4], @dictionary.ids.first(5))
  end

  test("#values") do
    assert_equal(["no", "it", "was", "n't", "black"],
                 @dictionary.values.first(5))
  end

  test("#each") do
    # first(5) already returns an Array of [id, value] pairs.
    assert_equal([
                   [0, "no"],
                   [1, "it"],
                   [2, "was"],
                   [3, "n't"],
                   [4, "black"],
                 ],
                 @dictionary.each.first(5))
  end

  test("#size") do
    assert_equal(6048, @dictionary.size)
  end

  test("#length") do
    assert_equal(@dictionary.size,
                 @dictionary.length)
  end
end
@@ -0,0 +1,137 @@
1
# Tests for Datasets::FashionMNIST. The "#each" tests check the record count
# plus, for the first and last record, the label, the pixel count and two
# 10-pixel slices (offsets 400 and 500).
class FashionMNISTTest < Test::Unit::TestCase
  sub_test_case("Normal") do
    sub_test_case("train") do
      def setup
        @dataset = Datasets::FashionMNIST.new(type: :train)
      end

      test("#each") do
        records = @dataset.each.to_a
        first_record = records[0]
        last_record = records[-1]
        expected = [
          60000,
          [
            9,
            784,
            [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
            [220, 232, 246, 0, 3, 202, 228, 224, 221, 211],
          ],
          [
            5,
            784,
            [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
            [180, 177, 177, 47, 101, 235, 194, 223, 232, 255],
          ],
        ]
        actual = [
          records.size,
          [
            first_record.label,
            first_record.pixels.size,
            first_record.pixels[400, 10],
            first_record.pixels[500, 10],
          ],
          [
            last_record.label,
            last_record.pixels.size,
            last_record.pixels[400, 10],
            last_record.pixels[500, 10],
          ],
        ]
        assert_equal(expected, actual)
      end

      test("#to_table") do
        table_data = @dataset.to_table
        expected = [
          [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
          [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
        ]
        actual = [
          table_data[:pixels][0][400, 10],
          table_data[:pixels][-1][400, 10],
        ]
        assert_equal(expected, actual)
      end

      sub_test_case("#metadata") do
        test("#id") do
          assert_equal("fashion-mnist-train", @dataset.metadata.id)
        end

        test("#name") do
          assert_equal("Fashion-MNIST: train", @dataset.metadata.name)
        end
      end
    end

    sub_test_case("test") do
      def setup
        @dataset = Datasets::FashionMNIST.new(type: :test)
      end

      test("#each") do
        records = @dataset.each.to_a
        first_record = records[0]
        last_record = records[-1]
        expected = [
          10000,
          [
            9,
            784,
            [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
            [172, 161, 189, 62, 0, 68, 94, 90, 111, 114],
          ],
          [
            5,
            784,
            [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
            [63, 74, 72, 0, 1, 0, 0, 0, 4, 85],
          ],
        ]
        actual = [
          records.size,
          [
            first_record.label,
            first_record.pixels.size,
            first_record.pixels[400, 10],
            first_record.pixels[500, 10],
          ],
          [
            last_record.label,
            last_record.pixels.size,
            last_record.pixels[400, 10],
            last_record.pixels[500, 10],
          ],
        ]
        assert_equal(expected, actual)
      end

      test("#to_table") do
        table_data = @dataset.to_table
        expected = [
          [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
          [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
        ]
        actual = [
          table_data[:pixels][0][400, 10],
          table_data[:pixels][-1][400, 10],
        ]
        assert_equal(expected, actual)
      end

      sub_test_case("#metadata") do
        test("#id") do
          assert_equal("fashion-mnist-test", @dataset.metadata.id)
        end

        test("#name") do
          assert_equal("Fashion-MNIST: test", @dataset.metadata.name)
        end
      end
    end
  end

  sub_test_case("Abnormal") do
    test("invalid type") do
      invalid_type = :invalid
      message = "Please set type :train or :test: #{invalid_type.inspect}"
      assert_raise(ArgumentError.new(message)) do
        Datasets::FashionMNIST.new(type: invalid_type)
      end
    end
  end
end
data/test/test-mnist.rb CHANGED
@@ -1,100 +1,125 @@
1
1
  class MNISTTest < Test::Unit::TestCase
2
- include Helper::Sandbox
3
-
4
2
  sub_test_case("Normal") do
5
- def setup_data
6
- setup_sandbox
7
-
8
- def @dataset.cache_dir_path
9
- @cache_dir_path
10
- end
11
-
12
- def @dataset.cache_dir_path=(path)
13
- @cache_dir_path = path
14
- end
15
- @dataset.cache_dir_path = @tmp_dir
16
-
17
- def @dataset.download(output_path, url)
18
- image_magic_number = 2051
19
- label_magic_number = 2049
20
- n_image, image_size_x, image_size_y, label = 10, 28, 28, 1
21
-
22
- Zlib::GzipWriter.open(output_path) do |gz|
23
- if output_path.basename.to_s.include?("-images-")
24
- image_data = ([image_magic_number, n_image]).pack('N2') +
25
- ([image_size_x,image_size_y]).pack('N2') +
26
- ([0] * image_size_x * image_size_y).pack("C*") * n_image
27
- gz.puts(image_data)
28
- else
29
- label_data = ([label_magic_number, n_image]).pack('N2') +
30
- ([label] * n_image).pack("C*")
31
- gz.puts(label_data)
32
- end
33
- end
34
- end
35
- end
36
-
37
- def teardown
38
- teardown_sandbox
39
- end
40
-
41
3
  sub_test_case("train") do
42
4
  def setup
43
5
  @dataset = Datasets::MNIST.new(type: :train)
44
- setup_data()
45
6
  end
46
7
 
47
8
  test("#each") do
48
- raw_dataset = @dataset.collect do |record|
49
- {
50
- :label => record.label,
51
- :pixels => record.pixels
52
- }
53
- end
54
-
9
+ records = @dataset.each.to_a
55
10
  assert_equal([
56
- {
57
- :label => 1,
58
- :pixels => [0] * 28 * 28
59
- }
60
- ] * 10,
61
- raw_dataset)
11
+ 60000,
12
+ [
13
+ 5,
14
+ 784,
15
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
16
+ [0, 0, 0, 0, 0, 81, 240, 253, 253, 119],
17
+ ],
18
+ [8,
19
+ 784,
20
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
21
+ [0, 0, 190, 196, 14, 2, 97, 254, 252, 146],
22
+ ],
23
+ ],
24
+ [
25
+ records.size,
26
+ [
27
+ records[0].label,
28
+ records[0].pixels.size,
29
+ records[0].pixels[200, 10],
30
+ records[0].pixels[400, 10],
31
+ ],
32
+ [
33
+ records[-1].label,
34
+ records[-1].pixels.size,
35
+ records[-1].pixels[200, 10],
36
+ records[-1].pixels[400, 10],
37
+ ],
38
+ ])
62
39
  end
63
40
 
64
41
  test("#to_table") do
65
42
  table_data = @dataset.to_table
66
- assert_equal([[0] * 28 * 28] * 10,
67
- table_data[:pixels])
43
+ assert_equal([
44
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
45
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
46
+ ],
47
+ [
48
+ table_data[:pixels][0][200, 10],
49
+ table_data[:pixels][-1][200, 10],
50
+ ])
51
+ end
52
+
53
+ sub_test_case("#metadata") do
54
+ test("#id") do
55
+ assert_equal("mnist-train", @dataset.metadata.id)
56
+ end
57
+
58
+ test("#name") do
59
+ assert_equal("MNIST: train", @dataset.metadata.name)
60
+ end
68
61
  end
69
62
  end
70
63
 
71
64
  sub_test_case("test") do
72
65
  def setup
73
66
  @dataset = Datasets::MNIST.new(type: :test)
74
- setup_data()
75
67
  end
76
68
 
77
69
  test("#each") do
78
- raw_dataset = @dataset.collect do |record|
79
- {
80
- :label => record.label,
81
- :pixels => record.pixels
82
- }
83
- end
84
-
70
+ records = @dataset.each.to_a
85
71
  assert_equal([
86
- {
87
- :label => 1,
88
- :pixels => [0] * 28 * 28
89
- }
90
- ] * 10,
91
- raw_dataset)
72
+ 10000,
73
+ [
74
+ 7,
75
+ 784,
76
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
77
+ [0, 0, 0, 0, 0, 0, 0, 0, 59, 249],
78
+ ],
79
+ [
80
+ 6,
81
+ 784,
82
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
83
+ [253, 253, 132, 64, 0, 0, 18, 43, 157, 171],
84
+ ],
85
+ ],
86
+ [
87
+ records.size,
88
+ [
89
+ records[0].label,
90
+ records[0].pixels.size,
91
+ records[0].pixels[200, 10],
92
+ records[0].pixels[400, 10],
93
+ ],
94
+ [
95
+ records[-1].label,
96
+ records[-1].pixels.size,
97
+ records[-1].pixels[200, 10],
98
+ records[-1].pixels[400, 10],
99
+ ],
100
+ ])
92
101
  end
93
102
 
94
103
  test("#to_table") do
95
104
  table_data = @dataset.to_table
96
- assert_equal([[0] * 28 * 28] * 10,
97
- table_data[:pixels])
105
+ assert_equal([
106
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
107
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
108
+ ],
109
+ [
110
+ table_data[:pixels][0][200, 10],
111
+ table_data[:pixels][-1][200, 10],
112
+ ])
113
+ end
114
+
115
+ sub_test_case("#metadata") do
116
+ test("#id") do
117
+ assert_equal("mnist-test", @dataset.metadata.id)
118
+ end
119
+
120
+ test("#name") do
121
+ assert_equal("MNIST: test", @dataset.metadata.name)
122
+ end
98
123
  end
99
124
  end
100
125
  end
@@ -9,8 +9,8 @@ class PennTreebankTest < Test::Unit::TestCase
9
9
  records = dataset.to_a
10
10
  assert_equal([
11
11
  887521,
12
- record("aer", 0),
13
- record("<unk>", 25),
12
+ record("aer"),
13
+ record("<unk>"),
14
14
  ],
15
15
  [
16
16
  records.size,
@@ -24,8 +24,8 @@ class PennTreebankTest < Test::Unit::TestCase
24
24
  records = dataset.to_a
25
25
  assert_equal([
26
26
  78669,
27
- record("no", 0),
28
- record("us", 953),
27
+ record("no"),
28
+ record("us"),
29
29
  ],
30
30
  [
31
31
  records.size,
@@ -39,8 +39,8 @@ class PennTreebankTest < Test::Unit::TestCase
39
39
  records = dataset.to_a
40
40
  assert_equal([
41
41
  70390,
42
- record("consumers", 0),
43
- record("N", 28),
42
+ record("consumers"),
43
+ record("N"),
44
44
  ],
45
45
  [
46
46
  records.size,
data/test/test-table.rb CHANGED
@@ -8,6 +8,26 @@ class TableTest < Test::Unit::TestCase
8
8
  @table[:petal_length].first(5))
9
9
  end
10
10
 
11
+ test("#dictionary_encode") do
12
+ assert_equal([
13
+ [0, "Iris-setosa"],
14
+ [1, "Iris-versicolor"],
15
+ [2, "Iris-virginica"],
16
+ ],
17
+ @table.dictionary_encode(:label).to_a)
18
+ end
19
+
20
+ test("#label_encode") do
21
+ label_encoded_labels = @table.label_encode(:label)
22
+ labels = @table[:label]
23
+ assert_equal([0, 1, 2],
24
+ [
25
+ label_encoded_labels[labels.find_index("Iris-setosa")],
26
+ label_encoded_labels[labels.find_index("Iris-versicolor")],
27
+ label_encoded_labels[labels.find_index("Iris-virginica")],
28
+ ])
29
+ end
30
+
11
31
  sub_test_case("#fetch_values") do
12
32
  test("found") do
13
33
  values = @table.fetch_values(:petal_length, :petal_width)
@@ -44,7 +64,7 @@ class TableTest < Test::Unit::TestCase
44
64
  shorten_hash[name] = values.first(5)
45
65
  end
46
66
  assert_equal({
47
- :class => ["Iris-setosa"] * 5,
67
+ :label => ["Iris-setosa"] * 5,
48
68
  :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
49
69
  :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
50
70
  :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
@@ -59,7 +79,7 @@ class TableTest < Test::Unit::TestCase
59
79
  shorten_hash[name] = values.first(5)
60
80
  end
61
81
  assert_equal({
62
- :class => ["Iris-setosa"] * 5,
82
+ :label => ["Iris-setosa"] * 5,
63
83
  :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
64
84
  :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
65
85
  :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
data/test/test-wine.rb ADDED
@@ -0,0 +1,58 @@
1
# Tests for Datasets::Wine: record count, first and last record, and the
# beginning of the downloaded description.
class WineTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::Wine.new
  end

  test("#each") do
    records = @dataset.each.to_a
    expected_first = {
      :alcalinity_of_ash => 15.6,
      :alcohol => 14.23,
      :ash => 2.43,
      :label => 1,
      :color_intensity => 5.64,
      :hue => 1.04,
      :malic_acid => 1.71,
      :total_flavonoids => 3.06,
      :n_magnesiums => 127,
      :total_nonflavanoid_phenols => 0.28,
      :total_proanthocyanins => 2.29,
      :n_prolines => 1065,
      :optical_nucleic_acid_concentration => 3.92,
      :total_phenols => 2.8,
    }
    expected_last = {
      :alcalinity_of_ash => 24.5,
      :alcohol => 14.13,
      :ash => 2.74,
      :label => 3,
      :color_intensity => 9.2,
      :hue => 0.61,
      :malic_acid => 4.1,
      :total_flavonoids => 0.76,
      :n_magnesiums => 96,
      :total_nonflavanoid_phenols => 0.56,
      :total_proanthocyanins => 1.35,
      :n_prolines => 560,
      :optical_nucleic_acid_concentration => 1.6,
      :total_phenols => 2.05,
    }
    assert_equal([
                   178,
                   expected_first,
                   expected_last,
                 ],
                 [
                   records.size,
                   records[0].to_h,
                   records[-1].to_h,
                 ])
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert do
        description.start_with?("1. Title of Database: Wine recognition data")
      end
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-07-25 00:00:00.000000000 Z
12
+ date: 2018-11-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -99,9 +99,12 @@ files:
99
99
  - Rakefile
100
100
  - doc/text/news.md
101
101
  - lib/datasets.rb
102
+ - lib/datasets/adult.rb
102
103
  - lib/datasets/cifar.rb
103
104
  - lib/datasets/dataset.rb
105
+ - lib/datasets/dictionary.rb
104
106
  - lib/datasets/downloader.rb
107
+ - lib/datasets/fashion-mnist.rb
105
108
  - lib/datasets/iris.rb
106
109
  - lib/datasets/metadata.rb
107
110
  - lib/datasets/mnist.rb
@@ -109,15 +112,20 @@ files:
109
112
  - lib/datasets/table.rb
110
113
  - lib/datasets/version.rb
111
114
  - lib/datasets/wikipedia.rb
115
+ - lib/datasets/wine.rb
112
116
  - red-datasets.gemspec
113
117
  - test/helper.rb
114
118
  - test/run-test.rb
119
+ - test/test-adult.rb
115
120
  - test/test-cifar.rb
121
+ - test/test-dictionary.rb
122
+ - test/test-fashion-mnist.rb
116
123
  - test/test-iris.rb
117
124
  - test/test-mnist.rb
118
125
  - test/test-penn-treebank.rb
119
126
  - test/test-table.rb
120
127
  - test/test-wikipedia.rb
128
+ - test/test-wine.rb
121
129
  homepage: https://github.com/red-data-tools/red-datasets
122
130
  licenses:
123
131
  - MIT
@@ -138,16 +146,20 @@ required_rubygems_version: !ruby/object:Gem::Requirement
138
146
  version: '0'
139
147
  requirements: []
140
148
  rubyforge_project:
141
- rubygems_version: 3.0.0.beta1
149
+ rubygems_version: 3.0.0.beta2
142
150
  signing_key:
143
151
  specification_version: 4
144
152
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
145
153
  test_files:
154
+ - test/test-wine.rb
146
155
  - test/run-test.rb
147
156
  - test/test-cifar.rb
157
+ - test/test-fashion-mnist.rb
148
158
  - test/test-wikipedia.rb
149
159
  - test/test-iris.rb
150
160
  - test/helper.rb
151
161
  - test/test-mnist.rb
152
162
  - test/test-table.rb
163
+ - test/test-adult.rb
153
164
  - test/test-penn-treebank.rb
165
+ - test/test-dictionary.rb