red-datasets 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3b96f5bf8fb7d8d7280451086dda394b65c42023b15ae077167e2d320c4361c1
4
- data.tar.gz: 96f7936d62d70749f92d3bdd1d7ef2d79cfff3091e7dae8221d6a0537dbd6d7b
3
+ metadata.gz: 222271b814e3a5ce23b5e0dd1d2578bffb84afdab10110b0869985c6056bfd3b
4
+ data.tar.gz: ac30931b3317ab04afd394b28a45a9206c784d78b3bcaf98fc3a2a48227c7930
5
5
  SHA512:
6
- metadata.gz: 859196aa39020d924fa7af4df6d96c110f41ac2b90a39dc89ed6935fc64e857b2bffb5776a366660ab61c55a96dd35b9bd6663ec23c7ee4249cae3103bc0a2aa
7
- data.tar.gz: b07ec53917af58e737058c504685d283850e072f0794c457bd961d39b9815c85b2fc2a9bed4de2a643675dc0e0f7bb2077b4c41b2c28c9c94f948a532baae6bb
6
+ metadata.gz: 8a94a3d66baaed4948904e97dc53100d73ae96c528c09b02252caabd05b8545587abf6fbcba3a578725812327a9a2c8827bbb7e283ccd3d7e66753bf30035e2e
7
+ data.tar.gz: 2ab44b5aa3ee5da0ac8e8307546c71942938de4497bfec05fc929715a4e5ef6df1cb091bce0d5f12978582d2c9fa7eaffff9edd54be0d845627dccfce42a63dd
data/README.md CHANGED
@@ -30,7 +30,7 @@ iris.each do |record|
30
30
  record.sepal_width,
31
31
  record.petal_length,
32
32
  record.petal_width,
33
- record.class,
33
+ record.label,
34
34
  ]
35
35
  end
36
36
  # => [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
@@ -48,7 +48,7 @@ p iris_hash[:petal_length]
48
48
  # => [1.4, 1.4, .. , 4.7, ..
49
49
  p iris_hash[:petal_width]
50
50
  # => [0.2, 0.2, .. , 1.4, ..
51
- p iris_hash[:class]
51
+ p iris_hash[:label]
52
52
  # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
53
53
 
54
54
 
@@ -60,7 +60,7 @@ p iris_table.fetch_values(:sepal_length, :sepal_width, :petal_length, :petal_wid
60
60
  [7.0, 3.2, 4.7, 1.4],
61
61
  :
62
62
 
63
- p iris_table[:class]
63
+ p iris_table[:label]
64
64
  # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
65
65
  ```
66
66
 
data/doc/text/news.md CHANGED
@@ -1,5 +1,36 @@
1
1
  # News
2
2
 
3
+ ## 0.0.7 - 2018-11-21
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::Table#dictionary_encode`: Added.
8
+ [GitHub#22]
9
+
10
+ * `Datasets::Table#label_encode`: Added.
11
+
12
+ * `Datasets::Dictionary`: Added.
13
+
14
+ * `Datasets::Wine`: Added.
15
+ [GitHub#26][Patch by Ryuta Suzuki]
16
+
17
+ * `Datasets::FashionMNIST`: Added.
18
+ [GitHub#27][Patch by chimame]
19
+
20
+ * `Datasets::Iris::Record#label`: Renamed from `#class`. This is an
21
+ incompatible change.
22
+
23
+ * `Datasets::Adult`: Added.
24
+ [GitHub#30][Patch by Yasuo Honda]
25
+
26
+ ### Thanks
27
+
28
+ * Ryuta Suzuki
29
+
30
+ * chimame
31
+
32
+ * Yasuo Honda
33
+
3
34
  ## 0.0.6 - 2018-07-25
4
35
 
5
36
  ### Improvements
data/lib/datasets.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  require "datasets/version"
2
2
 
3
+ require "datasets/adult"
3
4
  require "datasets/cifar"
5
+ require "datasets/fashion-mnist"
4
6
  require "datasets/iris"
5
7
  require "datasets/mnist"
6
8
  require "datasets/penn-treebank"
7
9
  require "datasets/wikipedia"
10
+ require "datasets/wine"
@@ -0,0 +1,83 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Adult < Dataset
7
+ Record = Struct.new(
8
+ :age,
9
+ :work_class,
10
+ :final_weight,
11
+ :education,
12
+ :n_education_years,
13
+ :marital_status,
14
+ :occupation,
15
+ :relationship,
16
+ :race,
17
+ :sex,
18
+ :capital_gain,
19
+ :capital_loss,
20
+ :hours_per_week,
21
+ :native_country,
22
+ :label
23
+ )
24
+
25
+ def initialize(type: :train)
26
+ unless [:train, :test].include?(type)
27
+ raise ArgumentError, 'Please set type :train or :test'
28
+ end
29
+
30
+ super()
31
+ @type = type
32
+ @metadata.id = "adult-#{@type}"
33
+ @metadata.name = "Adult: #{@type}"
34
+ @metadata.url = "http://archive.ics.uci.edu/ml/datasets/adult"
35
+ @metadata.description = lambda do
36
+ read_names
37
+ end
38
+ end
39
+
40
+ def each
41
+ return to_enum(__method__) unless block_given?
42
+
43
+ open_data do |csv|
44
+ csv.each do |row|
45
+ next if row[0].nil?
46
+ record = Record.new(*row)
47
+ yield(record)
48
+ end
49
+ end
50
+ end
51
+
52
+ private
53
+ def open_data
54
+ case @type
55
+ when :train
56
+ ext = "data"
57
+ when :test
58
+ ext = "test"
59
+ end
60
+ data_path = cache_dir_path + "adult-#{ext}.csv"
61
+ unless data_path.exist?
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
63
+ download(data_path, data_url)
64
+ end
65
+ CSV.open(data_path,
66
+ {
67
+ converters: [:numeric, lambda {|f| f.strip}],
68
+ skip_lines: /\A\|/,
69
+ }) do |csv|
70
+ yield(csv)
71
+ end
72
+ end
73
+
74
+ def read_names
75
+ names_path = cache_dir_path + "adult.names"
76
+ unless names_path.exist?
77
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
78
+ download(names_path, names_url)
79
+ end
80
+ names_path.read
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,59 @@
1
+ module Datasets
2
+ class Dictionary
3
+ include Enumerable
4
+
5
+ def initialize(values)
6
+ build_dictionary(values)
7
+ end
8
+
9
+ def id(value)
10
+ @value_to_id[value]
11
+ end
12
+
13
+ def value(id)
14
+ @id_to_value[id]
15
+ end
16
+
17
+ def ids
18
+ @id_to_value.keys
19
+ end
20
+
21
+ def values
22
+ @id_to_value.values
23
+ end
24
+
25
+ def each(&block)
26
+ @id_to_value.each(&block)
27
+ end
28
+
29
+ def size
30
+ @id_to_value.size
31
+ end
32
+ alias_method :length, :size
33
+
34
+ def encode(values)
35
+ values.collect do |value|
36
+ id(value)
37
+ end
38
+ end
39
+
40
+ def decode(ids)
41
+ ids.collect do |id|
42
+ value(id)
43
+ end
44
+ end
45
+
46
+ private
47
+ def build_dictionary(values)
48
+ @id_to_value = {}
49
+ @value_to_id = {}
50
+ id = 0
51
+ values.each do |value|
52
+ next if @value_to_id.key?(value)
53
+ @id_to_value[id] = value
54
+ @value_to_id[value] = id
55
+ id += 1
56
+ end
57
+ end
58
+ end
59
+ end
@@ -3,7 +3,7 @@ begin
3
3
  require "io/console"
4
4
  rescue LoadError
5
5
  end
6
- require "open-uri"
6
+ require "net/http"
7
7
  require "pathname"
8
8
 
9
9
  module Datasets
@@ -15,84 +15,57 @@ module Datasets
15
15
  url = URI.parse(url)
16
16
  end
17
17
  @url = url
18
- @url.extend(CurrentBufferReadable)
18
+ unless @url.is_a?(URI::HTTP)
19
+ raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
20
+ end
19
21
  end
20
22
 
21
23
  def download(output_path)
22
24
  output_path.parent.mkpath
23
25
 
26
+ headers = {"User-Agent" => "Red Datasets/#{VERSION}"}
24
27
  start = nil
25
28
  partial_output_path = Pathname.new("#{output_path}.partial")
26
29
  if partial_output_path.exist?
27
30
  start = partial_output_path.size
31
+ headers["Range"] = "bytes=#{start}-"
28
32
  end
29
33
 
30
- progress_reporter = nil
31
- content_length_proc = lambda do |content_length|
32
- base_name = @url.path.split("/").last
33
- size_max = content_length
34
- size_max += start if start
35
- progress_reporter = ProgressReporter.new(base_name, size_max)
36
- end
37
- progress_proc = lambda do |size_current|
38
- size_current += start if start
39
- progress_reporter.report(size_current) if progress_reporter
40
- end
41
- options = {
42
- :content_length_proc => content_length_proc,
43
- :progress_proc => progress_proc,
44
- }
45
- if start
46
- options["Range"] = "bytes=#{start}-"
47
- end
34
+ Net::HTTP.start(@url.hostname,
35
+ @url.port,
36
+ :use_ssl => (@url.scheme == "https")) do |http|
37
+ request = Net::HTTP::Get.new(@url.path, headers)
38
+ http.request(request) do |response|
39
+ case response
40
+ when Net::HTTPPartialContent
41
+ mode = "ab"
42
+ when Net::HTTPSuccess
43
+ start = nil
44
+ mode = "wb"
45
+ else
46
+ break
47
+ end
48
48
 
49
- begin
50
- @url.open(options) do |input|
51
- copy_stream(input, partial_output_path)
52
- end
53
- rescue Interrupt, Net::ReadTimeout
54
- if @url.current_buffer
55
- input = @url.current_buffer.io
56
- input.rewind
57
- copy_stream(input, partial_output_path)
49
+ base_name = @url.path.split("/").last
50
+ size_current = 0
51
+ size_max = response.content_length
52
+ if start
53
+ size_current += start
54
+ size_max += start
55
+ end
56
+ progress_reporter = ProgressReporter.new(base_name, size_max)
57
+ partial_output_path.open(mode) do |output|
58
+ response.read_body do |chunk|
59
+ size_current += chunk.bytesize
60
+ progress_reporter.report(size_current)
61
+ output.write(chunk)
62
+ end
63
+ end
58
64
  end
59
- raise
60
65
  end
61
-
62
66
  FileUtils.mv(partial_output_path, output_path)
63
67
  end
64
68
 
65
- private
66
- def copy_stream(input, partial_output_path)
67
- if partial_output_path.exist?
68
- # TODO: It's better that we use "206 Partial Content" response
69
- # to detect partial response.
70
- partial_head = partial_output_path.open("rb") do |partial_output|
71
- partial_output.read(256)
72
- end
73
- input_head = input.read(partial_head.bytesize)
74
- input.rewind
75
- if partial_head == input_head
76
- mode = "wb"
77
- else
78
- mode = "ab"
79
- end
80
- else
81
- mode = "wb"
82
- end
83
- partial_output_path.open(mode) do |partial_output|
84
- IO.copy_stream(input, partial_output)
85
- end
86
- end
87
-
88
- module CurrentBufferReadable
89
- attr_reader :current_buffer
90
- def buffer_open(buffer, proxy, options)
91
- @current_buffer = buffer
92
- super
93
- end
94
- end
95
-
96
69
  class ProgressReporter
97
70
  def initialize(base_name, size_max)
98
71
  @base_name = base_name
@@ -0,0 +1,12 @@
1
+ require_relative 'mnist'
2
+
3
+ module Datasets
4
+ class FashionMNIST < MNIST
5
+ BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
6
+
7
+ private
8
+ def dataset_name
9
+ "Fashion-MNIST"
10
+ end
11
+ end
12
+ end
data/lib/datasets/iris.rb CHANGED
@@ -8,7 +8,7 @@ module Datasets
8
8
  :sepal_width,
9
9
  :petal_length,
10
10
  :petal_width,
11
- :class)
11
+ :label)
12
12
 
13
13
  def initialize
14
14
  super()
@@ -6,6 +6,7 @@ class SetTypeError < StandardError; end
6
6
 
7
7
  module Datasets
8
8
  class MNIST < Dataset
9
+ BASE_URL = "http://yann.lecun.com/exdb/mnist/"
9
10
 
10
11
  class Record < Struct.new(:data, :label)
11
12
  def pixels
@@ -26,9 +27,9 @@ module Datasets
26
27
 
27
28
  super()
28
29
 
29
- @metadata.id = "mnist-#{type}"
30
- @metadata.name = "MNIST: #{type}"
31
- @metadata.url = "http://yann.lecun.com/exdb/mnist/"
30
+ @metadata.id = "#{dataset_name.downcase}-#{type}"
31
+ @metadata.name = "#{dataset_name}: #{type}"
32
+ @metadata.url = self.class::BASE_URL
32
33
  @type = type
33
34
 
34
35
  case type
@@ -44,7 +45,7 @@ module Datasets
44
45
 
45
46
  image_path = cache_dir_path + target_file(:image)
46
47
  label_path = cache_dir_path + target_file(:label)
47
- base_url = "http://yann.lecun.com/exdb/mnist/"
48
+ base_url = self.class::BASE_URL
48
49
 
49
50
  unless image_path.exist?
50
51
  download(image_path, base_url + target_file(:image))
@@ -66,7 +67,7 @@ module Datasets
66
67
  n_bytes = n_uint32s * 4
67
68
  mnist_magic_number = 2051
68
69
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
69
- raise 'This is not MNIST image file' if magic != mnist_magic_number
70
+ raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
70
71
  n_images.times do |i|
71
72
  data = f.read(n_rows * n_cols)
72
73
  label = labels[i]
@@ -100,9 +101,13 @@ module Datasets
100
101
  n_bytes = n_uint32s * 2
101
102
  mnist_magic_number = 2049
102
103
  magic, n_labels = f.read(n_bytes).unpack('N2')
103
- raise 'This is not MNIST label file' if magic != mnist_magic_number
104
+ raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
105
  f.read(n_labels).unpack('C*')
105
106
  end
106
107
  end
108
+
109
+ def dataset_name
110
+ "MNIST"
111
+ end
107
112
  end
108
113
  end
@@ -2,7 +2,7 @@ require_relative "dataset"
2
2
 
3
3
  module Datasets
4
4
  class PennTreebank < Dataset
5
- Record = Struct.new(:word, :id)
5
+ Record = Struct.new(:word)
6
6
 
7
7
  DESCRIPTION = <<~DESC
8
8
  `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
@@ -46,17 +46,10 @@ module Datasets
46
46
 
47
47
  private
48
48
  def parse_data(data_path)
49
- index = 0
50
- vocabulary = {}
51
49
  File.open(data_path) do |f|
52
50
  f.each_line do |line|
53
51
  line.split.each do |word|
54
- word = word.strip
55
- unless vocabulary.key?(word)
56
- vocabulary[word] = index
57
- index += 1
58
- end
59
- yield(Record.new(word, vocabulary[word]))
52
+ yield(Record.new(word.strip))
60
53
  end
61
54
  end
62
55
  end
@@ -1,9 +1,12 @@
1
+ require "datasets/dictionary"
2
+
1
3
  module Datasets
2
4
  class Table
3
5
  include Enumerable
4
6
 
5
7
  def initialize(dataset)
6
8
  @dataset = dataset
9
+ @dictionaries = {}
7
10
  end
8
11
 
9
12
  def each(&block)
@@ -11,7 +14,16 @@ module Datasets
11
14
  end
12
15
 
13
16
  def [](name)
14
- columner_data[name.to_sym]
17
+ columner_data[normalize_name(name)]
18
+ end
19
+
20
+ def dictionary_encode(name)
21
+ @dictionaries[normalize_name(name)] ||= Dictionary.new(self[name])
22
+ end
23
+
24
+ def label_encode(name)
25
+ dictionary = dictionary_encode(name)
26
+ dictionary.encode(self[name])
15
27
  end
16
28
 
17
29
  def fetch_values(*keys)
@@ -55,5 +67,9 @@ module Datasets
55
67
  def columner_data
56
68
  @columns ||= to_h
57
69
  end
70
+
71
+ def normalize_name(name)
72
+ name.to_sym
73
+ end
58
74
  end
59
75
  end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.6"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -0,0 +1,64 @@
1
+ require 'csv'
2
+
3
+ require_relative 'dataset'
4
+
5
+ module Datasets
6
+ class Wine < Dataset
7
+ Record = Struct.new(:label,
8
+ :alcohol,
9
+ :malic_acid,
10
+ :ash,
11
+ :alcalinity_of_ash,
12
+ :n_magnesiums,
13
+ :total_phenols,
14
+ :total_flavonoids,
15
+ :total_nonflavanoid_phenols,
16
+ :total_proanthocyanins,
17
+ :color_intensity,
18
+ :hue,
19
+ :optical_nucleic_acid_concentration,
20
+ :n_prolines)
21
+
22
+ def initialize
23
+ super
24
+ @metadata.id = 'wine'
25
+ @metadata.name = 'Wine'
26
+ @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
27
+ @metadata.description = -> { read_names }
28
+ end
29
+
30
+ def each
31
+ return to_enum(__method__) unless block_given?
32
+
33
+ open_data do |csv|
34
+ csv.each do |row|
35
+ next if row[0].nil?
36
+ record = Record.new(*row)
37
+ yield(record)
38
+ end
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def read_names
45
+ names_path = cache_dir_path + 'wine.names'
46
+ unless names_path.exist?
47
+ names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
48
+ download(names_path, names_url)
49
+ end
50
+ names_path.read
51
+ end
52
+
53
+ def open_data
54
+ data_path = cache_dir_path + 'wine.data'
55
+ unless data_path.exist?
56
+ data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
57
+ download(data_path, data_url)
58
+ end
59
+ CSV.open(data_path, converters: %i[numeric]) do |csv|
60
+ yield(csv)
61
+ end
62
+ end
63
+ end
64
+ end
data/test/helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "fileutils"
2
2
  require "pathname"
3
+ require "time"
3
4
 
4
5
  require "datasets"
5
6
 
@@ -0,0 +1,126 @@
1
+ class AdultTest < Test::Unit::TestCase
2
+ sub_test_case("train") do
3
+ def setup
4
+ @dataset = Datasets::Adult.new(type: :train)
5
+ end
6
+
7
+ def record(*args)
8
+ Datasets::Adult::Record.new(*args)
9
+ end
10
+
11
+ test("#each") do
12
+ records = @dataset.each.to_a
13
+ assert_equal([
14
+ 32561,
15
+ {
16
+ :age => 39,
17
+ :work_class => "State-gov",
18
+ :final_weight => 77516,
19
+ :education => "Bachelors",
20
+ :n_education_years => 13,
21
+ :marital_status => "Never-married",
22
+ :occupation => "Adm-clerical",
23
+ :relationship => "Not-in-family",
24
+ :race => "White",
25
+ :sex => "Male",
26
+ :capital_gain => 2174,
27
+ :capital_loss => 0,
28
+ :hours_per_week => 40,
29
+ :native_country => "United-States",
30
+ :label => "<=50K"
31
+ },
32
+ {
33
+ :age => 52,
34
+ :work_class => "Self-emp-inc",
35
+ :final_weight => 287927,
36
+ :education => "HS-grad",
37
+ :n_education_years => 9,
38
+ :marital_status => "Married-civ-spouse",
39
+ :occupation => "Exec-managerial",
40
+ :relationship => "Wife",
41
+ :race => "White",
42
+ :sex => "Female",
43
+ :capital_gain => 15024,
44
+ :capital_loss => 0,
45
+ :hours_per_week => 40,
46
+ :native_country => "United-States",
47
+ :label => ">50K"
48
+ }
49
+ ],
50
+ [
51
+ records.size,
52
+ records[0].to_h,
53
+ records[-1].to_h
54
+ ])
55
+ end
56
+ end
57
+
58
+ sub_test_case("test") do
59
+ def setup
60
+ @dataset = Datasets::Adult.new(type: :test)
61
+ end
62
+
63
+ def record(*args)
64
+ Datasets::Adult::Record.new(*args)
65
+ end
66
+
67
+ test("#each") do
68
+ records = @dataset.each.to_a
69
+ assert_equal([
70
+ 16281,
71
+ {
72
+ :age => 25,
73
+ :work_class => "Private",
74
+ :final_weight => 226802,
75
+ :education => "11th",
76
+ :n_education_years => 7,
77
+ :marital_status => "Never-married",
78
+ :occupation => "Machine-op-inspct",
79
+ :relationship => "Own-child",
80
+ :race => "Black",
81
+ :sex => "Male",
82
+ :capital_gain => 0,
83
+ :capital_loss => 0,
84
+ :hours_per_week => 40,
85
+ :native_country => "United-States",
86
+ :label => "<=50K."
87
+ },
88
+ {
89
+ :age => 35,
90
+ :work_class => "Self-emp-inc",
91
+ :final_weight => 182148,
92
+ :education => "Bachelors",
93
+ :n_education_years => 13,
94
+ :marital_status => "Married-civ-spouse",
95
+ :occupation => "Exec-managerial",
96
+ :relationship => "Husband",
97
+ :race => "White",
98
+ :sex => "Male",
99
+ :capital_gain => 0,
100
+ :capital_loss => 0,
101
+ :hours_per_week => 60,
102
+ :native_country => "United-States",
103
+ :label => ">50K."
104
+ }
105
+ ],
106
+ [
107
+ records.size,
108
+ records[0].to_h,
109
+ records[-1].to_h
110
+ ])
111
+ end
112
+ end
113
+
114
+ sub_test_case("#metadata") do
115
+ def setup
116
+ @dataset = Datasets::Adult.new(type: :train)
117
+ end
118
+
119
+ test("#description") do
120
+ description = @dataset.metadata.description
121
+ assert do
122
+ description.start_with?("| This data was extracted from the census bureau database found at")
123
+ end
124
+ end
125
+ end
126
+ end
@@ -0,0 +1,43 @@
1
+ class DictionaryTest < Test::Unit::TestCase
2
+ def setup
3
+ penn_treebank = Datasets::PennTreebank.new(type: :test)
4
+ @dictionary = penn_treebank.to_table.dictionary_encode(:word)
5
+ end
6
+
7
+ test("#id") do
8
+ assert_equal(95, @dictionary.id("<unk>"))
9
+ end
10
+
11
+ test("#value") do
12
+ assert_equal("<unk>", @dictionary.value(95))
13
+ end
14
+
15
+ test("#ids") do
16
+ assert_equal([0, 1, 2, 3, 4], @dictionary.ids.first(5))
17
+ end
18
+
19
+ test("#values") do
20
+ assert_equal(["no", "it", "was", "n't", "black"],
21
+ @dictionary.values.first(5))
22
+ end
23
+
24
+ test("#each") do
25
+ assert_equal([
26
+ [0, "no"],
27
+ [1, "it"],
28
+ [2, "was"],
29
+ [3, "n't"],
30
+ [4, "black"],
31
+ ],
32
+ @dictionary.each.first(5).to_a)
33
+ end
34
+
35
+ test("#size") do
36
+ assert_equal(6048, @dictionary.size)
37
+ end
38
+
39
+ test("#length") do
40
+ assert_equal(@dictionary.size,
41
+ @dictionary.length)
42
+ end
43
+ end
@@ -0,0 +1,137 @@
1
+ class FashionMNISTTest < Test::Unit::TestCase
2
+ sub_test_case("Normal") do
3
+ sub_test_case("train") do
4
+ def setup
5
+ @dataset = Datasets::FashionMNIST.new(type: :train)
6
+ end
7
+
8
+ test("#each") do
9
+ records = @dataset.each.to_a
10
+ assert_equal([
11
+ 60000,
12
+ [
13
+ 9,
14
+ 784,
15
+ [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
16
+ [220, 232, 246, 0, 3, 202, 228, 224, 221, 211],
17
+ ],
18
+ [
19
+ 5,
20
+ 784,
21
+ [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
22
+ [180, 177, 177, 47, 101, 235, 194, 223, 232, 255],
23
+ ],
24
+ ],
25
+ [
26
+ records.size,
27
+ [
28
+ records[0].label,
29
+ records[0].pixels.size,
30
+ records[0].pixels[400, 10],
31
+ records[0].pixels[500, 10],
32
+ ],
33
+ [
34
+ records[-1].label,
35
+ records[-1].pixels.size,
36
+ records[-1].pixels[400, 10],
37
+ records[-1].pixels[500, 10],
38
+ ],
39
+ ])
40
+ end
41
+
42
+ test("#to_table") do
43
+ table_data = @dataset.to_table
44
+ assert_equal([
45
+ [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
46
+ [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
47
+ ],
48
+ [
49
+ table_data[:pixels][0][400, 10],
50
+ table_data[:pixels][-1][400, 10],
51
+ ])
52
+ end
53
+
54
+ sub_test_case("#metadata") do
55
+ test("#id") do
56
+ assert_equal("fashion-mnist-train", @dataset.metadata.id)
57
+ end
58
+
59
+ test("#name") do
60
+ assert_equal("Fashion-MNIST: train", @dataset.metadata.name)
61
+ end
62
+ end
63
+ end
64
+
65
+ sub_test_case("test") do
66
+ def setup
67
+ @dataset = Datasets::FashionMNIST.new(type: :test)
68
+ end
69
+
70
+ test("#each") do
71
+ records = @dataset.each.to_a
72
+ assert_equal([
73
+ 10000,
74
+ [
75
+ 9,
76
+ 784,
77
+ [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
78
+ [172, 161, 189, 62, 0, 68, 94, 90, 111, 114],
79
+ ],
80
+ [
81
+ 5,
82
+ 784,
83
+ [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
84
+ [63, 74, 72, 0, 1, 0, 0, 0, 4, 85],
85
+ ],
86
+ ],
87
+ [
88
+ records.size,
89
+ [
90
+ records[0].label,
91
+ records[0].pixels.size,
92
+ records[0].pixels[400, 10],
93
+ records[0].pixels[500, 10],
94
+ ],
95
+ [
96
+ records[-1].label,
97
+ records[-1].pixels.size,
98
+ records[-1].pixels[400, 10],
99
+ records[-1].pixels[500, 10],
100
+ ],
101
+ ])
102
+ end
103
+
104
+ test("#to_table") do
105
+ table_data = @dataset.to_table
106
+ assert_equal([
107
+ [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
108
+ [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
109
+ ],
110
+ [
111
+ table_data[:pixels][0][400, 10],
112
+ table_data[:pixels][-1][400, 10],
113
+ ])
114
+ end
115
+
116
+ sub_test_case("#metadata") do
117
+ test("#id") do
118
+ assert_equal("fashion-mnist-test", @dataset.metadata.id)
119
+ end
120
+
121
+ test("#name") do
122
+ assert_equal("Fashion-MNIST: test", @dataset.metadata.name)
123
+ end
124
+ end
125
+ end
126
+ end
127
+
128
+ sub_test_case("Abnormal") do
129
+ test("invalid type") do
130
+ invalid_type = :invalid
131
+ message = "Please set type :train or :test: #{invalid_type.inspect}"
132
+ assert_raise(ArgumentError.new(message)) do
133
+ Datasets::FashionMNIST.new(type: invalid_type)
134
+ end
135
+ end
136
+ end
137
+ end
data/test/test-mnist.rb CHANGED
@@ -1,100 +1,125 @@
1
1
  class MNISTTest < Test::Unit::TestCase
2
- include Helper::Sandbox
3
-
4
2
  sub_test_case("Normal") do
5
- def setup_data
6
- setup_sandbox
7
-
8
- def @dataset.cache_dir_path
9
- @cache_dir_path
10
- end
11
-
12
- def @dataset.cache_dir_path=(path)
13
- @cache_dir_path = path
14
- end
15
- @dataset.cache_dir_path = @tmp_dir
16
-
17
- def @dataset.download(output_path, url)
18
- image_magic_number = 2051
19
- label_magic_number = 2049
20
- n_image, image_size_x, image_size_y, label = 10, 28, 28, 1
21
-
22
- Zlib::GzipWriter.open(output_path) do |gz|
23
- if output_path.basename.to_s.include?("-images-")
24
- image_data = ([image_magic_number, n_image]).pack('N2') +
25
- ([image_size_x,image_size_y]).pack('N2') +
26
- ([0] * image_size_x * image_size_y).pack("C*") * n_image
27
- gz.puts(image_data)
28
- else
29
- label_data = ([label_magic_number, n_image]).pack('N2') +
30
- ([label] * n_image).pack("C*")
31
- gz.puts(label_data)
32
- end
33
- end
34
- end
35
- end
36
-
37
- def teardown
38
- teardown_sandbox
39
- end
40
-
41
3
  sub_test_case("train") do
42
4
  def setup
43
5
  @dataset = Datasets::MNIST.new(type: :train)
44
- setup_data()
45
6
  end
46
7
 
47
8
  test("#each") do
48
- raw_dataset = @dataset.collect do |record|
49
- {
50
- :label => record.label,
51
- :pixels => record.pixels
52
- }
53
- end
54
-
9
+ records = @dataset.each.to_a
55
10
  assert_equal([
56
- {
57
- :label => 1,
58
- :pixels => [0] * 28 * 28
59
- }
60
- ] * 10,
61
- raw_dataset)
11
+ 60000,
12
+ [
13
+ 5,
14
+ 784,
15
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
16
+ [0, 0, 0, 0, 0, 81, 240, 253, 253, 119],
17
+ ],
18
+ [8,
19
+ 784,
20
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
21
+ [0, 0, 190, 196, 14, 2, 97, 254, 252, 146],
22
+ ],
23
+ ],
24
+ [
25
+ records.size,
26
+ [
27
+ records[0].label,
28
+ records[0].pixels.size,
29
+ records[0].pixels[200, 10],
30
+ records[0].pixels[400, 10],
31
+ ],
32
+ [
33
+ records[-1].label,
34
+ records[-1].pixels.size,
35
+ records[-1].pixels[200, 10],
36
+ records[-1].pixels[400, 10],
37
+ ],
38
+ ])
62
39
  end
63
40
 
64
41
  test("#to_table") do
65
42
  table_data = @dataset.to_table
66
- assert_equal([[0] * 28 * 28] * 10,
67
- table_data[:pixels])
43
+ assert_equal([
44
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
45
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
46
+ ],
47
+ [
48
+ table_data[:pixels][0][200, 10],
49
+ table_data[:pixels][-1][200, 10],
50
+ ])
51
+ end
52
+
53
+ sub_test_case("#metadata") do
54
+ test("#id") do
55
+ assert_equal("mnist-train", @dataset.metadata.id)
56
+ end
57
+
58
+ test("#name") do
59
+ assert_equal("MNIST: train", @dataset.metadata.name)
60
+ end
68
61
  end
69
62
  end
70
63
 
71
64
  sub_test_case("test") do
72
65
  def setup
73
66
  @dataset = Datasets::MNIST.new(type: :test)
74
- setup_data()
75
67
  end
76
68
 
77
69
  test("#each") do
78
- raw_dataset = @dataset.collect do |record|
79
- {
80
- :label => record.label,
81
- :pixels => record.pixels
82
- }
83
- end
84
-
70
+ records = @dataset.each.to_a
85
71
  assert_equal([
86
- {
87
- :label => 1,
88
- :pixels => [0] * 28 * 28
89
- }
90
- ] * 10,
91
- raw_dataset)
72
+ 10000,
73
+ [
74
+ 7,
75
+ 784,
76
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
77
+ [0, 0, 0, 0, 0, 0, 0, 0, 59, 249],
78
+ ],
79
+ [
80
+ 6,
81
+ 784,
82
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
83
+ [253, 253, 132, 64, 0, 0, 18, 43, 157, 171],
84
+ ],
85
+ ],
86
+ [
87
+ records.size,
88
+ [
89
+ records[0].label,
90
+ records[0].pixels.size,
91
+ records[0].pixels[200, 10],
92
+ records[0].pixels[400, 10],
93
+ ],
94
+ [
95
+ records[-1].label,
96
+ records[-1].pixels.size,
97
+ records[-1].pixels[200, 10],
98
+ records[-1].pixels[400, 10],
99
+ ],
100
+ ])
92
101
  end
93
102
 
94
103
  test("#to_table") do
95
104
  table_data = @dataset.to_table
96
- assert_equal([[0] * 28 * 28] * 10,
97
- table_data[:pixels])
105
+ assert_equal([
106
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
107
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
108
+ ],
109
+ [
110
+ table_data[:pixels][0][200, 10],
111
+ table_data[:pixels][-1][200, 10],
112
+ ])
113
+ end
114
+
115
+ sub_test_case("#metadata") do
116
+ test("#id") do
117
+ assert_equal("mnist-test", @dataset.metadata.id)
118
+ end
119
+
120
+ test("#name") do
121
+ assert_equal("MNIST: test", @dataset.metadata.name)
122
+ end
98
123
  end
99
124
  end
100
125
  end
@@ -9,8 +9,8 @@ class PennTreebankTest < Test::Unit::TestCase
9
9
  records = dataset.to_a
10
10
  assert_equal([
11
11
  887521,
12
- record("aer", 0),
13
- record("<unk>", 25),
12
+ record("aer"),
13
+ record("<unk>"),
14
14
  ],
15
15
  [
16
16
  records.size,
@@ -24,8 +24,8 @@ class PennTreebankTest < Test::Unit::TestCase
24
24
  records = dataset.to_a
25
25
  assert_equal([
26
26
  78669,
27
- record("no", 0),
28
- record("us", 953),
27
+ record("no"),
28
+ record("us"),
29
29
  ],
30
30
  [
31
31
  records.size,
@@ -39,8 +39,8 @@ class PennTreebankTest < Test::Unit::TestCase
39
39
  records = dataset.to_a
40
40
  assert_equal([
41
41
  70390,
42
- record("consumers", 0),
43
- record("N", 28),
42
+ record("consumers"),
43
+ record("N"),
44
44
  ],
45
45
  [
46
46
  records.size,
data/test/test-table.rb CHANGED
@@ -8,6 +8,26 @@ class TableTest < Test::Unit::TestCase
8
8
  @table[:petal_length].first(5))
9
9
  end
10
10
 
11
+ test("#dictionary_encode") do
12
+ assert_equal([
13
+ [0, "Iris-setosa"],
14
+ [1, "Iris-versicolor"],
15
+ [2, "Iris-virginica"],
16
+ ],
17
+ @table.dictionary_encode(:label).to_a)
18
+ end
19
+
20
+ test("#label_encode") do
21
+ label_encoded_labels = @table.label_encode(:label)
22
+ labels = @table[:label]
23
+ assert_equal([0, 1, 2],
24
+ [
25
+ label_encoded_labels[labels.find_index("Iris-setosa")],
26
+ label_encoded_labels[labels.find_index("Iris-versicolor")],
27
+ label_encoded_labels[labels.find_index("Iris-virginica")],
28
+ ])
29
+ end
30
+
11
31
  sub_test_case("#fetch_values") do
12
32
  test("found") do
13
33
  values = @table.fetch_values(:petal_length, :petal_width)
@@ -44,7 +64,7 @@ class TableTest < Test::Unit::TestCase
44
64
  shorten_hash[name] = values.first(5)
45
65
  end
46
66
  assert_equal({
47
- :class => ["Iris-setosa"] * 5,
67
+ :label => ["Iris-setosa"] * 5,
48
68
  :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
49
69
  :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
50
70
  :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
@@ -59,7 +79,7 @@ class TableTest < Test::Unit::TestCase
59
79
  shorten_hash[name] = values.first(5)
60
80
  end
61
81
  assert_equal({
62
- :class => ["Iris-setosa"] * 5,
82
+ :label => ["Iris-setosa"] * 5,
63
83
  :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
64
84
  :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
65
85
  :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
data/test/test-wine.rb ADDED
@@ -0,0 +1,58 @@
1
+ class WineTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Wine.new
4
+ end
5
+
6
+ test('#each') do
7
+ records = @dataset.each.to_a
8
+ assert_equal([
9
+ 178,
10
+ {
11
+ :alcalinity_of_ash => 15.6,
12
+ :alcohol => 14.23,
13
+ :ash => 2.43,
14
+ :label => 1,
15
+ :color_intensity => 5.64,
16
+ :hue => 1.04,
17
+ :malic_acid => 1.71,
18
+ :total_flavonoids => 3.06,
19
+ :n_magnesiums => 127,
20
+ :total_nonflavanoid_phenols => 0.28,
21
+ :total_proanthocyanins => 2.29,
22
+ :n_prolines => 1065,
23
+ :optical_nucleic_acid_concentration => 3.92,
24
+ :total_phenols => 2.8
25
+ },
26
+ {
27
+ :alcalinity_of_ash => 24.5,
28
+ :alcohol => 14.13,
29
+ :ash => 2.74,
30
+ :label => 3,
31
+ :color_intensity => 9.2,
32
+ :hue => 0.61,
33
+ :malic_acid => 4.1,
34
+ :total_flavonoids => 0.76,
35
+ :n_magnesiums => 96,
36
+ :total_nonflavanoid_phenols => 0.56,
37
+ :total_proanthocyanins => 1.35,
38
+ :n_prolines => 560,
39
+ :optical_nucleic_acid_concentration => 1.6,
40
+ :total_phenols => 2.05,
41
+ },
42
+ ],
43
+ [
44
+ records.size,
45
+ records[0].to_h,
46
+ records[-1].to_h,
47
+ ])
48
+ end
49
+
50
+ sub_test_case('#metadata') do
51
+ test('#description') do
52
+ description = @dataset.metadata.description
53
+ assert do
54
+ description.start_with?('1. Title of Database: Wine recognition data')
55
+ end
56
+ end
57
+ end
58
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-07-25 00:00:00.000000000 Z
12
+ date: 2018-11-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -99,9 +99,12 @@ files:
99
99
  - Rakefile
100
100
  - doc/text/news.md
101
101
  - lib/datasets.rb
102
+ - lib/datasets/adult.rb
102
103
  - lib/datasets/cifar.rb
103
104
  - lib/datasets/dataset.rb
105
+ - lib/datasets/dictionary.rb
104
106
  - lib/datasets/downloader.rb
107
+ - lib/datasets/fashion-mnist.rb
105
108
  - lib/datasets/iris.rb
106
109
  - lib/datasets/metadata.rb
107
110
  - lib/datasets/mnist.rb
@@ -109,15 +112,20 @@ files:
109
112
  - lib/datasets/table.rb
110
113
  - lib/datasets/version.rb
111
114
  - lib/datasets/wikipedia.rb
115
+ - lib/datasets/wine.rb
112
116
  - red-datasets.gemspec
113
117
  - test/helper.rb
114
118
  - test/run-test.rb
119
+ - test/test-adult.rb
115
120
  - test/test-cifar.rb
121
+ - test/test-dictionary.rb
122
+ - test/test-fashion-mnist.rb
116
123
  - test/test-iris.rb
117
124
  - test/test-mnist.rb
118
125
  - test/test-penn-treebank.rb
119
126
  - test/test-table.rb
120
127
  - test/test-wikipedia.rb
128
+ - test/test-wine.rb
121
129
  homepage: https://github.com/red-data-tools/red-datasets
122
130
  licenses:
123
131
  - MIT
@@ -138,16 +146,20 @@ required_rubygems_version: !ruby/object:Gem::Requirement
138
146
  version: '0'
139
147
  requirements: []
140
148
  rubyforge_project:
141
- rubygems_version: 3.0.0.beta1
149
+ rubygems_version: 3.0.0.beta2
142
150
  signing_key:
143
151
  specification_version: 4
144
152
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
145
153
  test_files:
154
+ - test/test-wine.rb
146
155
  - test/run-test.rb
147
156
  - test/test-cifar.rb
157
+ - test/test-fashion-mnist.rb
148
158
  - test/test-wikipedia.rb
149
159
  - test/test-iris.rb
150
160
  - test/helper.rb
151
161
  - test/test-mnist.rb
152
162
  - test/test-table.rb
163
+ - test/test-adult.rb
153
164
  - test/test-penn-treebank.rb
165
+ - test/test-dictionary.rb