RubyGems - red-datasets - Versions diffs - 0.0.6 → 0.0.7 - Mend

red-datasets 0.0.6 → 0.0.7

Files changed (23)

checksums.yaml +4 -4
data/README.md +3 -3
data/doc/text/news.md +31 -0
data/lib/datasets.rb +3 -0
data/lib/datasets/adult.rb +83 -0
data/lib/datasets/dictionary.rb +59 -0
data/lib/datasets/downloader.rb +35 -62
data/lib/datasets/fashion-mnist.rb +12 -0
data/lib/datasets/iris.rb +1 -1
data/lib/datasets/mnist.rb +11 -6
data/lib/datasets/penn-treebank.rb +2 -9
data/lib/datasets/table.rb +17 -1
data/lib/datasets/version.rb +1 -1
data/lib/datasets/wine.rb +64 -0
data/test/helper.rb +1 -0
data/test/test-adult.rb +126 -0
data/test/test-dictionary.rb +43 -0
data/test/test-fashion-mnist.rb +137 -0
data/test/test-mnist.rb +95 -70
data/test/test-penn-treebank.rb +6 -6
data/test/test-table.rb +22 -2
data/test/test-wine.rb +58 -0
metadata +15 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3b96f5bf8fb7d8d7280451086dda394b65c42023b15ae077167e2d320c4361c1
-  data.tar.gz: 96f7936d62d70749f92d3bdd1d7ef2d79cfff3091e7dae8221d6a0537dbd6d7b
+  metadata.gz: 222271b814e3a5ce23b5e0dd1d2578bffb84afdab10110b0869985c6056bfd3b
+  data.tar.gz: ac30931b3317ab04afd394b28a45a9206c784d78b3bcaf98fc3a2a48227c7930
 SHA512:
-  metadata.gz: 859196aa39020d924fa7af4df6d96c110f41ac2b90a39dc89ed6935fc64e857b2bffb5776a366660ab61c55a96dd35b9bd6663ec23c7ee4249cae3103bc0a2aa
-  data.tar.gz: b07ec53917af58e737058c504685d283850e072f0794c457bd961d39b9815c85b2fc2a9bed4de2a643675dc0e0f7bb2077b4c41b2c28c9c94f948a532baae6bb
+  metadata.gz: 8a94a3d66baaed4948904e97dc53100d73ae96c528c09b02252caabd05b8545587abf6fbcba3a578725812327a9a2c8827bbb7e283ccd3d7e66753bf30035e2e
+  data.tar.gz: 2ab44b5aa3ee5da0ac8e8307546c71942938de4497bfec05fc929715a4e5ef6df1cb091bce0d5f12978582d2c9fa7eaffff9edd54be0d845627dccfce42a63dd

data/README.md CHANGED Viewed

@@ -30,7 +30,7 @@ iris.each do |record|
      record.sepal_width,
      record.petal_length,
      record.petal_width,
-     record.class,
+     record.label,
   ]
 end
 # => [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
@@ -48,7 +48,7 @@ p iris_hash[:petal_length]
 # => [1.4, 1.4, .. , 4.7, ..
 p iris_hash[:petal_width]
 # => [0.2, 0.2, .. , 1.4, ..
-p iris_hash[:class]
+p iris_hash[:label]
 # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
@@ -60,7 +60,7 @@ p iris_table.fetch_values(:sepal_length, :sepal_width, :petal_length, :petal_wid
       [7.0, 3.2, 4.7, 1.4],
       :
-p iris_table[:class]
+p iris_table[:label]
 # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
 ```

data/doc/text/news.md CHANGED Viewed

@@ -1,5 +1,36 @@
 # News
+## 0.0.7 - 2018-11-21
+### Improvements
+  * `Datasets::Table#dictionary_encode`: Added.
+    [GitHub#22]
+  * `Datasets::Table#label_encode`: Added.
+  * `Datasets::Dictionary`: Added.
+  * `Datasets::Wine`: Added.
+    [GitHub#26][Patch by Ryuta Suzuki]
+  * `Datasets::FashionMNIST`: Added.
+    [GitHub#27][Patch by chimame]
+  * `Datasets::Iris::Record#label`: Renamed from `#class`. This is an
+    incompatible change.
+  * `Datasets::Adult`: Added.
+    [GitHub#30][Patch by Yasuo Honda]
+### Thanks
+  * Ryuta Suzuki
+  * chimame
+  * Yasuo Honda
 ## 0.0.6 - 2018-07-25
 ### Improvements

data/lib/datasets.rb CHANGED Viewed

@@ -1,7 +1,10 @@
 require "datasets/version"
+require "datasets/adult"
 require "datasets/cifar"
+require "datasets/fashion-mnist"
 require "datasets/iris"
 require "datasets/mnist"
 require "datasets/penn-treebank"
 require "datasets/wikipedia"
+require "datasets/wine"

data/lib/datasets/adult.rb ADDED Viewed

@@ -0,0 +1,83 @@
+require "csv"
+require_relative "dataset"
+module Datasets
+  class Adult < Dataset
+    Record = Struct.new(
+      :age,
+      :work_class,
+      :final_weight,
+      :education,
+      :n_education_years,
+      :marital_status,
+      :occupation,
+      :relationship,
+      :race,
+      :sex,
+      :capital_gain,
+      :capital_loss,
+      :hours_per_week,
+      :native_country,
+      :label
+    )
+    def initialize(type: :train)
+      unless [:train, :test].include?(type)
+        raise ArgumentError, 'Please set type :train or :test'
+      end
+      super()
+      @type = type
+      @metadata.id = "adult-#{@type}"
+      @metadata.name = "Adult: #{@type}"
+      @metadata.url = "http://archive.ics.uci.edu/ml/datasets/adult"
+      @metadata.description = lambda do
+        read_names
+      end
+    end
+    def each
+      return to_enum(__method__) unless block_given?
+      open_data do |csv|
+        csv.each do |row|
+          next if row[0].nil?
+          record = Record.new(*row)
+          yield(record)
+        end
+      end
+    end
+    private
+    def open_data
+      case @type
+      when :train
+        ext = "data"
+      when :test
+        ext = "test"
+      end
+      data_path = cache_dir_path + "adult-#{ext}.csv"
+      unless data_path.exist?
+        data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
+        download(data_path, data_url)
+      end
+      CSV.open(data_path,
+               {
+                 converters: [:numeric, lambda {|f| f.strip}],
+                 skip_lines: /\A\|/,
+               }) do |csv|
+        yield(csv)
+      end
+    end
+    def read_names
+      names_path = cache_dir_path + "adult.names"
+      unless names_path.exist?
+        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
+        download(names_path, names_url)
+      end
+      names_path.read
+    end
+  end
+end

data/lib/datasets/dictionary.rb ADDED Viewed

@@ -0,0 +1,59 @@
+module Datasets
+  class Dictionary
+    include Enumerable
+    def initialize(values)
+      build_dictionary(values)
+    end
+    def id(value)
+      @value_to_id[value]
+    end
+    def value(id)
+      @id_to_value[id]
+    end
+    def ids
+      @id_to_value.keys
+    end
+    def values
+      @id_to_value.values
+    end
+    def each(&block)
+      @id_to_value.each(&block)
+    end
+    def size
+      @id_to_value.size
+    end
+    alias_method :length, :size
+    def encode(values)
+      values.collect do |value|
+        id(value)
+      end
+    end
+    def decode(ids)
+      ids.collect do |id|
+        value(id)
+      end
+    end
+    private
+    def build_dictionary(values)
+      @id_to_value = {}
+      @value_to_id = {}
+      id = 0
+      values.each do |value|
+        next if @value_to_id.key?(value)
+        @id_to_value[id] = value
+        @value_to_id[value] = id
+        id += 1
+      end
+    end
+  end
+end

data/lib/datasets/downloader.rb CHANGED Viewed

@@ -3,7 +3,7 @@ begin
   require "io/console"
 rescue LoadError
 end
-require "open-uri"
+require "net/http"
 require "pathname"
 module Datasets
@@ -15,84 +15,57 @@ module Datasets
         url = URI.parse(url)
       end
       @url = url
-      @url.extend(CurrentBufferReadable)
+      unless @url.is_a?(URI::HTTP)
+        raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
+      end
     end
     def download(output_path)
       output_path.parent.mkpath
+      headers = {"User-Agent" => "Red Datasets/#{VERSION}"}
       start = nil
       partial_output_path = Pathname.new("#{output_path}.partial")
       if partial_output_path.exist?
         start = partial_output_path.size
+        headers["Range"] = "bytes=#{start}-"
       end
-      progress_reporter = nil
-      content_length_proc = lambda do |content_length|
-        base_name = @url.path.split("/").last
-        size_max = content_length
-        size_max += start if start
-        progress_reporter = ProgressReporter.new(base_name, size_max)
-      end
-      progress_proc = lambda do |size_current|
-        size_current += start if start
-        progress_reporter.report(size_current) if progress_reporter
-      end
-      options = {
-        :content_length_proc => content_length_proc,
-        :progress_proc => progress_proc,
-      }
-      if start
-        options["Range"] = "bytes=#{start}-"
-      end
+      Net::HTTP.start(@url.hostname,
+                      @url.port,
+                      :use_ssl => (@url.scheme == "https")) do |http|
+        request = Net::HTTP::Get.new(@url.path, headers)
+        http.request(request) do |response|
+          case response
+          when Net::HTTPPartialContent
+            mode = "ab"
+          when Net::HTTPSuccess
+            start = nil
+            mode = "wb"
+          else
+            break
+          end
-      begin
-        @url.open(options) do |input|
-          copy_stream(input, partial_output_path)
-        end
-      rescue Interrupt, Net::ReadTimeout
-        if @url.current_buffer
-          input = @url.current_buffer.io
-          input.rewind
-          copy_stream(input, partial_output_path)
+          base_name = @url.path.split("/").last
+          size_current = 0
+          size_max = response.content_length
+          if start
+            size_current += start
+            size_max += start
+          end
+          progress_reporter = ProgressReporter.new(base_name, size_max)
+          partial_output_path.open(mode) do |output|
+            response.read_body do |chunk|
+              size_current += chunk.bytesize
+              progress_reporter.report(size_current)
+              output.write(chunk)
+            end
+          end
         end
-        raise
       end
       FileUtils.mv(partial_output_path, output_path)
     end
-    private
-    def copy_stream(input, partial_output_path)
-      if partial_output_path.exist?
-        # TODO: It's better that we use "206 Partial Content" response
-        # to detect partial response.
-        partial_head = partial_output_path.open("rb") do |partial_output|
-          partial_output.read(256)
-        end
-        input_head = input.read(partial_head.bytesize)
-        input.rewind
-        if partial_head == input_head
-          mode = "wb"
-        else
-          mode = "ab"
-        end
-      else
-        mode = "wb"
-      end
-      partial_output_path.open(mode) do |partial_output|
-        IO.copy_stream(input, partial_output)
-      end
-    end
-    module CurrentBufferReadable
-      attr_reader :current_buffer
-      def buffer_open(buffer, proxy, options)
-        @current_buffer = buffer
-        super
-      end
-    end
     class ProgressReporter
       def initialize(base_name, size_max)
         @base_name = base_name

data/lib/datasets/fashion-mnist.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require_relative 'mnist'
+module Datasets
+  class FashionMNIST < MNIST
+    BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
+    private
+    def dataset_name
+      "Fashion-MNIST"
+    end
+  end
+end

data/lib/datasets/iris.rb CHANGED Viewed

@@ -8,7 +8,7 @@ module Datasets
                         :sepal_width,
                         :petal_length,
                         :petal_width,
-                        :class)
+                        :label)
     def initialize
       super()

data/lib/datasets/mnist.rb CHANGED Viewed

@@ -6,6 +6,7 @@ class SetTypeError < StandardError; end
 module Datasets
   class MNIST < Dataset
+    BASE_URL = "http://yann.lecun.com/exdb/mnist/"
     class Record < Struct.new(:data, :label)
       def pixels
@@ -26,9 +27,9 @@ module Datasets
       super()
-      @metadata.id = "mnist-#{type}"
-      @metadata.name = "MNIST: #{type}"
-      @metadata.url = "http://yann.lecun.com/exdb/mnist/"
+      @metadata.id = "#{dataset_name.downcase}-#{type}"
+      @metadata.name = "#{dataset_name}: #{type}"
+      @metadata.url = self.class::BASE_URL
       @type = type
       case type
@@ -44,7 +45,7 @@ module Datasets
       image_path = cache_dir_path + target_file(:image)
       label_path = cache_dir_path + target_file(:label)
-      base_url = "http://yann.lecun.com/exdb/mnist/"
+      base_url = self.class::BASE_URL
       unless image_path.exist?
         download(image_path, base_url + target_file(:image))
@@ -66,7 +67,7 @@ module Datasets
         n_bytes = n_uint32s * 4
         mnist_magic_number = 2051
         magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
-        raise 'This is not MNIST image file' if magic != mnist_magic_number
+        raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
         n_images.times do |i|
           data = f.read(n_rows * n_cols)
           label = labels[i]
@@ -100,9 +101,13 @@ module Datasets
         n_bytes = n_uint32s * 2
         mnist_magic_number = 2049
         magic, n_labels = f.read(n_bytes).unpack('N2')
-        raise 'This is not MNIST label file' if magic != mnist_magic_number
+        raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
         f.read(n_labels).unpack('C*')
       end
     end
+    def dataset_name
+      "MNIST"
+    end
   end
 end

data/lib/datasets/penn-treebank.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require_relative "dataset"
 module Datasets
   class PennTreebank < Dataset
-    Record = Struct.new(:word, :id)
+    Record = Struct.new(:word)
     DESCRIPTION = <<~DESC
       `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
@@ -46,17 +46,10 @@ module Datasets
     private
     def parse_data(data_path)
-      index = 0
-      vocabulary = {}
       File.open(data_path) do |f|
         f.each_line do |line|
           line.split.each do |word|
-            word = word.strip
-            unless vocabulary.key?(word)
-              vocabulary[word] = index
-              index += 1
-            end
-            yield(Record.new(word, vocabulary[word]))
+            yield(Record.new(word.strip))
           end
         end
       end

data/lib/datasets/table.rb CHANGED Viewed

@@ -1,9 +1,12 @@
+require "datasets/dictionary"
 module Datasets
   class Table
     include Enumerable
     def initialize(dataset)
       @dataset = dataset
+      @dictionaries = {}
     end
     def each(&block)
@@ -11,7 +14,16 @@ module Datasets
     end
     def [](name)
-      columner_data[name.to_sym]
+      columner_data[normalize_name(name)]
+    end
+    def dictionary_encode(name)
+      @dictionaries[normalize_name(name)] ||= Dictionary.new(self[name])
+    end
+    def label_encode(name)
+      dictionary = dictionary_encode(name)
+      dictionary.encode(self[name])
     end
     def fetch_values(*keys)
@@ -55,5 +67,9 @@ module Datasets
     def columner_data
       @columns ||= to_h
     end
+    def normalize_name(name)
+      name.to_sym
+    end
   end
 end

data/lib/datasets/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Datasets
-  VERSION = "0.0.6"
+  VERSION = "0.0.7"
 end

data/lib/datasets/wine.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require 'csv'
+require_relative 'dataset'
+module Datasets
+  class Wine < Dataset
+    Record = Struct.new(:label,
+                        :alcohol,
+                        :malic_acid,
+                        :ash,
+                        :alcalinity_of_ash,
+                        :n_magnesiums,
+                        :total_phenols,
+                        :total_flavonoids,
+                        :total_nonflavanoid_phenols,
+                        :total_proanthocyanins,
+                        :color_intensity,
+                        :hue,
+                        :optical_nucleic_acid_concentration,
+                        :n_prolines)
+    def initialize
+      super
+      @metadata.id = 'wine'
+      @metadata.name = 'Wine'
+      @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
+      @metadata.description = -> { read_names }
+    end
+    def each
+      return to_enum(__method__) unless block_given?
+      open_data do |csv|
+        csv.each do |row|
+          next if row[0].nil?
+          record = Record.new(*row)
+          yield(record)
+        end
+      end
+    end
+    private
+    def read_names
+      names_path = cache_dir_path + 'wine.names'
+      unless names_path.exist?
+        names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
+        download(names_path, names_url)
+      end
+      names_path.read
+    end
+    def open_data
+      data_path = cache_dir_path + 'wine.data'
+      unless data_path.exist?
+        data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
+        download(data_path, data_url)
+      end
+      CSV.open(data_path, converters: %i[numeric]) do |csv|
+        yield(csv)
+      end
+    end
+  end
+end

data/test/helper.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 require "fileutils"
 require "pathname"
+require "time"
 require "datasets"

data/test/test-adult.rb ADDED Viewed

@@ -0,0 +1,126 @@
+class AdultTest < Test::Unit::TestCase
+  sub_test_case("train") do
+    def setup
+      @dataset = Datasets::Adult.new(type: :train)
+    end
+    def record(*args)
+      Datasets::Adult::Record.new(*args)
+    end
+    test("#each") do
+      records = @dataset.each.to_a
+      assert_equal([
+                     32561,
+                     {
+                       :age => 39,
+                       :work_class => "State-gov",
+                       :final_weight => 77516,
+                       :education => "Bachelors",
+                       :n_education_years => 13,
+                       :marital_status => "Never-married",
+                       :occupation => "Adm-clerical",
+                       :relationship => "Not-in-family",
+                       :race => "White",
+                       :sex => "Male",
+                       :capital_gain => 2174,
+                       :capital_loss => 0,
+                       :hours_per_week => 40,
+                       :native_country => "United-States",
+                       :label => "<=50K"
+                     },
+                     {
+                       :age => 52,
+                       :work_class => "Self-emp-inc",
+                       :final_weight => 287927,
+                       :education => "HS-grad",
+                       :n_education_years => 9,
+                       :marital_status => "Married-civ-spouse",
+                       :occupation => "Exec-managerial",
+                       :relationship => "Wife",
+                       :race => "White",
+                       :sex => "Female",
+                       :capital_gain => 15024,
+                       :capital_loss => 0,
+                       :hours_per_week => 40,
+                       :native_country => "United-States",
+                       :label => ">50K"
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[0].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+  sub_test_case("test") do
+    def setup
+      @dataset = Datasets::Adult.new(type: :test)
+    end
+    def record(*args)
+      Datasets::Adult::Record.new(*args)
+    end
+    test("#each") do
+      records = @dataset.each.to_a
+      assert_equal([
+                     16281,
+                     {
+                       :age => 25,
+                       :work_class => "Private",
+                       :final_weight => 226802,
+                       :education => "11th",
+                       :n_education_years => 7,
+                       :marital_status => "Never-married",
+                       :occupation => "Machine-op-inspct",
+                       :relationship => "Own-child",
+                       :race => "Black",
+                       :sex => "Male",
+                       :capital_gain => 0,
+                       :capital_loss => 0,
+                       :hours_per_week => 40,
+                       :native_country => "United-States",
+                       :label => "<=50K."
+                     },
+                     {
+                       :age => 35,
+                       :work_class => "Self-emp-inc",
+                       :final_weight => 182148,
+                       :education => "Bachelors",
+                       :n_education_years => 13,
+                       :marital_status => "Married-civ-spouse",
+                       :occupation => "Exec-managerial",
+                       :relationship => "Husband",
+                       :race => "White",
+                       :sex => "Male",
+                       :capital_gain => 0,
+                       :capital_loss => 0,
+                       :hours_per_week => 60,
+                       :native_country => "United-States",
+                       :label => ">50K."
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[0].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+  sub_test_case("#metadata") do
+    def setup
+      @dataset = Datasets::Adult.new(type: :train)
+    end
+    test("#description") do
+      description = @dataset.metadata.description
+      assert do
+        description.start_with?("| This data was extracted from the census bureau database found at")
+      end
+    end
+  end
+end

data/test/test-dictionary.rb ADDED Viewed

@@ -0,0 +1,43 @@
+class DictionaryTest < Test::Unit::TestCase
+  def setup
+    penn_treebank = Datasets::PennTreebank.new(type: :test)
+    @dictionary = penn_treebank.to_table.dictionary_encode(:word)
+  end
+  test("#id") do
+    assert_equal(95, @dictionary.id("<unk>"))
+  end
+  test("#value") do
+    assert_equal("<unk>", @dictionary.value(95))
+  end
+  test("#ids") do
+    assert_equal([0, 1, 2, 3, 4], @dictionary.ids.first(5))
+  end
+  test("#values") do
+    assert_equal(["no", "it", "was", "n't", "black"],
+                 @dictionary.values.first(5))
+  end
+  test("#each") do
+    assert_equal([
+                   [0, "no"],
+                   [1, "it"],
+                   [2, "was"],
+                   [3, "n't"],
+                   [4, "black"],
+                 ],
+                 @dictionary.each.first(5).to_a)
+  end
+  test("#size") do
+    assert_equal(6048, @dictionary.size)
+  end
+  test("#length") do
+    assert_equal(@dictionary.size,
+                 @dictionary.length)
+  end
+end

data/test/test-fashion-mnist.rb ADDED Viewed

@@ -0,0 +1,137 @@
+class FashionMNISTTest < Test::Unit::TestCase
+  sub_test_case("Normal") do
+    sub_test_case("train") do
+      def setup
+        @dataset = Datasets::FashionMNIST.new(type: :train)
+      end
+      test("#each") do
+        records = @dataset.each.to_a
+        assert_equal([
+                       60000,
+                       [
+                         9,
+                         784,
+                         [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
+                         [220, 232, 246, 0, 3, 202, 228, 224, 221, 211],
+                       ],
+                       [
+                         5,
+                         784,
+                         [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
+                         [180, 177, 177, 47, 101, 235, 194, 223, 232, 255],
+                       ],
+                     ],
+                     [
+                       records.size,
+                       [
+                         records[0].label,
+                         records[0].pixels.size,
+                         records[0].pixels[400, 10],
+                         records[0].pixels[500, 10],
+                       ],
+                       [
+                         records[-1].label,
+                         records[-1].pixels.size,
+                         records[-1].pixels[400, 10],
+                         records[-1].pixels[500, 10],
+                       ],
+                     ])
+      end
+      test("#to_table") do
+        table_data = @dataset.to_table
+        assert_equal([
+                       [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
+                       [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
+                     ],
+                     [
+                       table_data[:pixels][0][400, 10],
+                       table_data[:pixels][-1][400, 10],
+                     ])
+      end
+      sub_test_case("#metadata") do
+        test("#id") do
+          assert_equal("fashion-mnist-train", @dataset.metadata.id)
+        end
+        test("#name") do
+          assert_equal("Fashion-MNIST: train", @dataset.metadata.name)
+        end
+      end
+    end
+    sub_test_case("test") do
+      def setup
+        @dataset = Datasets::FashionMNIST.new(type: :test)
+      end
+      test("#each") do
+        records = @dataset.each.to_a
+        assert_equal([
+                       10000,
+                       [
+                         9,
+                         784,
+                         [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
+                         [172, 161, 189, 62, 0, 68, 94, 90, 111, 114],
+                       ],
+                       [
+                         5,
+                         784,
+                         [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
+                         [63, 74, 72, 0, 1, 0, 0, 0, 4, 85],
+                       ],
+                     ],
+                     [
+                       records.size,
+                       [
+                         records[0].label,
+                         records[0].pixels.size,
+                         records[0].pixels[400, 10],
+                         records[0].pixels[500, 10],
+                       ],
+                       [
+                         records[-1].label,
+                         records[-1].pixels.size,
+                         records[-1].pixels[400, 10],
+                         records[-1].pixels[500, 10],
+                       ],
+                     ])
+      end
+      test("#to_table") do
+        table_data = @dataset.to_table
+        assert_equal([
+                       [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
+                       [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
+                     ],
+                     [
+                       table_data[:pixels][0][400, 10],
+                       table_data[:pixels][-1][400, 10],
+                     ])
+      end
+      sub_test_case("#metadata") do
+        test("#id") do
+          assert_equal("fashion-mnist-test", @dataset.metadata.id)
+        end
+        test("#name") do
+          assert_equal("Fashion-MNIST: test", @dataset.metadata.name)
+        end
+      end
+    end
+  end
+  sub_test_case("Abnormal") do
+    test("invalid type") do
+      invalid_type = :invalid
+      message = "Please set type :train or :test: #{invalid_type.inspect}"
+      assert_raise(ArgumentError.new(message)) do
+        Datasets::FashionMNIST.new(type: invalid_type)
+      end
+    end
+  end
+end

data/test/test-mnist.rb CHANGED Viewed

@@ -1,100 +1,125 @@
 class MNISTTest < Test::Unit::TestCase
-  include Helper::Sandbox
   sub_test_case("Normal") do
-    def setup_data
-      setup_sandbox
-      def @dataset.cache_dir_path
-        @cache_dir_path
-      end
-      def @dataset.cache_dir_path=(path)
-        @cache_dir_path = path
-      end
-      @dataset.cache_dir_path = @tmp_dir
-      def @dataset.download(output_path, url)
-        image_magic_number = 2051
-        label_magic_number = 2049
-        n_image, image_size_x, image_size_y, label = 10, 28, 28, 1
-        Zlib::GzipWriter.open(output_path) do |gz|
-          if output_path.basename.to_s.include?("-images-")
-            image_data = ([image_magic_number, n_image]).pack('N2') +
-                         ([image_size_x,image_size_y]).pack('N2') +
-                         ([0] * image_size_x * image_size_y).pack("C*") * n_image
-            gz.puts(image_data)
-          else
-            label_data = ([label_magic_number, n_image]).pack('N2') +
-                         ([label] * n_image).pack("C*")
-            gz.puts(label_data)
-          end
-        end
-      end
-    end
-    def teardown
-      teardown_sandbox
-    end
     sub_test_case("train") do
       def setup
         @dataset = Datasets::MNIST.new(type: :train)
-        setup_data()
       end
       test("#each") do
-        raw_dataset = @dataset.collect do |record|
-          {
-            :label => record.label,
-            :pixels => record.pixels
-          }
-        end
+        records = @dataset.each.to_a
         assert_equal([
-                       {
-                         :label => 1,
-                         :pixels => [0] * 28 * 28
-                       }
-                     ] * 10,
-                     raw_dataset)
+                       60000,
+                       [
+                         5,
+                         784,
+                         [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
+                         [0, 0, 0, 0, 0, 81, 240, 253, 253, 119],
+                       ],
+                       [8,
+                         784,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
+                         [0, 0, 190, 196, 14, 2, 97, 254, 252, 146],
+                       ],
+                     ],
+                     [
+                       records.size,
+                       [
+                         records[0].label,
+                         records[0].pixels.size,
+                         records[0].pixels[200, 10],
+                         records[0].pixels[400, 10],
+                       ],
+                       [
+                         records[-1].label,
+                         records[-1].pixels.size,
+                         records[-1].pixels[200, 10],
+                         records[-1].pixels[400, 10],
+                       ],
+                     ])
       end
       test("#to_table") do
         table_data = @dataset.to_table
-        assert_equal([[0] * 28 * 28] * 10,
-                     table_data[:pixels])
+        assert_equal([
+                       [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
+                       [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
+                     ],
+                     [
+                       table_data[:pixels][0][200, 10],
+                       table_data[:pixels][-1][200, 10],
+                     ])
+      end
+      sub_test_case("#metadata") do
+        test("#id") do
+          assert_equal("mnist-train", @dataset.metadata.id)
+        end
+        test("#name") do
+          assert_equal("MNIST: train", @dataset.metadata.name)
+        end
       end
     end
     sub_test_case("test") do
       def setup
         @dataset = Datasets::MNIST.new(type: :test)
-        setup_data()
       end
       test("#each") do
-        raw_dataset = @dataset.collect do |record|
-          {
-            :label => record.label,
-            :pixels => record.pixels
-          }
-        end
+        records = @dataset.each.to_a
         assert_equal([
-                       {
-                         :label => 1,
-                         :pixels => [0] * 28 * 28
-                       }
-                     ] * 10,
-                     raw_dataset)
+                       10000,
+                       [
+                         7,
+                         784,
+                         [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
+                         [0, 0, 0, 0, 0, 0, 0, 0, 59, 249],
+                       ],
+                       [
+                         6,
+                         784,
+                         [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
+                         [253, 253, 132, 64, 0, 0, 18, 43, 157, 171],
+                       ],
+                     ],
+                     [
+                       records.size,
+                       [
+                         records[0].label,
+                         records[0].pixels.size,
+                         records[0].pixels[200, 10],
+                         records[0].pixels[400, 10],
+                       ],
+                       [
+                         records[-1].label,
+                         records[-1].pixels.size,
+                         records[-1].pixels[200, 10],
+                         records[-1].pixels[400, 10],
+                       ],
+                     ])
       end
       test("#to_table") do
         table_data = @dataset.to_table
-        assert_equal([[0] * 28 * 28] * 10,
-                     table_data[:pixels])
+        assert_equal([
+                       [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
+                       [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
+                     ],
+                     [
+                       table_data[:pixels][0][200, 10],
+                       table_data[:pixels][-1][200, 10],
+                     ])
+      end
+      sub_test_case("#metadata") do
+        test("#id") do
+          assert_equal("mnist-test", @dataset.metadata.id)
+        end
+        test("#name") do
+          assert_equal("MNIST: test", @dataset.metadata.name)
+        end
       end
     end
   end

data/test/test-penn-treebank.rb CHANGED Viewed

@@ -9,8 +9,8 @@ class PennTreebankTest < Test::Unit::TestCase
       records = dataset.to_a
       assert_equal([
                      887521,
-                     record("aer", 0),
-                     record("<unk>", 25),
+                     record("aer"),
+                     record("<unk>"),
                    ],
                    [
                      records.size,
@@ -24,8 +24,8 @@ class PennTreebankTest < Test::Unit::TestCase
       records = dataset.to_a
       assert_equal([
                      78669,
-                     record("no", 0),
-                     record("us", 953),
+                     record("no"),
+                     record("us"),
                    ],
                    [
                      records.size,
@@ -39,8 +39,8 @@ class PennTreebankTest < Test::Unit::TestCase
       records = dataset.to_a
       assert_equal([
                      70390,
-                     record("consumers", 0),
-                     record("N", 28),
+                     record("consumers"),
+                     record("N"),
                    ],
                    [
                      records.size,

data/test/test-table.rb CHANGED Viewed

@@ -8,6 +8,26 @@ class TableTest < Test::Unit::TestCase
                  @table[:petal_length].first(5))
   end
+  test("#dictionary_encode") do
+    assert_equal([
+                   [0, "Iris-setosa"],
+                   [1, "Iris-versicolor"],
+                   [2, "Iris-virginica"],
+                 ],
+                 @table.dictionary_encode(:label).to_a)
+  end
+  test("#label_encode") do
+    label_encoded_labels = @table.label_encode(:label)
+    labels = @table[:label]
+    assert_equal([0, 1, 2],
+                 [
+                   label_encoded_labels[labels.find_index("Iris-setosa")],
+                   label_encoded_labels[labels.find_index("Iris-versicolor")],
+                   label_encoded_labels[labels.find_index("Iris-virginica")],
+                 ])
+  end
   sub_test_case("#fetch_values") do
     test("found") do
       values = @table.fetch_values(:petal_length, :petal_width)
@@ -44,7 +64,7 @@ class TableTest < Test::Unit::TestCase
       shorten_hash[name] = values.first(5)
     end
     assert_equal({
-                   :class        => ["Iris-setosa"] * 5,
+                   :label        => ["Iris-setosa"] * 5,
                    :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
                    :petal_width  => [0.2, 0.2, 0.2, 0.2, 0.2],
                    :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
@@ -59,7 +79,7 @@ class TableTest < Test::Unit::TestCase
       shorten_hash[name] = values.first(5)
     end
     assert_equal({
-                   :class        => ["Iris-setosa"] * 5,
+                   :label        => ["Iris-setosa"] * 5,
                    :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
                    :petal_width  => [0.2, 0.2, 0.2, 0.2, 0.2],
                    :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],

data/test/test-wine.rb ADDED Viewed

@@ -0,0 +1,58 @@
+class WineTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::Wine.new
+  end
+  test('#each') do
+    records = @dataset.each.to_a
+    assert_equal([
+                   178,
+                   {
+                     :alcalinity_of_ash => 15.6,
+                     :alcohol => 14.23,
+                     :ash => 2.43,
+                     :label => 1,
+                     :color_intensity => 5.64,
+                     :hue => 1.04,
+                     :malic_acid => 1.71,
+                     :total_flavonoids => 3.06,
+                     :n_magnesiums => 127,
+                     :total_nonflavanoid_phenols => 0.28,
+                     :total_proanthocyanins => 2.29,
+                     :n_prolines => 1065,
+                     :optical_nucleic_acid_concentration => 3.92,
+                     :total_phenols => 2.8
+                   },
+                   {
+                     :alcalinity_of_ash => 24.5,
+                     :alcohol => 14.13,
+                     :ash => 2.74,
+                     :label => 3,
+                     :color_intensity => 9.2,
+                     :hue => 0.61,
+                     :malic_acid => 4.1,
+                     :total_flavonoids => 0.76,
+                     :n_magnesiums => 96,
+                     :total_nonflavanoid_phenols => 0.56,
+                     :total_proanthocyanins => 1.35,
+                     :n_prolines => 560,
+                     :optical_nucleic_acid_concentration => 1.6,
+                     :total_phenols => 2.05,
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h,
+                 ])
+  end
+  sub_test_case('#metadata') do
+    test('#description') do
+      description = @dataset.metadata.description
+      assert do
+        description.start_with?('1. Title of Database: Wine recognition data')
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-datasets
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - tomisuker
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-07-25 00:00:00.000000000 Z
+date: 2018-11-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -99,9 +99,12 @@ files:
 - Rakefile
 - doc/text/news.md
 - lib/datasets.rb
+- lib/datasets/adult.rb
 - lib/datasets/cifar.rb
 - lib/datasets/dataset.rb
+- lib/datasets/dictionary.rb
 - lib/datasets/downloader.rb
+- lib/datasets/fashion-mnist.rb
 - lib/datasets/iris.rb
 - lib/datasets/metadata.rb
 - lib/datasets/mnist.rb
@@ -109,15 +112,20 @@ files:
 - lib/datasets/table.rb
 - lib/datasets/version.rb
 - lib/datasets/wikipedia.rb
+- lib/datasets/wine.rb
 - red-datasets.gemspec
 - test/helper.rb
 - test/run-test.rb
+- test/test-adult.rb
 - test/test-cifar.rb
+- test/test-dictionary.rb
+- test/test-fashion-mnist.rb
 - test/test-iris.rb
 - test/test-mnist.rb
 - test/test-penn-treebank.rb
 - test/test-table.rb
 - test/test-wikipedia.rb
+- test/test-wine.rb
 homepage: https://github.com/red-data-tools/red-datasets
 licenses:
 - MIT
@@ -138,16 +146,20 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 3.0.0.beta1
+rubygems_version: 3.0.0.beta2
 signing_key:
 specification_version: 4
 summary: Red Datasets provides classes that provide common datasets such as iris dataset.
 test_files:
+- test/test-wine.rb
 - test/run-test.rb
 - test/test-cifar.rb
+- test/test-fashion-mnist.rb
 - test/test-wikipedia.rb
 - test/test-iris.rb
 - test/helper.rb
 - test/test-mnist.rb
 - test/test-table.rb
+- test/test-adult.rb
 - test/test-penn-treebank.rb
+- test/test-dictionary.rb