red-datasets 0.0.8 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/doc/text/news.md +93 -0
- data/lib/datasets.rb +9 -0
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +194 -54
- data/lib/datasets/libsvm.rb +1 -9
- data/lib/datasets/mnist.rb +6 -4
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar-gz-readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- data/test/test-table.rb +123 -18
- metadata +61 -15
@@ -0,0 +1,95 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar-gz-readable"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Index of every dataset published by the Rdatasets project
# (https://vincentarelbundock.github.io/Rdatasets/).  Each Record
# describes one dataset: the R package it comes from, its title, its
# size, per-type column counts, and the URLs of its CSV data and docs.
class RdatasetsList < Dataset
  Record = Struct.new(:package,
                      :dataset,
                      :title,
                      :rows,
                      :cols,
                      :n_binary,
                      :n_character,
                      :n_factor,
                      :n_logical,
                      :n_numeric,
                      :csv,
                      :doc)

  def initialize
    super
    @metadata.id = "rdatasets"
    @metadata.name = "Rdatasets"
    @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
    @metadata.licenses = ["GPL-3"]
    @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
    @data_path = cache_dir_path + "datasets.csv"
  end

  # Yield a Record for each entry matching the given package and/or
  # dataset name; with neither given, every entry is yielded.
  # Returns an Enumerator when called without a block.
  def filter(package: nil, dataset: nil)
    return to_enum(__method__, package: package, dataset: dataset) unless block_given?

    # Column names in datasets.csv: "Package" and "Item".
    conditions = {}
    conditions["Package"] = package if package
    conditions["Item"] = dataset if dataset
    each_row do |row|
      # An empty condition set matches every row.
      next unless conditions.all? { |column, value| row[column] == value }
      yield(Record.new(*row.fields))
    end
  end

  def each(&block)
    filter(&block)
  end

  # Download the index on first use, then stream it row by row.
  private def each_row(&block)
    download(@data_path, @data_url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each(&block)
    end
  end
end
|
57
|
+
|
58
|
+
# One dataset from the Rdatasets collection, addressed by R package
# name plus dataset name (e.g. "datasets", "airquality").
class Rdatasets < Dataset
  # Looks the pair up in RdatasetsList before calling super so that an
  # unknown pair fails fast.
  #
  # Raises ArgumentError when the package/dataset pair is not listed.
  def initialize(package_name, dataset_name)
    list = RdatasetsList.new

    info = list.filter(package: package_name, dataset: dataset_name).first
    unless info
      raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
    end

    super()
    @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
    @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
    @metadata.url = info.csv
    @metadata.licenses = ["GPL-3"]
    @metadata.description = info.title

    # Follow the original directory structure in the cache directory
    @data_path = cache_dir_path + (dataset_name + ".csv")

    @package_name = package_name
    @dataset_name = dataset_name
  end

  # Yield each CSV row as a Hash with Symbol keys.  The unnamed R row
  # index column (its header is the empty string) is dropped.
  # Returns an Enumerator when called without a block.
  def each(&block)
    return to_enum(__method__) unless block_given?

    download(@data_path, @metadata.url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each do |row|
        record = row.to_h
        record.delete("")
        yield(record.transform_keys(&:to_sym))
      end
    end
  end
end
|
95
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Datasets
|
2
|
+
# Sample datasets from the mwaskom/seaborn-data repository (the data
# used by the seaborn Python library), fetched as CSV from GitHub.
class SeabornData < Dataset
  URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze

  # name: the dataset's file name without the ".csv" suffix
  # (e.g. "flights", "penguins").
  def initialize(name)
    super()
    @metadata.id = "seaborn-data-#{name}"
    @metadata.name = "SeabornData: #{name}"
    @metadata.url = URL_FORMAT % {name: name}

    @data_path = cache_dir_path + (name + ".csv")
    @name = name
  end

  # Yield each row as a Hash with Symbol keys, applying the same
  # per-dataset preprocessing as seaborn's load_dataset function.
  # Returns an Enumerator when called without a block.
  def each(&block)
    return to_enum(__method__) unless block_given?

    download(@data_path, @metadata.url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each { |row| yield(prepare_record(row)) }
    end
  end

  private
  # Convert a CSV::Row into a Symbol-keyed Hash and dispatch to a
  # dataset-specific preprocessor when one is defined below.
  def prepare_record(csv_row)
    record = csv_row.to_h.transform_keys(&:to_sym)

    # Perform the same preprocessing as seaborn's load_dataset function
    preprocessor = :"preprocess_#{@name}_record"
    __send__(preprocessor, record) if respond_to?(preprocessor, true)

    record
  end

  # The same preprocessing as seaborn.load_dataset:
  # truncate month names to their first three letters.
  def preprocess_flights_record(record)
    record[:month] &&= record[:month][0, 3]
  end

  # The same preprocessing as seaborn.load_dataset:
  # capitalize the sex column.
  def preprocess_penguins_record(record)
    record[:sex] &&= record[:sex].capitalize
  end
end
|
49
|
+
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class SudachiSynonymDictionary < Dataset
|
7
|
+
# One entry of SudachiDict's synonyms.txt.
#
# group_id       - String ID shared by every entry in a synonym group
# is_noun        - true when the raw flag column is "1"
# expansion_type - :always / :expanded / :never
# lexeme_id      - Integer lexeme ID within the group
# form_type      - :typical / :translation / :alias / :old_name / :misnomer
# acronym_type   - :typical / :alphabet / :others
# variant_type   - :typical / :alphabet / :general / :misspelled
# categories     - Array of String category names, or nil
# notation       - the synonym's surface form
class Synonym < Struct.new(*%i[group_id
                               is_noun
                               expansion_type
                               lexeme_id
                               form_type
                               acronym_type
                               variant_type
                               categories
                               notation])
  # Predicate-style accessor for the is_noun member.
  alias_method :noun?, :is_noun
end
|
18
|
+
|
19
|
+
# Set up metadata for the Sudachi synonym dictionary dataset.  The
# description is wrapped in a lambda so the upstream synonyms.md is
# only downloaded when the description is actually requested.
def initialize
  super()
  @metadata.id = "sudachi-synonym-dictionary"
  @metadata.name = "Sudachi synonym dictionary"
  @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
  @metadata.licenses = ["Apache-2.0"]
  @metadata.description = lambda { download_description }
end
|
31
|
+
|
32
|
+
# Yield a Synonym for every line of synonyms.txt.
# Returns an Enumerator when called without a block.
#
# The per-group counter in +context+ is reset whenever the group ID
# changes and is handed to normalize_lexeme_id for rows whose lexeme
# ID column is blank.
def each
  return to_enum(__method__) unless block_given?

  context = {}
  open_data do |csv|
    csv.each do |row|
      group_id = row[0]
      unless group_id == context[:group_id]
        context[:group_id] = group_id
        context[:counter] = 0
      end
      yield(Synonym.new(group_id,
                        row[1] == "1",                     # is_noun flag
                        normalize_expansion_type(row[2]),
                        normalize_lexeme_id(row[3], context),
                        normalize_form_type(row[4]),
                        normalize_acronym_type(row[5]),
                        normalize_variant_type(row[6]),
                        normalize_categories(row[7]),
                        row[8]))                           # notation
    end
  end
end
|
64
|
+
|
65
|
+
private
|
66
|
+
# Ensure synonyms.txt is cached locally, then yield it opened as CSV.
# Blank lines are skipped (the upstream file uses them as group
# separators).
def open_data(&block)
  data_path = cache_dir_path + "synonyms.txt"
  data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
  download(data_path, data_url) unless data_path.exist?
  CSV.open(data_path, skip_blanks: true, &block)
end
|
76
|
+
|
77
|
+
# Download (once) and return the upstream synonyms.md document, which
# serves as this dataset's description.
def download_description
  path = cache_dir_path + "synonyms.md"
  url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
  download(path, url) unless path.exist?
  path.read
end
|
85
|
+
|
86
|
+
# Map the raw expansion-control column to a Symbol.  A blank field is
# equivalent to "0" (always usable for expansion).
#
# Raises Datasets::Error on an unrecognized value.
def normalize_expansion_type(type)
  mapping = {
    ""  => :always,
    "0" => :always,
    "1" => :expanded,
    "2" => :never,
  }
  mapping.fetch(type) do
    raise Error, "unknown expansion type: #{type.inspect}"
  end
end
|
98
|
+
|
99
|
+
# Resolve the lexeme ID column to an Integer.
#
# id      - the raw column value; blank means "next lexeme in the
#           current group"
# context - the caller's per-group state Hash ({:group_id, :counter});
#           :counter is advanced for blank IDs
#
# Fix: the original body referenced +lexeme_id_context+ — the caller's
# local variable in #each — instead of the +context+ parameter, so any
# row with a blank lexeme ID raised NameError.
def normalize_lexeme_id(id, context)
  case id
  when ""
    context[:counter] += 1
    context[:counter]
  else
    # Use only the first lexeme ID.
    # Example:
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
    Integer(id.split("/").first, 10)
  end
end
|
112
|
+
|
113
|
+
# Map the raw form-type column to a Symbol.  A blank field is
# equivalent to "0" (typical form).
#
# Raises Datasets::Error on an unrecognized value.
def normalize_form_type(type)
  case type
  when "0", "" then :typical
  when "1"     then :translation
  when "2"     then :alias
  when "3"     then :old_name
  when "4"     then :misnomer
  else
    raise Error, "unknown form type: #{type.inspect}"
  end
end
|
129
|
+
|
130
|
+
# Map the raw acronym-type column to a Symbol.  A blank field is
# equivalent to "0" (typical).
#
# Raises Datasets::Error on an unrecognized value.
def normalize_acronym_type(type)
  mapping = {
    ""  => :typical,
    "0" => :typical,
    "1" => :alphabet,
    "2" => :others,
  }
  mapping.fetch(type) do
    raise Error, "unknown acronym type: #{type.inspect}"
  end
end
|
142
|
+
|
143
|
+
# Map the raw variant-type column to a Symbol.  A blank field is
# equivalent to "0" (typical).
#
# Raises Datasets::Error on an unrecognized value.
def normalize_variant_type(type)
  case type
  when "0", "" then :typical
  when "1"     then :alphabet
  when "2"     then :general
  when "3"     then :misspelled
  else
    raise Error, "unknown variant type: #{type.inspect}"
  end
end
|
157
|
+
|
158
|
+
# Parse the category column: "(cat1/cat2/...)" becomes an Array of
# category names; a blank field becomes nil.
#
# Raises Datasets::Error when the value is neither blank nor a
# parenthesized list.
def normalize_categories(categories)
  return nil if categories == ""

  if /\A\((.*)\)\z/ =~ categories
    Regexp.last_match(1).split("/")
  else
    raise Error, "invalid categories: #{categories.inspect}"
  end
end
|
168
|
+
end
|
169
|
+
end
|
data/lib/datasets/table.rb
CHANGED
@@ -2,19 +2,99 @@ require "datasets/dictionary"
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class Table
|
5
|
+
# A lazy view of one row of a Table: values are looked up in the
# owning table's columns on demand rather than copied out.
class Record
  include Enumerable

  # table - the owning Datasets::Table
  # index - the zero-based row position within the table
  def initialize(table, index)
    @table = table
    @index = index
  end

  # This row's value for a column, addressed by name or column index.
  def [](column_name_or_column_index)
    column = @table[column_name_or_column_index]
    column[@index]
  end

  # Yield [column_name, value] pairs for this row.
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(__method__) unless block_given?
    @table.each_column.each do |name, column_values|
      yield(name, column_values[@index])
    end
  end

  # All of this row's values, in column order.
  def values
    @table.each_column.collect do |_name, column_values|
      column_values[@index]
    end
  end

  # This row as a {column_name => value} Hash.
  def to_h
    each_with_object({}) do |(name, value), hash|
      hash[name] = value
    end
  end

  def inspect
    dataset_name = @table.dataset.metadata.name
    "#<#{self.class.name} #{dataset_name}[#{@index}] #{to_h.inspect}>"
  end
end
|
42
|
+
|
5
43
|
include Enumerable
|
6
44
|
|
45
|
+
attr_reader :dataset
|
7
46
|
def initialize(dataset)
|
8
47
|
@dataset = dataset
|
9
48
|
@dictionaries = {}
|
10
49
|
end
|
11
50
|
|
12
|
-
def
|
51
|
+
def n_columns
|
52
|
+
columner_data.size
|
53
|
+
end
|
54
|
+
alias_method :size, :n_columns
|
55
|
+
alias_method :length, :n_columns
|
56
|
+
|
57
|
+
def n_rows
|
58
|
+
first_column = columner_data.first
|
59
|
+
return 0 if first_column.nil?
|
60
|
+
first_column[1].size
|
61
|
+
end
|
62
|
+
|
63
|
+
def column_names
|
64
|
+
columner_data.keys
|
65
|
+
end
|
66
|
+
|
67
|
+
def each_column(&block)
|
13
68
|
columner_data.each(&block)
|
14
69
|
end
|
70
|
+
alias_method :each, :each_column
|
15
71
|
|
16
|
-
def
|
17
|
-
|
72
|
+
def each_record
|
73
|
+
return to_enum(__method__) unless block_given?
|
74
|
+
n_rows.times do |i|
|
75
|
+
yield(Record.new(self, i))
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_record(row)
|
80
|
+
row += n_rows if row < 0
|
81
|
+
return nil if row < 0
|
82
|
+
return nil if row >= n_rows
|
83
|
+
Record.new(self, row)
|
84
|
+
end
|
85
|
+
|
86
|
+
def [](name_or_index)
|
87
|
+
case name_or_index
|
88
|
+
when Integer
|
89
|
+
index = name_or_index
|
90
|
+
columner_data.each_with_index do |(_name, values), i|
|
91
|
+
return values if i == index
|
92
|
+
end
|
93
|
+
nil
|
94
|
+
else
|
95
|
+
name = name_or_index
|
96
|
+
columner_data[normalize_name(name)]
|
97
|
+
end
|
18
98
|
end
|
19
99
|
|
20
100
|
def dictionary_encode(name)
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -52,7 +52,7 @@ module Datasets
|
|
52
52
|
end
|
53
53
|
|
54
54
|
private
|
55
|
-
def open_data
|
55
|
+
def open_data(&block)
|
56
56
|
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
|
57
57
|
data_path = cache_dir_path + base_name
|
58
58
|
unless data_path.exist?
|
@@ -60,15 +60,7 @@ module Datasets
|
|
60
60
|
download(data_path, data_url)
|
61
61
|
end
|
62
62
|
|
63
|
-
|
64
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
65
|
-
begin
|
66
|
-
output.close
|
67
|
-
yield(input)
|
68
|
-
ensure
|
69
|
-
input.close
|
70
|
-
Process.waitpid(pid)
|
71
|
-
end
|
63
|
+
extract_bz2(data_path, &block)
|
72
64
|
end
|
73
65
|
|
74
66
|
def type_in_path
|