RubyGems - red-datasets - Versions diffs - 0.1.2 → 0.1.3 - Mend

red-datasets 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/doc/text/news.md +8 -0
data/lib/datasets.rb +2 -0
data/lib/datasets/cifar.rb +1 -1
data/lib/datasets/cldr-plurals.rb +4 -4
data/lib/datasets/mnist.rb +6 -2
data/lib/datasets/rdatasets.rb +1 -1
data/lib/datasets/seaborn-data.rb +49 -0
data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
data/lib/datasets/{tar_gz_readable.rb → tar-gz-readable.rb} +0 -0
data/lib/datasets/version.rb +1 -1
data/test/test-cldr-plurals.rb +1 -1
data/test/test-rdatasets.rb +1 -1
data/test/test-seaborn-data.rb +97 -0
data/test/test-sudachi-synonym-dictionary.rb +48 -0
metadata +9 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6fbd4d11063f89ba2e09250b751886086c953ec8bc92c75a6a351c31a36da0c4
-  data.tar.gz: acc6ff31f0f4ae3a6c6565fe569233c01615718c01300b0838ff744571edc34d
+  metadata.gz: 07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b
+  data.tar.gz: a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab
 SHA512:
-  metadata.gz: 26361511155b447ffed56a79b2336a9a1db96494bf856b23e7b39cc6a8b6a2039e7ed27564140761bdb2daaae7ee563b3695c464a7a7b21ff93b0636f6b8338d
-  data.tar.gz: 40446f90e410e0d86abeec186a1d7adcc5375e29c19dc934f823befb26a87d904458ef5ea18c9d64055493d29ed305dba53d6e4d86bd7d84488baf3745ebd792
+  metadata.gz: 4871ac4ec167cb78b3fce8f9c5de9f6cff6a4089b8e4fd87fe7bb3265865cfcbd86935e8f2fa0bc5e40fde8a471e1655390fcdf8dcc0a5197342143e0cb855e5
+  data.tar.gz: 66d31943cb857632518a90166972bfa9ebe4b8ec355eca8291da40183c260e3c175d5866220efc0e4174f780c8449b45004db425a8fc7453268236f9b7dcbc1d

data/doc/text/news.md CHANGED Viewed

@@ -1,5 +1,13 @@
 # News
+## 0.1.3 - 2021-07-09
+### Improvements
+  * `Datasets::SeabornData`: Added.
+  * `Datasets::SudachiSynonymDictionary`: Added.
 ## 0.1.2 - 2021-06-03
 ### Improvements

data/lib/datasets.rb CHANGED Viewed

@@ -16,5 +16,7 @@ require_relative "datasets/penguins"
 require_relative "datasets/penn-treebank"
 require_relative "datasets/postal-code-japan"
 require_relative "datasets/rdatasets"
+require_relative "datasets/seaborn-data"
+require_relative "datasets/sudachi-synonym-dictionary"
 require_relative "datasets/wikipedia"
 require_relative "datasets/wine"

data/lib/datasets/cifar.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-require_relative "tar_gz_readable"
+require_relative "tar-gz-readable"
 require_relative "dataset"
 module Datasets

data/lib/datasets/cldr-plurals.rb CHANGED Viewed

@@ -183,7 +183,7 @@ module Datasets
         end
         value = parse_value
         if value.nil?
-          raise Error.new("no value for #{operator}: #{@scanner.inspect}")
+          raise Error, "no value for #{operator}: #{@scanner.inspect}"
         end
         [operator, expr, value]
       end
@@ -267,7 +267,7 @@ module Datasets
         if operator
           value = parse_value
           if value.nil?
-            raise Error.new("no value for #{operator}: #{@scanner.inspect}")
+            raise Error, "no value for #{operator}: #{@scanner.inspect}"
           end
           [operator, operand, value]
         else
@@ -336,7 +336,7 @@ module Datasets
           skip_whitespaces
           # U+2026 HORIZONTAL ELLIPSIS
           unless @scanner.scan(/\u2026|\.\.\./)
-            raise "no ellipsis: #{@scanner.inspect}"
+            raise Error, "no ellipsis: #{@scanner.inspect}"
           end
           samples << :elipsis
         end
@@ -362,7 +362,7 @@ module Datasets
           skip_whitespaces
           decimal = @scanner.scan(/[0-9]+/)
           if decimal.nil?
-            raise "no decimal: #{@scanner.inspect}"
+            raise Error, "no decimal: #{@scanner.inspect}"
           end
           value += Float("0.#{decimal}")
           skip_whitespaces

data/lib/datasets/mnist.rb CHANGED Viewed

@@ -65,7 +65,9 @@ module Datasets
         n_bytes = n_uint32s * 4
         mnist_magic_number = 2051
         magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
-        raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
+        if magic != mnist_magic_number
+          raise Error, "This is not #{dataset_name} image file"
+        end
         n_images.times do |i|
           data = f.read(n_rows * n_cols)
           label = labels[i]
@@ -99,7 +101,9 @@ module Datasets
         n_bytes = n_uint32s * 2
         mnist_magic_number = 2049
         magic, n_labels = f.read(n_bytes).unpack('N2')
-        raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
+        if magic != mnist_magic_number
+          raise Error, "This is not #{dataset_name} label file"
+        end
         f.read(n_labels).unpack('C*')
       end
     end

data/lib/datasets/rdatasets.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require_relative "dataset"
-require_relative "tar_gz_readable"
+require_relative "tar-gz-readable"
 module Datasets
   class RdatasetsList < Dataset

data/lib/datasets/seaborn-data.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module Datasets
+  class SeabornData < Dataset
+    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
+    def initialize(name)
+      super()
+      @metadata.id = "seaborn-data-#{name}"
+      @metadata.name = "SeabornData: #{name}"
+      @metadata.url = URL_FORMAT % {name: name}
+      @data_path = cache_dir_path + (name + ".csv")
+      @name = name
+    end
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+      download(@data_path, @metadata.url) unless @data_path.exist?
+      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each do |row|
+          record = prepare_record(row)
+          yield record
+        end
+      end
+    end
+    private
+    def prepare_record(csv_row)
+      record = csv_row.to_h
+      record.transform_keys!(&:to_sym)
+      # Perform the same preprocessing as seaborn's load_dataset function
+      preprocessor = :"preprocess_#{@name}_record"
+      __send__(preprocessor, record) if respond_to?(preprocessor, true)
+      record
+    end
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_flights_record(record)
+      record[:month] &&= record[:month][0,3]
+    end
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_penguins_record(record)
+      record[:sex] &&= record[:sex].capitalize
+    end
+  end
+end

data/lib/datasets/sudachi-synonym-dictionary.rb ADDED Viewed

@@ -0,0 +1,169 @@
+require "csv"
+require_relative "dataset"
+module Datasets
+  class SudachiSynonymDictionary < Dataset
+    class Synonym < Struct.new(:group_id,
+                               :is_noun,
+                               :expansion_type,
+                               :lexeme_id,
+                               :form_type,
+                               :acronym_type,
+                               :variant_type,
+                               :categories,
+                               :notation)
+      alias_method :noun?, :is_noun
+    end
+    def initialize
+      super()
+      @metadata.id = "sudachi-synonym-dictionary"
+      @metadata.name = "Sudachi synonym dictionary"
+      @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
+      @metadata.licenses = [
+        "Apache-2.0",
+      ]
+      @metadata.description = lambda do
+        download_description
+      end
+    end
+    def each
+      return to_enum(__method__) unless block_given?
+      lexeme_id_context = {}
+      open_data do |csv|
+        csv.each do |row|
+          group_id = row[0]
+          if group_id != lexeme_id_context[:group_id]
+            lexeme_id_context[:group_id] = group_id
+            lexeme_id_context[:counter] = 0
+          end
+          is_noun = (row[1] == "1")
+          expansion_type = normalize_expansion_type(row[2])
+          lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
+          form_type = normalize_form_type(row[4])
+          acronym_type = normalize_acronym_type(row[5])
+          variant_type = normalize_variant_type(row[6])
+          categories = normalize_categories(row[7])
+          notation = row[8]
+          synonym = Synonym.new(group_id,
+                                is_noun,
+                                expansion_type,
+                                lexeme_id,
+                                form_type,
+                                acronym_type,
+                                variant_type,
+                                categories,
+                                notation)
+          yield(synonym)
+        end
+      end
+    end
+    private
+    def open_data
+      data_path = cache_dir_path + "synonyms.txt"
+      unless data_path.exist?
+        data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
+        download(data_path, data_url)
+      end
+      CSV.open(data_path, skip_blanks: true) do |csv|
+        yield(csv)
+      end
+    end
+    def download_description
+      description_path = cache_dir_path + "synonyms.md"
+      unless description_path.exist?
+        description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
+        download(description_path, description_url)
+      end
+      description_path.read
+    end
+    def normalize_expansion_type(type)
+      case type
+      when "0", ""
+        :always
+      when "1"
+        :expanded
+      when "2"
+        :never
+      else
+        raise Error, "unknown expansion type: #{type.inspect}"
+      end
+    end
+    def normalize_lexeme_id(id, context)
+      case id
+      when ""
+        lexeme_id_context[:counter] += 1
+        lexeme_id_context[:counter]
+      else
+        # Use only the first lexeme ID.
+        # Example:
+        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
+        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
+        Integer(id.split("/").first, 10)
+      end
+    end
+    def normalize_form_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :translation
+      when "2"
+        :alias
+      when "3"
+        :old_name
+      when "4"
+        :misnomer
+      else
+        raise Error, "unknown form type: #{type.inspect}"
+      end
+    end
+    def normalize_acronym_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :alphabet
+      when "2"
+        :others
+      else
+        raise Error, "unknown acronym type: #{type.inspect}"
+      end
+    end
+    def normalize_variant_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :alphabet
+      when "2"
+        :general
+      when "3"
+        :misspelled
+      else
+        raise Error, "unknown variant type: #{type.inspect}"
+      end
+    end
+    def normalize_categories(categories)
+      case categories
+      when ""
+        nil
+      when /\A\((.*)\)\z/
+        $1.split("/")
+      else
+        raise Error, "invalid categories: #{categories.inspect}"
+      end
+    end
+  end
+end

data/lib/datasets/{tar_gz_readable.rb → tar-gz-readable.rb} RENAMED Viewed

File without changes

data/lib/datasets/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Datasets
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
 end

data/test/test-cldr-plurals.rb CHANGED Viewed

@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
   test("#each") do
     locales = @dataset.each.to_a
     assert_equal([
-                   215,
+                   218,
                    locale("bm",
                           [
                             rule("other",

data/test/test-rdatasets.rb CHANGED Viewed

@@ -48,7 +48,7 @@ class RdatasetsTest < Test::Unit::TestCase
       test("without package_name") do
         records = @dataset.each.to_a
         assert_equal([
-                       1478,
+                       1714,
                        {
                          package: "AER",
                          dataset: "Affairs",

data/test/test-seaborn-data.rb ADDED Viewed

@@ -0,0 +1,97 @@
+class SeabornDataTest < Test::Unit::TestCase
+  sub_test_case("fmri") do
+    def setup
+      @dataset = Datasets::SeabornData.new("fmri")
+    end
+    def test_each
+      records = @dataset.each.to_a
+      assert_equal([
+                     1064,
+                     {
+                       subject: "s5",
+                       timepoint: 14,
+                       event: "stim",
+                       region: "parietal",
+                       signal: -0.0808829319505
+                     },
+                     {
+                       subject: "s0",
+                       timepoint: 0,
+                       event: "cue",
+                       region: "parietal",
+                       signal: -0.00689923478092
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[1].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+  sub_test_case("flights") do
+    def setup
+      @dataset = Datasets::SeabornData.new("flights")
+    end
+    def test_each
+      records = @dataset.each.to_a
+      assert_equal([
+                     144,
+                     {
+                       year: 1949,
+                       month: "Feb",
+                       passengers: 118
+                     },
+                     {
+                       year: 1960,
+                       month: "Dec",
+                       passengers: 432
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[1].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+  sub_test_case("penguins") do
+    def setup
+      @dataset = Datasets::SeabornData.new("penguins")
+    end
+    def test_each
+      records = @dataset.each.to_a
+      assert_equal([
+                     344,
+                     {
+                       species: "Adelie",
+                       island: "Torgersen",
+                       bill_length_mm: 39.5,
+                       bill_depth_mm: 17.4,
+                       flipper_length_mm: 186,
+                       body_mass_g: 3800,
+                       sex: "Female"
+                     },
+                     {
+                       species: "Gentoo",
+                       island: "Biscoe",
+                       bill_length_mm: 49.9,
+                       bill_depth_mm: 16.1,
+                       flipper_length_mm: 213,
+                       body_mass_g: 5400,
+                       sex: "Male"
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[1].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+end

data/test/test-sudachi-synonym-dictionary.rb ADDED Viewed

@@ -0,0 +1,48 @@
+class SudachiSynonymDictionaryTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::SudachiSynonymDictionary.new
+  end
+  test('#each') do
+    records = @dataset.each.to_a
+    assert_equal([
+                   61335,
+                   {
+                     group_id: "000001",
+                     is_noun: true,
+                     expansion_type: :always,
+                     lexeme_id: 1,
+                     form_type: :typical,
+                     acronym_type: :typical,
+                     variant_type: :typical,
+                     categories: [],
+                     notation: "曖昧",
+                   },
+                   {
+                     group_id: "023705",
+                     is_noun: true,
+                     expansion_type: :always,
+                     lexeme_id: 1,
+                     form_type: :typical,
+                     acronym_type: :alphabet,
+                     variant_type: :typical,
+                     categories: ["単位"],
+                     notation: "GB",
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h,
+                 ])
+  end
+  sub_test_case('#metadata') do
+    test('#description') do
+      description = @dataset.metadata.description
+      assert do
+        description.start_with?('# Sudachi 同義語辞書')
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-datasets
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - tomisuker
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-06-03 00:00:00.000000000 Z
+date: 2021-07-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: csv
@@ -162,8 +162,10 @@ files:
 - lib/datasets/penn-treebank.rb
 - lib/datasets/postal-code-japan.rb
 - lib/datasets/rdatasets.rb
+- lib/datasets/seaborn-data.rb
+- lib/datasets/sudachi-synonym-dictionary.rb
 - lib/datasets/table.rb
-- lib/datasets/tar_gz_readable.rb
+- lib/datasets/tar-gz-readable.rb
 - lib/datasets/version.rb
 - lib/datasets/wikipedia.rb
 - lib/datasets/wine.rb
@@ -189,6 +191,8 @@ files:
 - test/test-penn-treebank.rb
 - test/test-postal-code-japan.rb
 - test/test-rdatasets.rb
+- test/test-seaborn-data.rb
+- test/test-sudachi-synonym-dictionary.rb
 - test/test-table.rb
 - test/test-wikipedia.rb
 - test/test-wine.rb
@@ -237,6 +241,8 @@ test_files:
 - test/test-penn-treebank.rb
 - test/test-postal-code-japan.rb
 - test/test-rdatasets.rb
+- test/test-seaborn-data.rb
+- test/test-sudachi-synonym-dictionary.rb
 - test/test-table.rb
 - test/test-wikipedia.rb
 - test/test-wine.rb