red-datasets 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6fbd4d11063f89ba2e09250b751886086c953ec8bc92c75a6a351c31a36da0c4
4
- data.tar.gz: acc6ff31f0f4ae3a6c6565fe569233c01615718c01300b0838ff744571edc34d
3
+ metadata.gz: 07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b
4
+ data.tar.gz: a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab
5
5
  SHA512:
6
- metadata.gz: 26361511155b447ffed56a79b2336a9a1db96494bf856b23e7b39cc6a8b6a2039e7ed27564140761bdb2daaae7ee563b3695c464a7a7b21ff93b0636f6b8338d
7
- data.tar.gz: 40446f90e410e0d86abeec186a1d7adcc5375e29c19dc934f823befb26a87d904458ef5ea18c9d64055493d29ed305dba53d6e4d86bd7d84488baf3745ebd792
6
+ metadata.gz: 4871ac4ec167cb78b3fce8f9c5de9f6cff6a4089b8e4fd87fe7bb3265865cfcbd86935e8f2fa0bc5e40fde8a471e1655390fcdf8dcc0a5197342143e0cb855e5
7
+ data.tar.gz: 66d31943cb857632518a90166972bfa9ebe4b8ec355eca8291da40183c260e3c175d5866220efc0e4174f780c8449b45004db425a8fc7453268236f9b7dcbc1d
data/doc/text/news.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # News
2
2
 
3
+ ## 0.1.3 - 2021-07-09
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::SeabornData`: Added.
8
+
9
+ * `Datasets::SudachiSynonymDictionary`: Added.
10
+
3
11
  ## 0.1.2 - 2021-06-03
4
12
 
5
13
  ### Improvements
data/lib/datasets.rb CHANGED
@@ -16,5 +16,7 @@ require_relative "datasets/penguins"
16
16
  require_relative "datasets/penn-treebank"
17
17
  require_relative "datasets/postal-code-japan"
18
18
  require_relative "datasets/rdatasets"
19
+ require_relative "datasets/seaborn-data"
20
+ require_relative "datasets/sudachi-synonym-dictionary"
19
21
  require_relative "datasets/wikipedia"
20
22
  require_relative "datasets/wine"
@@ -1,4 +1,4 @@
1
- require_relative "tar_gz_readable"
1
+ require_relative "tar-gz-readable"
2
2
  require_relative "dataset"
3
3
 
4
4
  module Datasets
@@ -183,7 +183,7 @@ module Datasets
183
183
  end
184
184
  value = parse_value
185
185
  if value.nil?
186
- raise Error.new("no value for #{operator}: #{@scanner.inspect}")
186
+ raise Error, "no value for #{operator}: #{@scanner.inspect}"
187
187
  end
188
188
  [operator, expr, value]
189
189
  end
@@ -267,7 +267,7 @@ module Datasets
267
267
  if operator
268
268
  value = parse_value
269
269
  if value.nil?
270
- raise Error.new("no value for #{operator}: #{@scanner.inspect}")
270
+ raise Error, "no value for #{operator}: #{@scanner.inspect}"
271
271
  end
272
272
  [operator, operand, value]
273
273
  else
@@ -336,7 +336,7 @@ module Datasets
336
336
  skip_whitespaces
337
337
  # U+2026 HORIZONTAL ELLIPSIS
338
338
  unless @scanner.scan(/\u2026|\.\.\./)
339
- raise "no ellipsis: #{@scanner.inspect}"
339
+ raise Error, "no ellipsis: #{@scanner.inspect}"
340
340
  end
341
341
  samples << :elipsis
342
342
  end
@@ -362,7 +362,7 @@ module Datasets
362
362
  skip_whitespaces
363
363
  decimal = @scanner.scan(/[0-9]+/)
364
364
  if decimal.nil?
365
- raise "no decimal: #{@scanner.inspect}"
365
+ raise Error, "no decimal: #{@scanner.inspect}"
366
366
  end
367
367
  value += Float("0.#{decimal}")
368
368
  skip_whitespaces
@@ -65,7 +65,9 @@ module Datasets
65
65
  n_bytes = n_uint32s * 4
66
66
  mnist_magic_number = 2051
67
67
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
68
- raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
68
+ if magic != mnist_magic_number
69
+ raise Error, "This is not #{dataset_name} image file"
70
+ end
69
71
  n_images.times do |i|
70
72
  data = f.read(n_rows * n_cols)
71
73
  label = labels[i]
@@ -99,7 +101,9 @@ module Datasets
99
101
  n_bytes = n_uint32s * 2
100
102
  mnist_magic_number = 2049
101
103
  magic, n_labels = f.read(n_bytes).unpack('N2')
102
- raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
+ if magic != mnist_magic_number
105
+ raise Error, "This is not #{dataset_name} label file"
106
+ end
103
107
  f.read(n_labels).unpack('C*')
104
108
  end
105
109
  end
@@ -1,5 +1,5 @@
1
1
  require_relative "dataset"
2
- require_relative "tar_gz_readable"
2
+ require_relative "tar-gz-readable"
3
3
 
4
4
  module Datasets
5
5
  class RdatasetsList < Dataset
@@ -0,0 +1,49 @@
1
+ module Datasets
2
+ class SeabornData < Dataset
3
+ URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
+
5
+ def initialize(name)
6
+ super()
7
+ @metadata.id = "seaborn-data-#{name}"
8
+ @metadata.name = "SeabornData: #{name}"
9
+ @metadata.url = URL_FORMAT % {name: name}
10
+
11
+ @data_path = cache_dir_path + (name + ".csv")
12
+ @name = name
13
+ end
14
+
15
+ def each(&block)
16
+ return to_enum(__method__) unless block_given?
17
+
18
+ download(@data_path, @metadata.url) unless @data_path.exist?
19
+ CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
20
+ csv.each do |row|
21
+ record = prepare_record(row)
22
+ yield record
23
+ end
24
+ end
25
+ end
26
+
27
+ private
28
+ def prepare_record(csv_row)
29
+ record = csv_row.to_h
30
+ record.transform_keys!(&:to_sym)
31
+
32
+ # Perform the same preprocessing as seaborn's load_dataset function
33
+ preprocessor = :"preprocess_#{@name}_record"
34
+ __send__(preprocessor, record) if respond_to?(preprocessor, true)
35
+
36
+ record
37
+ end
38
+
39
+ # The same preprocessing as seaborn.load_dataset
40
+ def preprocess_flights_record(record)
41
+ record[:month] &&= record[:month][0,3]
42
+ end
43
+
44
+ # The same preprocessing as seaborn.load_dataset
45
+ def preprocess_penguins_record(record)
46
+ record[:sex] &&= record[:sex].capitalize
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,169 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class SudachiSynonymDictionary < Dataset
7
+ class Synonym < Struct.new(:group_id,
8
+ :is_noun,
9
+ :expansion_type,
10
+ :lexeme_id,
11
+ :form_type,
12
+ :acronym_type,
13
+ :variant_type,
14
+ :categories,
15
+ :notation)
16
+ alias_method :noun?, :is_noun
17
+ end
18
+
19
+ def initialize
20
+ super()
21
+ @metadata.id = "sudachi-synonym-dictionary"
22
+ @metadata.name = "Sudachi synonym dictionary"
23
+ @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
24
+ @metadata.licenses = [
25
+ "Apache-2.0",
26
+ ]
27
+ @metadata.description = lambda do
28
+ download_description
29
+ end
30
+ end
31
+
32
+ def each
33
+ return to_enum(__method__) unless block_given?
34
+
35
+ lexeme_id_context = {}
36
+ open_data do |csv|
37
+ csv.each do |row|
38
+ group_id = row[0]
39
+ if group_id != lexeme_id_context[:group_id]
40
+ lexeme_id_context[:group_id] = group_id
41
+ lexeme_id_context[:counter] = 0
42
+ end
43
+ is_noun = (row[1] == "1")
44
+ expansion_type = normalize_expansion_type(row[2])
45
+ lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
46
+ form_type = normalize_form_type(row[4])
47
+ acronym_type = normalize_acronym_type(row[5])
48
+ variant_type = normalize_variant_type(row[6])
49
+ categories = normalize_categories(row[7])
50
+ notation = row[8]
51
+ synonym = Synonym.new(group_id,
52
+ is_noun,
53
+ expansion_type,
54
+ lexeme_id,
55
+ form_type,
56
+ acronym_type,
57
+ variant_type,
58
+ categories,
59
+ notation)
60
+ yield(synonym)
61
+ end
62
+ end
63
+ end
64
+
65
+ private
66
+ def open_data
67
+ data_path = cache_dir_path + "synonyms.txt"
68
+ unless data_path.exist?
69
+ data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
70
+ download(data_path, data_url)
71
+ end
72
+ CSV.open(data_path, skip_blanks: true) do |csv|
73
+ yield(csv)
74
+ end
75
+ end
76
+
77
+ def download_description
78
+ description_path = cache_dir_path + "synonyms.md"
79
+ unless description_path.exist?
80
+ description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
81
+ download(description_path, description_url)
82
+ end
83
+ description_path.read
84
+ end
85
+
86
+ def normalize_expansion_type(type)
87
+ case type
88
+ when "0", ""
89
+ :always
90
+ when "1"
91
+ :expanded
92
+ when "2"
93
+ :never
94
+ else
95
+ raise Error, "unknown expansion type: #{type.inspect}"
96
+ end
97
+ end
98
+
99
+ def normalize_lexeme_id(id, context)
100
+ case id
101
+ when ""
102
+ context[:counter] += 1
103
+ context[:counter]
104
+ else
105
+ # Use only the first lexeme ID.
106
+ # Example:
107
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
108
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
109
+ Integer(id.split("/").first, 10)
110
+ end
111
+ end
112
+
113
+ def normalize_form_type(type)
114
+ case type
115
+ when "0", ""
116
+ :typical
117
+ when "1"
118
+ :translation
119
+ when "2"
120
+ :alias
121
+ when "3"
122
+ :old_name
123
+ when "4"
124
+ :misnomer
125
+ else
126
+ raise Error, "unknown form type: #{type.inspect}"
127
+ end
128
+ end
129
+
130
+ def normalize_acronym_type(type)
131
+ case type
132
+ when "0", ""
133
+ :typical
134
+ when "1"
135
+ :alphabet
136
+ when "2"
137
+ :others
138
+ else
139
+ raise Error, "unknown acronym type: #{type.inspect}"
140
+ end
141
+ end
142
+
143
+ def normalize_variant_type(type)
144
+ case type
145
+ when "0", ""
146
+ :typical
147
+ when "1"
148
+ :alphabet
149
+ when "2"
150
+ :general
151
+ when "3"
152
+ :misspelled
153
+ else
154
+ raise Error, "unknown variant type: #{type.inspect}"
155
+ end
156
+ end
157
+
158
+ def normalize_categories(categories)
159
+ case categories
160
+ when ""
161
+ nil
162
+ when /\A\((.*)\)\z/
163
+ $1.split("/")
164
+ else
165
+ raise Error, "invalid categories: #{categories.inspect}"
166
+ end
167
+ end
168
+ end
169
+ end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 215,
17
+ 218,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
@@ -48,7 +48,7 @@ class RdatasetsTest < Test::Unit::TestCase
48
48
  test("without package_name") do
49
49
  records = @dataset.each.to_a
50
50
  assert_equal([
51
- 1478,
51
+ 1714,
52
52
  {
53
53
  package: "AER",
54
54
  dataset: "Affairs",
@@ -0,0 +1,97 @@
1
+ class SeabornDataTest < Test::Unit::TestCase
2
+ sub_test_case("fmri") do
3
+ def setup
4
+ @dataset = Datasets::SeabornData.new("fmri")
5
+ end
6
+
7
+ def test_each
8
+ records = @dataset.each.to_a
9
+ assert_equal([
10
+ 1064,
11
+ {
12
+ subject: "s5",
13
+ timepoint: 14,
14
+ event: "stim",
15
+ region: "parietal",
16
+ signal: -0.0808829319505
17
+ },
18
+ {
19
+ subject: "s0",
20
+ timepoint: 0,
21
+ event: "cue",
22
+ region: "parietal",
23
+ signal: -0.00689923478092
24
+ }
25
+ ],
26
+ [
27
+ records.size,
28
+ records[1].to_h,
29
+ records[-1].to_h
30
+ ])
31
+ end
32
+ end
33
+
34
+ sub_test_case("flights") do
35
+ def setup
36
+ @dataset = Datasets::SeabornData.new("flights")
37
+ end
38
+
39
+ def test_each
40
+ records = @dataset.each.to_a
41
+ assert_equal([
42
+ 144,
43
+ {
44
+ year: 1949,
45
+ month: "Feb",
46
+ passengers: 118
47
+ },
48
+ {
49
+ year: 1960,
50
+ month: "Dec",
51
+ passengers: 432
52
+ }
53
+ ],
54
+ [
55
+ records.size,
56
+ records[1].to_h,
57
+ records[-1].to_h
58
+ ])
59
+ end
60
+ end
61
+
62
+ sub_test_case("penguins") do
63
+ def setup
64
+ @dataset = Datasets::SeabornData.new("penguins")
65
+ end
66
+
67
+ def test_each
68
+ records = @dataset.each.to_a
69
+ assert_equal([
70
+ 344,
71
+ {
72
+ species: "Adelie",
73
+ island: "Torgersen",
74
+ bill_length_mm: 39.5,
75
+ bill_depth_mm: 17.4,
76
+ flipper_length_mm: 186,
77
+ body_mass_g: 3800,
78
+ sex: "Female"
79
+ },
80
+ {
81
+ species: "Gentoo",
82
+ island: "Biscoe",
83
+ bill_length_mm: 49.9,
84
+ bill_depth_mm: 16.1,
85
+ flipper_length_mm: 213,
86
+ body_mass_g: 5400,
87
+ sex: "Male"
88
+ }
89
+ ],
90
+ [
91
+ records.size,
92
+ records[1].to_h,
93
+ records[-1].to_h
94
+ ])
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,48 @@
1
+ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::SudachiSynonymDictionary.new
4
+ end
5
+
6
+ test('#each') do
7
+ records = @dataset.each.to_a
8
+ assert_equal([
9
+ 61335,
10
+ {
11
+ group_id: "000001",
12
+ is_noun: true,
13
+ expansion_type: :always,
14
+ lexeme_id: 1,
15
+ form_type: :typical,
16
+ acronym_type: :typical,
17
+ variant_type: :typical,
18
+ categories: [],
19
+ notation: "曖昧",
20
+ },
21
+ {
22
+ group_id: "023705",
23
+ is_noun: true,
24
+ expansion_type: :always,
25
+ lexeme_id: 1,
26
+ form_type: :typical,
27
+ acronym_type: :alphabet,
28
+ variant_type: :typical,
29
+ categories: ["単位"],
30
+ notation: "GB",
31
+ },
32
+ ],
33
+ [
34
+ records.size,
35
+ records[0].to_h,
36
+ records[-1].to_h,
37
+ ])
38
+ end
39
+
40
+ sub_test_case('#metadata') do
41
+ test('#description') do
42
+ description = @dataset.metadata.description
43
+ assert do
44
+ description.start_with?('# Sudachi 同義語辞書')
45
+ end
46
+ end
47
+ end
48
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-06-03 00:00:00.000000000 Z
12
+ date: 2021-07-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -162,8 +162,10 @@ files:
162
162
  - lib/datasets/penn-treebank.rb
163
163
  - lib/datasets/postal-code-japan.rb
164
164
  - lib/datasets/rdatasets.rb
165
+ - lib/datasets/seaborn-data.rb
166
+ - lib/datasets/sudachi-synonym-dictionary.rb
165
167
  - lib/datasets/table.rb
166
- - lib/datasets/tar_gz_readable.rb
168
+ - lib/datasets/tar-gz-readable.rb
167
169
  - lib/datasets/version.rb
168
170
  - lib/datasets/wikipedia.rb
169
171
  - lib/datasets/wine.rb
@@ -189,6 +191,8 @@ files:
189
191
  - test/test-penn-treebank.rb
190
192
  - test/test-postal-code-japan.rb
191
193
  - test/test-rdatasets.rb
194
+ - test/test-seaborn-data.rb
195
+ - test/test-sudachi-synonym-dictionary.rb
192
196
  - test/test-table.rb
193
197
  - test/test-wikipedia.rb
194
198
  - test/test-wine.rb
@@ -237,6 +241,8 @@ test_files:
237
241
  - test/test-penn-treebank.rb
238
242
  - test/test-postal-code-japan.rb
239
243
  - test/test-rdatasets.rb
244
+ - test/test-seaborn-data.rb
245
+ - test/test-sudachi-synonym-dictionary.rb
240
246
  - test/test-table.rb
241
247
  - test/test-wikipedia.rb
242
248
  - test/test-wine.rb