red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
require "csv"

require_relative "dataset"

module Datasets
  # The "Communities and Crime" dataset from the UCI Machine Learning
  # Repository. Each record combines US census, law-enforcement (LEMAS)
  # and crime figures for one community.
  class Communities < Dataset
    # One row of communities.data. Field order matches the column order
    # of the upstream CSV file.
    Record = Struct.new(
      :state,
      :county,
      :community,
      :community_name,
      :fold,
      :population,
      :household_size,
      :race_percent_black,
      :race_percent_white,
      :race_percent_asian,
      :race_percent_hispanic,
      :age_percent_12_to_21,
      :age_percent_12_to_29,
      :age_percent_16_to_24,
      :age_percent_65_and_upper,
      :n_people_urban,
      :percent_people_urban,
      :median_income,
      :percent_households_with_wage,
      :percent_households_with_farm_self,
      :percent_households_with_investment_income,
      :percent_households_with_social_security,
      :percent_households_with_public_assistant,
      :percent_households_with_retire,
      :median_family_income,
      :per_capita_income,
      :per_capita_income_white,
      :per_capita_income_black,
      :per_capita_income_indian,
      :per_capita_income_asian,
      :per_capita_income_other,
      :per_capita_income_hispanic,
      :n_people_under_poverty,
      :percent_people_under_poverty,
      :percent_less_9th_grade,
      :percent_not_high_school_graduate,
      :percent_bachelors_or_more,
      :percent_unemployed,
      :percent_employed,
      :percent_employed_manufacturing,
      :percent_employed_professional_service,
      :percent_occupations_manufacturing,
      :percent_occupations_management_professional,
      :male_percent_divorced,
      :male_percent_never_married,
      :female_percent_divorced,
      :total_percent_divorced,
      :mean_persons_per_family,
      :percent_family_2_parents,
      :percent_kids_2_parents,
      :percent_young_kids_2_parents,
      :percent_teen_2_parents,
      :percent_work_mom_young_kids,
      :percent_work_mom,
      :n_illegals,
      :percent_illegals,
      :n_immigrants,
      :percent_immigrants_recent,
      :percent_immigrants_recent_5,
      :percent_immigrants_recent_8,
      :percent_immigrants_recent_10,
      :percent_population_immigranted_recent,
      :percent_population_immigranted_recent_5,
      :percent_population_immigranted_recent_8,
      :percent_population_immigranted_recent_10,
      :percent_speak_english_only,
      :percent_not_speak_english_well,
      :percent_large_households_family,
      :percent_large_households_occupied,
      :mean_persons_per_occupied_household,
      :mean_persons_per_owner_occupied_household,
      :mean_persons_per_rental_occupied_household,
      :percent_persons_owner_occupied_household,
      :percent_persons_dense_housing,
      :percent_housing_less_3_bedrooms,
      :median_n_bedrooms,
      :n_vacant_households,
      :percent_housing_occupied,
      :percent_housing_owner_occupied,
      :percent_vacant_housing_boarded,
      :percent_vacant_housing_more_6_months,
      :median_year_housing_built,
      :percent_housing_no_phone,
      :percent_housing_without_full_plumbing,
      :owner_occupied_housing_lower_quartile,
      :owner_occupied_housing_median,
      :owner_occupied_housing_higher_quartile,
      :rental_housing_lower_quartile,
      :rental_housing_median,
      :rental_housing_higher_quartile,
      :median_rent,
      :median_rent_percent_household_income,
      :median_owner_cost_percent_household_income,
      :median_owner_cost_percent_household_income_no_mortgage,
      :n_people_shelter,
      :n_people_street,
      :percent_foreign_born,
      :percent_born_same_state,
      :percent_same_house_85,
      :percent_same_city_85,
      :percent_same_state_85,
      :lemas_sworn_full_time,
      :lemas_sworn_full_time_per_population,
      :lemas_sworn_full_time_field,
      :lemas_sworn_full_time_field_per_population,
      :lemas_total_requests,
      :lemas_total_requests_per_population,
      :total_requests_per_officer,
      :n_officers_per_population,
      :racial_match_community_police,
      :percent_police_white,
      :percent_police_black,
      :percent_police_hispanic,
      :percent_police_asian,
      :percent_police_minority,
      :n_officers_assigned_drug_units,
      :n_kinds_drugs_seized,
      :police_average_overtime_worked,
      :land_area,
      :population_density,
      :percent_use_public_transit,
      :n_police_cars,
      :n_police_operating_budget,
      :lemas_percent_police_on_patrol,
      :lemas_gang_unit_deployed,
      :lemas_percent_office_drug_units,
      :police_operating_budget_per_population,
      :total_violent_crimes_per_population
    )

    def initialize
      super()
      @metadata.id = "communities"
      @metadata.name = "Communities"
      @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
      # The description is lazy: the upstream "communities.names" file is
      # only downloaded the first time the description is requested.
      @metadata.description = lambda do
        read_names
      end
    end

    # Yields one Record per data row. Without a block, returns an
    # Enumerator. Missing values ("?" in the upstream file) become nil;
    # every column except the community name is converted to Float.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          row = row.collect.with_index do |column, i|
            if column == "?"
              # "?" marks a missing value in the upstream data.
              nil
            else
              case i
              when 3 # communityname: the only textual column, kept as-is
              # when 124 # LemasGangUnitDeploy
              # 0 means NO, 1 means YES, 0.5 means Part Time
              else
                column = Float(column)
              end
              column
            end
          end
          record = Record.new(*row)
          yield(record)
        end
      end
    end

    private
    def base_url
      "https://archive.ics.uci.edu/ml/machine-learning-databases/communities"
    end

    # Downloads communities.data into the cache directory on first use
    # and yields an open CSV reader for it.
    def open_data
      data_path = cache_dir_path + "communities.data"
      unless data_path.exist?
        data_url = "#{base_url}/communities.data"
        download(data_path, data_url)
      end
      CSV.open(data_path) do |csv|
        yield(csv)
      end
    end

    # Downloads (once) and returns the contents of communities.names,
    # which serves as the dataset description.
    def read_names
      names_path = cache_dir_path + "communities.names"
      unless names_path.exist?
        names_url = "#{base_url}/communities.names"
        download(names_path, names_url)
      end
      names_path.read
    end
  end
end
@@ -1,6 +1,7 @@
1
1
  require "pathname"
2
2
 
3
3
  require_relative "downloader"
4
+ require_relative "error"
4
5
  require_relative "metadata"
5
6
  require_relative "table"
6
7
 
@@ -34,5 +35,17 @@ module Datasets
34
35
  downloader = Downloader.new(url)
35
36
  downloader.download(output_path)
36
37
  end
38
+
39
+ def extract_bz2(path)
40
+ input, output = IO.pipe
41
+ pid = spawn("bzcat", path.to_s, {:out => output})
42
+ begin
43
+ output.close
44
+ yield(input)
45
+ ensure
46
+ input.close
47
+ Process.waitpid(pid)
48
+ end
49
+ end
37
50
  end
38
51
  end
module Datasets
  # A bidirectional mapping between arbitrary values and sequential
  # integer IDs. IDs are assigned in first-appearance order starting at
  # 0; duplicated values keep the ID of their first occurrence.
  class Dictionary
    include Enumerable

    # @param values [Enumerable] values to index; duplicates are skipped
    def initialize(values)
      build_dictionary(values)
    end

    # Returns the integer ID assigned to value, or nil if unknown.
    def id(value)
      @value_to_id[value]
    end

    # Returns the value registered under id, or nil if unknown.
    def value(id)
      @id_to_value[id]
    end

    # All assigned IDs, in assignment order.
    def ids
      @id_to_value.keys
    end

    # All distinct values, in first-appearance order.
    def values
      @id_to_value.values
    end

    # Iterates over [id, value] pairs.
    def each(&block)
      @id_to_value.each(&block)
    end

    # Number of distinct values in the dictionary.
    def size
      @id_to_value.size
    end
    alias_method :length, :size

    # Maps each value in the given collection to its ID (nil for unknown).
    def encode(values)
      values.map { |value| id(value) }
    end

    # Maps each ID in the given collection back to its value (nil for unknown).
    def decode(ids)
      ids.map { |encoded_id| value(encoded_id) }
    end

    private
    # Builds both lookup tables. The next free ID is simply the current
    # table size, so IDs stay dense even when duplicates are skipped.
    def build_dictionary(values)
      @id_to_value = {}
      @value_to_id = {}
      values.each do |value|
        next if @value_to_id.key?(value)
        next_id = @value_to_id.size
        @id_to_value[next_id] = value
        @value_to_id[value] = next_id
      end
    end
  end
end
@@ -3,7 +3,7 @@ begin
3
3
  require "io/console"
4
4
  rescue LoadError
5
5
  end
6
- require "open-uri"
6
+ require "net/http"
7
7
  require "pathname"
8
8
 
9
9
  module Datasets
@@ -15,84 +15,59 @@ module Datasets
15
15
  url = URI.parse(url)
16
16
  end
17
17
  @url = url
18
- @url.extend(CurrentBufferReadable)
18
+ unless @url.is_a?(URI::HTTP)
19
+ raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
20
+ end
19
21
  end
20
22
 
21
23
    # Downloads @url to output_path via Net::HTTP, reporting progress and
    # resuming an interrupted transfer when a "<output_path>.partial"
    # file from a previous run exists (via an HTTP Range request).
    def download(output_path)
      output_path.parent.mkpath

      headers = {"User-Agent" => "Red Datasets/#{VERSION}"}
      start = nil
      partial_output_path = Pathname.new("#{output_path}.partial")
      if partial_output_path.exist?
        # Resume: request only the bytes we do not have yet.
        start = partial_output_path.size
        headers["Range"] = "bytes=#{start}-"
      end

      Net::HTTP.start(@url.hostname,
                      @url.port,
                      :use_ssl => (@url.scheme == "https")) do |http|
        path = @url.path
        path += "?#{@url.query}" if @url.query
        request = Net::HTTP::Get.new(path, headers)
        http.request(request) do |response|
          case response
          when Net::HTTPPartialContent
            # Server honored the Range request: append to the partial file.
            mode = "ab"
          when Net::HTTPSuccess
            # Full body (e.g. server ignored Range): restart from scratch.
            start = nil
            mode = "wb"
          else
            # NOTE(review): redirects and HTTP errors are silently ignored
            # here; when no partial file exists the FileUtils.mv below then
            # raises Errno::ENOENT — confirm whether an explicit error with
            # the response status would be preferable.
            break
          end

          base_name = @url.path.split("/").last
          size_current = 0
          size_max = response.content_length
          if start
            # Account for the already-downloaded prefix in the totals.
            size_current += start
            size_max += start
          end
          # NOTE(review): content_length is nil for chunked responses,
          # which would make size_max nil (and `size_max += start` raise) —
          # confirm the servers used here always send Content-Length.
          progress_reporter = ProgressReporter.new(base_name, size_max)
          partial_output_path.open(mode) do |output|
            response.read_body do |chunk|
              size_current += chunk.bytesize
              progress_reporter.report(size_current)
              output.write(chunk)
            end
          end
        end
      end
      FileUtils.mv(partial_output_path, output_path)
    end
70
 
65
- private
66
- def copy_stream(input, partial_output_path)
67
- if partial_output_path.exist?
68
- # TODO: It's better that we use "206 Partial Content" response
69
- # to detect partial response.
70
- partial_head = partial_output_path.open("rb") do |partial_output|
71
- partial_output.read(256)
72
- end
73
- input_head = input.read(partial_head.bytesize)
74
- input.rewind
75
- if partial_head == input_head
76
- mode = "wb"
77
- else
78
- mode = "ab"
79
- end
80
- else
81
- mode = "wb"
82
- end
83
- partial_output_path.open(mode) do |partial_output|
84
- IO.copy_stream(input, partial_output)
85
- end
86
- end
87
-
88
- module CurrentBufferReadable
89
- attr_reader :current_buffer
90
- def buffer_open(buffer, proxy, options)
91
- @current_buffer = buffer
92
- super
93
- end
94
- end
95
-
96
71
  class ProgressReporter
97
72
  def initialize(base_name, size_max)
98
73
  @base_name = base_name
@@ -0,0 +1,320 @@
1
+ # frozen_string_literal: true
2
+ require 'digest/md5'
3
+ require 'net/http'
4
+ require 'uri'
5
+ require 'json'
6
+
7
+ module Datasets
8
+ module EStatJapan
9
+ Record = Struct.new(:id, :name, :values)
10
+ # configuration injection
11
+ module Configurable
12
+ attr_accessor :app_id
13
+
14
+ #
15
+ # configuration for e-Stat API
16
+ # See detail at https://www.e-stat.go.jp/api/api-dev/how_to_use (Japanese only).
17
+ # @example
18
+ # Datasets::EStatJapan.configure do |config|
19
+ # # put your App ID for e-Stat app_id
20
+ # config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
21
+ # end
22
+ #
23
+ def configure
24
+ yield self
25
+ end
26
+ end
27
+
28
+ extend Configurable
29
+
30
    # Wrapper class for the e-Stat API service (`getStatsData` endpoint).
    # Responses are cached on disk; parsing/indexing happens lazily on
    # first access.
    class StatsData < Dataset
      attr_accessor :app_id, :id

      #
      # generate accessor instance for e-Stat API's endpoint `getStatsData`.
      # for detail spec : https://www.e-stat.go.jp/api/api-info/e-stat-manual
      # @param [String] id Statistical data id
      # @param [String] app_id Application ID (falls back to `EStatJapan.app_id`, then env var `ESTATJAPAN_APP_ID`)
      # @param [Array<String>] areas Target areas (fetch all if omitted)
      # @param [Array<String>] categories Category IDs (fetch all if omitted)
      # @param [Array<String>] times Time axes (fetch all if omitted)
      # @param [Array<Number>] skip_levels Skip levels for parsing (defaults to `[1]`)
      # @param [String] hierarchy_selection Select target from 'child', 'parent', or 'both'. (Example: 札幌市○○区 -> 'child':札幌市○○区 only; 'parent':札幌市 only; 'both': Both selected) (defaults to `both`)
      # @param [Boolean] skip_nil_column Skip column if contains nil
      # @param [Boolean] skip_nil_row Skip row if contains nil
      # @param [Range] time_range Applied to the indexed time tables to keep only a sub-range
      # @raise [ArgumentError] when no application ID can be resolved
      # @example
      #   stats_data = Datasets::EStatJapan::StatsData.new(
      #     "0000020201", # A Population and household (key name: A 人口・世帯)
      #     categories: ["A1101"], # Population (key name: A1101_人口総数)
      #     areas: ["01105", "01106"], # Toyohira-ku Sapporo-shi Hokkaido, Minami-ku Sapporo-shi Hokkaido
      #     times: ["1981100000", "1982100000"],
      #     hierarchy_selection: 'child',
      #     skip_child_area: true,
      #     skip_nil_column: true,
      #     skip_nil_row: false,
      #   )
      #
      def initialize(id,
                     app_id: nil,
                     areas: nil, categories: nil, times: nil,
                     skip_levels: [1],
                     hierarchy_selection: 'child',
                     skip_nil_column: true,
                     skip_nil_row: false,
                     time_range: nil)
        @app_id = app_id || fetch_app_id
        if @app_id.nil? || @app_id.empty?
          raise ArgumentError, 'Please set app_id via `Datasets::EStatJapan.configure` method, environment var `ESTATJAPAN_APP_ID` or keyword argument `:app_id`'
        end

        super()

        @api_version = '3.0'
        @base_url = "https://api.e-stat.go.jp/rest/#{@api_version}/app/json/getStatsData"
        @metadata.id = "e-stat-japan-#{@api_version}"
        @metadata.name = "e-Stat API #{@api_version}"
        @metadata.url = @base_url
        @metadata.description = "e-Stat API #{@api_version}"

        @id = id
        @areas = areas
        @categories = categories
        @times = times
        @skip_levels = skip_levels
        # hierarchy_selection decides which areas of the parent/child
        # hierarchy are kept after indexing.
        case hierarchy_selection
        when 'child' then
          @skip_child_area = false
          @skip_parent_area = true
        when 'parent' then
          @skip_child_area = true
          @skip_parent_area = false
        else # 'both'
          @skip_child_area = false
          @skip_parent_area = false
        end
        @skip_nil_column = skip_nil_column
        @skip_nil_row = skip_nil_row
        @time_range = time_range

        # The cache file name is derived from the full request URL so
        # different query parameters never share a cache entry.
        @url = generate_url
        option_hash = Digest::MD5.hexdigest(@url.to_s)
        base_name = "e-stat-japan-#{option_hash}.json"
        @data_path = cache_dir_path + base_name
        @loaded = false
      end

      #
      # fetch data records from Remote API
      # @example
      #   indices = []
      #   rows = []
      #   map_id_name = {}
      #   estat.each do |record|
      #     # Select Hokkaido prefecture only
      #     next unless record.id.to_s.start_with? '01'
      #     indices << record.id
      #     rows << record.values
      #     map_id_name[record.id] = record.name
      #   end
      #
      def each
        return to_enum(__method__) unless block_given?

        load_data

        # create rows: one Record per area, with values ordered by
        # (time table, column), skipping entries flagged with :skip.
        @areas.each do |a_key, a_value|
          rows = []
          @time_tables.reject { |_key, x| x[:skip] }.each do |st_key, _st_value|
            row = @columns.reject { |_key, x| x[:skip] }.map do |c_key, _c_value|
              @indexed_data.dig(st_key, a_key, c_key)
            end
            rows << row
          end
          next if @skip_nil_row && rows.flatten.count(nil).positive?

          yield Record.new(a_key, a_value['@name'], rows.flatten)
        end
      end

      # Indexed area definitions (loads data on first call).
      def areas
        load_data
        @areas
      end

      # Indexed time-table definitions (loads data on first call).
      def time_tables
        load_data
        @time_tables
      end

      # Indexed column (category) definitions (loads data on first call).
      def columns
        load_data
        @columns
      end

      # Column header names, "<time name>_<column name>" per kept column.
      def schema
        load_data
        @schema
      end

      private

      # Builds the getStatsData request URL from the configured filters.
      def generate_url
        # generates url for query
        params = {
          appId: @app_id, lang: 'J',
          statsDataId: @id,
          metaGetFlg: 'Y', cntGetFlg: 'N',
          sectionHeaderFlg: '1'
        }
        params['cdArea'] = @areas.join(',') if @areas.instance_of?(Array)
        params['cdCat01'] = @categories.join(',') if @categories.instance_of?(Array)
        params['cdTime'] = @times.join(',') if @times.instance_of?(Array)

        URI.parse("#{@base_url}?#{URI.encode_www_form(params)}")
      end

      # Selects the CLASS_OBJ entries whose @id matches (e.g. 'time',
      # 'cat01', 'area') from the parsed API response.
      def extract_def(data, id)
        rec = data.dig('GET_STATS_DATA',
                       'STATISTICAL_DATA',
                       'CLASS_INF',
                       'CLASS_OBJ')
        rec.select { |x| x['@id'] == id }
      end

      # Turns a definition's CLASS list into a Hash keyed by @code.
      def index_def(data_def)
        unless data_def.first['CLASS'].instance_of?(Array)
          # convert to array when number of element is 1
          data_def.first['CLASS'] = [data_def.first['CLASS']]
        end
        Hash[*data_def.first['CLASS'].map { |x| [x['@code'], x] }.flatten]
      end

      # Extracts the raw VALUE rows from the parsed API response.
      def get_values(data)
        data.dig('GET_STATS_DATA',
                 'STATISTICAL_DATA',
                 'DATA_INF',
                 'VALUE')
      end

      # Application ID fallback chain: module configuration, then env var.
      def fetch_app_id
        EStatJapan.app_id || ENV['ESTATJAPAN_APP_ID']
      end

      # Fetches and indexes the data once; subsequent calls are no-ops.
      def load_data
        return if @loaded

        fetch_data
        index_data
      end

      def fetch_data
        # MEMO:
        # The e-stat api always returns 200 (Ok)
        # even if an error happens, despite its error mapping.
        # So we can't avoid caching the retrieved response from the api.
        # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
        download(@data_path, @url.to_s) unless @data_path.exist?
      end

      # Parses the cached JSON response and builds the in-memory indexes
      # (@time_tables, @columns, @areas, @indexed_data, @schema).
      def index_data
        # parse json
        raw_data = File.open(@data_path) do |io|
          JSON.parse(io.read)
        end

        # check status
        api_status = raw_data.dig('GET_STATS_DATA', 'RESULT', 'STATUS')
        if api_status != 0
          # remove error response cache manually
          FileUtils.rm(@data_path)
          error_msg = raw_data.dig('GET_STATS_DATA', 'RESULT', 'ERROR_MSG')
          raise APIError, "code #{api_status} : #{error_msg}"
        end

        # index data
        ## table_def = extract_def(raw_data, "tab")
        timetable_def = extract_def(raw_data, 'time')
        column_def = extract_def(raw_data, 'cat01')
        area_def = extract_def(raw_data, 'area')

        @time_tables = index_def(timetable_def)
        @columns = index_def(column_def)
        @areas = index_def(area_def)

        ## apply time_range to time_tables
        @time_tables.select! { |k, _v| @time_tables.keys[@time_range].include? k } if @time_range.instance_of?(Range)

        # Nested lookup: time code -> area code -> category code -> Float value.
        @indexed_data = Hash[*@time_tables.keys.map { |x| [x, {}] }.flatten]
        get_values(raw_data).each do |row|
          next unless @time_tables.key?(row['@time'])

          data = @indexed_data.dig(row['@time'], row['@area']) || {}
          new_data = data.merge(row['@cat01'] => row['$'].to_f)
          @indexed_data[row['@time']][row['@area']] = new_data
        end

        skip_areas
        skip_nil_column
        @schema = create_header
        @loaded = true
      end

      # Removes areas according to @skip_levels and the parent/child
      # hierarchy selection chosen at construction time.
      def skip_areas
        # skip levels
        @areas.reject! { |_key, x| @skip_levels.include? x['@level'].to_i }

        # skip area that has children
        if @skip_parent_area
          # inspect hierarchy of areas
          @areas.each do |_a_key, a_value|
            next unless @areas.key? a_value['@parentCode']

            @areas[a_value['@parentCode']][:has_children] = true
          end
          # filter areas without children
          @areas.reject! { |_key, x| x[:has_children] }
        end

        # skip child area
        @areas.reject! { |_a_key, a_value| (@areas.key? a_value['@parentCode']) } if @skip_child_area
      end

      # Flags time tables and columns that contain nil for any kept area
      # with :skip so they are excluded from rows and the schema.
      def skip_nil_column
        return unless @skip_nil_column

        # filter time_tables and columns
        @areas.each do |a_key, _a_value|
          @time_tables.each do |st_key, st_value|
            unless @indexed_data[st_key].key?(a_key)
              st_value[:skip] = true
              next
            end
            @columns.each do |c_key, c_value|
              unless @indexed_data.dig(st_key, a_key).key?(c_key)
                c_value[:skip] = true
                next
              end
            end
          end
        end
      end

      # Builds the column header names for every kept
      # (time table, column) pair, matching the order used by #each.
      def create_header
        schema = []
        @time_tables.reject { |_key, x| x[:skip] }.each do |_st_key, st_value|
          @columns.reject { |_key, x| x[:skip] }.each do |_c_key, c_value|
            schema << "#{st_value['@name']}_#{c_value['@name']}"
          end
        end
        schema
      end
    end
313
+
314
    # Raised when required configuration (such as the application ID) is
    # missing or invalid. NOTE: within the EStatJapan namespace this
    # constant shadows Ruby's built-in ArgumentError, so a bare
    # `raise ArgumentError, ...` inside this module raises this class.
    class ArgumentError < Error
    end

    # Raised when the e-Stat API response reports a non-zero STATUS.
    class APIError < Error
    end
319
+ end
320
+ end