red-datasets 0.0.7 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,198 @@
+ require "csv"
+
+ require_relative "dataset"
+
+ module Datasets
+   class Communities < Dataset
+     Record = Struct.new(
+       :state,
+       :county,
+       :community,
+       :community_name,
+       :fold,
+       :population,
+       :household_size,
+       :race_percent_black,
+       :race_percent_white,
+       :race_percent_asian,
+       :race_percent_hispanic,
+       :age_percent_12_to_21,
+       :age_percent_12_to_29,
+       :age_percent_16_to_24,
+       :age_percent_65_and_upper,
+       :n_people_urban,
+       :percent_people_urban,
+       :median_income,
+       :percent_households_with_wage,
+       :percent_households_with_farm_self,
+       :percent_households_with_investment_income,
+       :percent_households_with_social_security,
+       :percent_households_with_public_assistant,
+       :percent_households_with_retire,
+       :median_family_income,
+       :per_capita_income,
+       :per_capita_income_white,
+       :per_capita_income_black,
+       :per_capita_income_indian,
+       :per_capita_income_asian,
+       :per_capita_income_other,
+       :per_capita_income_hispanic,
+       :n_people_under_poverty,
+       :percent_people_under_poverty,
+       :percent_less_9th_grade,
+       :percent_not_high_school_graduate,
+       :percent_bachelors_or_more,
+       :percent_unemployed,
+       :percent_employed,
+       :percent_employed_manufacturing,
+       :percent_employed_professional_service,
+       :percent_occupations_manufacturing,
+       :percent_occupations_management_professional,
+       :male_percent_divorced,
+       :male_percent_never_married,
+       :female_percent_divorced,
+       :total_percent_divorced,
+       :mean_persons_per_family,
+       :percent_family_2_parents,
+       :percent_kids_2_parents,
+       :percent_young_kids_2_parents,
+       :percent_teen_2_parents,
+       :percent_work_mom_young_kids,
+       :percent_work_mom,
+       :n_illegals,
+       :percent_illegals,
+       :n_immigrants,
+       :percent_immigrants_recent,
+       :percent_immigrants_recent_5,
+       :percent_immigrants_recent_8,
+       :percent_immigrants_recent_10,
+       :percent_population_immigranted_recent,
+       :percent_population_immigranted_recent_5,
+       :percent_population_immigranted_recent_8,
+       :percent_population_immigranted_recent_10,
+       :percent_speak_english_only,
+       :percent_not_speak_english_well,
+       :percent_large_households_family,
+       :percent_large_households_occupied,
+       :mean_persons_per_occupied_household,
+       :mean_persons_per_owner_occupied_household,
+       :mean_persons_per_rental_occupied_household,
+       :percent_persons_owner_occupied_household,
+       :percent_persons_dense_housing,
+       :percent_housing_less_3_bedrooms,
+       :median_n_bedrooms,
+       :n_vacant_households,
+       :percent_housing_occupied,
+       :percent_housing_owner_occupied,
+       :percent_vacant_housing_boarded,
+       :percent_vacant_housing_more_6_months,
+       :median_year_housing_built,
+       :percent_housing_no_phone,
+       :percent_housing_without_full_plumbing,
+       :owner_occupied_housing_lower_quartile,
+       :owner_occupied_housing_median,
+       :owner_occupied_housing_higher_quartile,
+       :rental_housing_lower_quartile,
+       :rental_housing_median,
+       :rental_housing_higher_quartile,
+       :median_rent,
+       :median_rent_percent_household_income,
+       :median_owner_cost_percent_household_income,
+       :median_owner_cost_percent_household_income_no_mortgage,
+       :n_people_shelter,
+       :n_people_street,
+       :percent_foreign_born,
+       :percent_born_same_state,
+       :percent_same_house_85,
+       :percent_same_city_85,
+       :percent_same_state_85,
+       :lemas_sworn_full_time,
+       :lemas_sworn_full_time_per_population,
+       :lemas_sworn_full_time_field,
+       :lemas_sworn_full_time_field_per_population,
+       :lemas_total_requests,
+       :lemas_total_requests_per_population,
+       :total_requests_per_officer,
+       :n_officers_per_population,
+       :racial_match_community_police,
+       :percent_police_white,
+       :percent_police_black,
+       :percent_police_hispanic,
+       :percent_police_asian,
+       :percent_police_minority,
+       :n_officers_assigned_drug_units,
+       :n_kinds_drugs_seized,
+       :police_average_overtime_worked,
+       :land_area,
+       :population_density,
+       :percent_use_public_transit,
+       :n_police_cars,
+       :n_police_operating_budget,
+       :lemas_percent_police_on_patrol,
+       :lemas_gang_unit_deployed,
+       :lemas_percent_office_drug_units,
+       :police_operating_budget_per_population,
+       :total_violent_crimes_per_population
+     )
+
+     def initialize
+       super()
+       @metadata.id = "communities"
+       @metadata.name = "Communities"
+       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
+       @metadata.description = lambda do
+         read_names
+       end
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |csv|
+         csv.each do |row|
+           row = row.collect.with_index do |column, i|
+             if column == "?"
+               nil
+             else
+               case i
+               when 3 # communityname
+                 # when 124 # LemasGangUnitDeploy
+                 # 0 means NO, 1 means YES, 0.5 means Part Time
+               else
+                 column = Float(column)
+               end
+               column
+             end
+           end
+           record = Record.new(*row)
+           yield(record)
+         end
+       end
+     end
+
+     private
+     def base_url
+       "https://archive.ics.uci.edu/ml/machine-learning-databases/communities"
+     end
+
+     def open_data
+       data_path = cache_dir_path + "communities.data"
+       unless data_path.exist?
+         data_url = "#{base_url}/communities.data"
+         download(data_path, data_url)
+       end
+       CSV.open(data_path) do |csv|
+         yield(csv)
+       end
+     end
+
+     def read_names
+       names_path = cache_dir_path + "communities.names"
+       unless names_path.exist?
+         names_url = "#{base_url}/communities.names"
+         download(names_path, names_url)
+       end
+       names_path.read
+     end
+   end
+ end
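
The new Communities class wraps the UCI "Communities and Crime" data: on first use it downloads communities.data into the cache directory, yields one Record struct per CSV row, maps "?" columns to nil, and converts every other column except communityname to Float. A minimal usage sketch, assuming the gem is installed and loaded via `require "datasets"`:

```ruby
# Sketch only: iterating the Communities dataset added in this release.
require "datasets"

communities = Datasets::Communities.new

# `each` yields Record structs; "?" values in the CSV arrive as nil.
communities.each do |record|
  puts "#{record.community_name}: population=#{record.population}"
end

# Without a block, `each` returns an Enumerator, so the usual helpers work.
top = communities.each.max_by { |r| r.total_violent_crimes_per_population.to_f }
puts top.community_name
```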
@@ -1,6 +1,7 @@
  require "pathname"

  require_relative "downloader"
+ require_relative "error"
  require_relative "metadata"
  require_relative "table"

@@ -17,11 +18,17 @@ module Datasets
        Table.new(self)
      end

+     def clear_cache!
+       if cache_dir_path.exist?
+         FileUtils.rmtree(cache_dir_path.to_s, secure: true)
+       end
+     end
+
      private
      def cache_dir_path
        case RUBY_PLATFORM
        when /mswin/, /mingw/
-         base_dir = ENV["LOCALAPPDATA"] || "~/AppData"
+         base_dir = ENV["LOCALAPPDATA"] || "~/AppData/Local"
        when /darwin/
          base_dir = "~/Library/Caches"
        else
@@ -34,5 +41,17 @@ module Datasets
        downloader = Downloader.new(url)
        downloader.download(output_path)
      end
+
+     def extract_bz2(path)
+       input, output = IO.pipe
+       pid = spawn("bzcat", path.to_s, {:out => output})
+       begin
+         output.close
+         yield(input)
+       ensure
+         input.close
+         Process.waitpid(pid)
+       end
+     end
    end
  end
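
Two helpers land in the Dataset base class: a public clear_cache! that removes the per-dataset cache directory, and a private extract_bz2 that pipes a cached file through bzcat for subclasses shipping bzip2-compressed data. A hedged sketch of both follows; the MyBz2Dataset class, its URL, file name, and the use of @metadata.id for the cache directory are assumptions for illustration, while clear_cache!, cache_dir_path, download, and extract_bz2 come from the diff above:

```ruby
require "datasets"

# clear_cache! is public: it deletes the dataset's cache directory if present.
communities = Datasets::Communities.new
communities.clear_cache!

# extract_bz2 is private and intended for subclasses; a hypothetical dataset
# serving a bzip2-compressed CSV could stream it through bzcat like this.
class MyBz2Dataset < Datasets::Dataset
  def initialize
    super()
    @metadata.id = "my-bz2-dataset" # assumed to name the cache directory
  end

  def each
    return to_enum(__method__) unless block_given?
    data_path = cache_dir_path + "data.csv.bz2"
    download(data_path, "https://example.com/data.csv.bz2") unless data_path.exist?
    extract_bz2(data_path) do |input|
      input.each_line { |line| yield line.chomp }
    end
  end
end
```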
@@ -8,6 +8,8 @@ require "pathname"

  module Datasets
    class Downloader
+     class TooManyRedirects < StandardError; end
+
      def initialize(url)
        if url.is_a?(URI::Generic)
          url = url.dup
@@ -31,39 +33,65 @@ module Datasets
          headers["Range"] = "bytes=#{start}-"
        end

-       Net::HTTP.start(@url.hostname,
-                       @url.port,
-                       :use_ssl => (@url.scheme == "https")) do |http|
-         request = Net::HTTP::Get.new(@url.path, headers)
+       start_http(@url, headers) do |response|
+         if response.is_a?(Net::HTTPPartialContent)
+           mode = "ab"
+         else
+           start = nil
+           mode = "wb"
+         end
+
+         base_name = @url.path.split("/").last
+         size_current = 0
+         size_max = response.content_length
+         if start
+           size_current += start
+           size_max += start
+         end
+         progress_reporter = ProgressReporter.new(base_name, size_max)
+         partial_output_path.open(mode) do |output|
+           response.read_body do |chunk|
+             size_current += chunk.bytesize
+             progress_reporter.report(size_current)
+             output.write(chunk)
+           end
+         end
+       end
+       FileUtils.mv(partial_output_path, output_path)
+     rescue TooManyRedirects => error
+       last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+       raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
+     end
+
+     private def start_http(url, headers, limit = 10, &block)
+       if limit == 0
+         raise TooManyRedirects, "too many redirections: #{url}"
+       end
+       http = Net::HTTP.new(url.hostname, url.port)
+       # http.set_debug_output($stderr)
+       http.use_ssl = (url.scheme == "https")
+       http.start do
+         path = url.path
+         path += "?#{url.query}" if url.query
+         request = Net::HTTP::Get.new(path, headers)
          http.request(request) do |response|
            case response
-           when Net::HTTPPartialContent
-             mode = "ab"
-           when Net::HTTPSuccess
-             start = nil
-             mode = "wb"
+           when Net::HTTPSuccess, Net::HTTPPartialContent
+             return block.call(response)
+           when Net::HTTPRedirection
+             url = URI.parse(response[:location])
+             $stderr.puts "Redirect to #{url}"
+             return start_http(url, headers, limit - 1, &block)
            else
-             break
-           end
-
-           base_name = @url.path.split("/").last
-           size_current = 0
-           size_max = response.content_length
-           if start
-             size_current += start
-             size_max += start
-           end
-           progress_reporter = ProgressReporter.new(base_name, size_max)
-           partial_output_path.open(mode) do |output|
-             response.read_body do |chunk|
-               size_current += chunk.bytesize
-               progress_reporter.report(size_current)
-               output.write(chunk)
+             message = response.code
+             if response.message and not response.message.empty?
+               message += ": #{response.message}"
              end
+             message += ": #{url}"
+             raise response.error_type.new(message, response)
            end
          end
        end
-       FileUtils.mv(partial_output_path, output_path)
      end

      class ProgressReporter
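
The Downloader rework moves the HTTP round trip into a private start_http helper that follows redirects (up to 10, recursing with limit - 1), keeps Range-based resume of interrupted downloads, and raises descriptive errors: TooManyRedirects when the redirect budget is exhausted, and the matching Net::HTTP error type for any other non-success response. A sketch of calling it directly; the URL and output path are placeholders, everything else is taken from the diff:

```ruby
require "datasets"
require "pathname"

downloader = Datasets::Downloader.new("https://example.com/data/file.csv")
begin
  # Follows redirects, resumes a previously interrupted partial download
  # if one exists, and reports progress while writing to the output path.
  downloader.download(Pathname("/tmp/file.csv"))
rescue Datasets::Downloader::TooManyRedirects => e
  # Raised after ten consecutive redirects; the message carries the
  # original and last-seen URLs.
  warn e.message
end
```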
@@ -0,0 +1,320 @@
+ # frozen_string_literal: true
+ require 'digest/md5'
+ require 'net/http'
+ require 'uri'
+ require 'json'
+
+ module Datasets
+   module EStatJapan
+     Record = Struct.new(:id, :name, :values)
+     # configuration injection
+     module Configurable
+       attr_accessor :app_id
+
+       #
+       # configuration for e-Stat API
+       # See detail at https://www.e-stat.go.jp/api/api-dev/how_to_use (Japanese only).
+       # @example
+       #   Datasets::EStatJapan.configure do |config|
+       #     # put your App ID for e-Stat app_id
+       #     config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
+       #   end
+       #
+       def configure
+         yield self
+       end
+     end
+
+     extend Configurable
+
+     # wrapper class for e-Stat API service
+     class StatsData < Dataset
+       attr_accessor :app_id, :id
+
+       #
+       # generate accessor instance for e-Stat API's endpoint `getStatsData`.
+       # for detail spec : https://www.e-stat.go.jp/api/api-info/e-stat-manual
+       # @param [String] id Statistical data id
+       # @param [Array<String>] areas Target areas (fetch all if omitted)
+       # @param [Array<String>] categories Category IDs (fetch all if omitted)
+       # @param [Array<String>] times Time axes (fetch all if omitted)
+       # @param [Array<Number>] skip_levels Skip levels for parsing (defaults to `[1]`)
+       # @param [String] hierarchy_selection Select target from 'child', 'parent', or 'both'. (Example: 札幌市○○区 -> 'child':札幌市○○区 only; 'parent':札幌市 only; 'both': Both selected) (defaults to `both`)
+       # @param [Boolean] skip_nil_column Skip column if contains nil
+       # @param [Boolean] skip_nil_row Skip row if contains nil
+       # @example
+       #   stats_data = Datasets::EStatJapan::StatsData.new(
+       #     "0000020201", # A Population and household (key name: A 人口・世帯)
+       #     categories: ["A1101"], # Population (key name: A1101_人口総数)
+       #     areas: ["01105", "01106"], # Toyohira-ku Sapporo-shi Hokkaido, Minami-ku Sapporo-shi Hokkaido
+       #     times: ["1981100000", "1982100000"],
+       #     hierarchy_selection: 'child',
+       #     skip_child_area: true,
+       #     skip_nil_column: true,
+       #     skip_nil_row: false,
+       #   )
+       #
+       def initialize(id,
+                      app_id: nil,
+                      areas: nil, categories: nil, times: nil,
+                      skip_levels: [1],
+                      hierarchy_selection: 'child',
+                      skip_nil_column: true,
+                      skip_nil_row: false,
+                      time_range: nil)
+         @app_id = app_id || fetch_app_id
+         if @app_id.nil? || @app_id.empty?
+           raise ArgumentError, 'Please set app_id via `Datasets::EStatJapan.configure` method, environment var `ESTATJAPAN_APP_ID` or keyword argument `:app_id`'
+         end
+
+         super()
+
+         @api_version = '3.0'
+         @base_url = "https://api.e-stat.go.jp/rest/#{@api_version}/app/json/getStatsData"
+         @metadata.id = "e-stat-japan-#{@api_version}"
+         @metadata.name = "e-Stat API #{@api_version}"
+         @metadata.url = @base_url
+         @metadata.description = "e-Stat API #{@api_version}"
+
+         @id = id
+         @areas = areas
+         @categories = categories
+         @times = times
+         @skip_levels = skip_levels
+         case hierarchy_selection
+         when 'child' then
+           @skip_child_area = false
+           @skip_parent_area = true
+         when 'parent' then
+           @skip_child_area = true
+           @skip_parent_area = false
+         else # 'both'
+           @skip_child_area = false
+           @skip_parent_area = false
+         end
+         @skip_nil_column = skip_nil_column
+         @skip_nil_row = skip_nil_row
+         @time_range = time_range
+
+         @url = generate_url
+         option_hash = Digest::MD5.hexdigest(@url.to_s)
+         base_name = "e-stat-japan-#{option_hash}.json"
+         @data_path = cache_dir_path + base_name
+         @loaded = false
+       end
+
+       #
+       # fetch data records from Remote API
+       # @example
+       #   indices = []
+       #   rows = []
+       #   map_id_name = {}
+       #   estat.each do |record|
+       #     # Select Hokkaido prefecture only
+       #     next unless record.id.to_s.start_with? '01'
+       #     indices << record.id
+       #     rows << record.values
+       #     map_id_name[record.id] = record.name
+       #   end
+       #
+       def each
+         return to_enum(__method__) unless block_given?
+
+         load_data
+
+         # create rows
+         @areas.each do |a_key, a_value|
+           rows = []
+           @time_tables.reject { |_key, x| x[:skip] }.each do |st_key, _st_value|
+             row = @columns.reject { |_key, x| x[:skip] }.map do |c_key, _c_value|
+               @indexed_data.dig(st_key, a_key, c_key)
+             end
+             rows << row
+           end
+           next if @skip_nil_row && rows.flatten.count(nil).positive?
+
+           yield Record.new(a_key, a_value['@name'], rows.flatten)
+         end
+       end
+
+       def areas
+         load_data
+         @areas
+       end
+
+       def time_tables
+         load_data
+         @time_tables
+       end
+
+       def columns
+         load_data
+         @columns
+       end
+
+       def schema
+         load_data
+         @schema
+       end
+
+       private
+
+       def generate_url
+         # generates url for query
+         params = {
+           appId: @app_id, lang: 'J',
+           statsDataId: @id,
+           metaGetFlg: 'Y', cntGetFlg: 'N',
+           sectionHeaderFlg: '1'
+         }
+         params['cdArea'] = @areas.join(',') if @areas.instance_of?(Array)
+         params['cdCat01'] = @categories.join(',') if @categories.instance_of?(Array)
+         params['cdTime'] = @times.join(',') if @times.instance_of?(Array)
+
+         URI.parse("#{@base_url}?#{URI.encode_www_form(params)}")
+       end
+
+       def extract_def(data, id)
+         rec = data.dig('GET_STATS_DATA',
+                        'STATISTICAL_DATA',
+                        'CLASS_INF',
+                        'CLASS_OBJ')
+         rec.select { |x| x['@id'] == id }
+       end
+
+       def index_def(data_def)
+         unless data_def.first['CLASS'].instance_of?(Array)
+           # convert to array when number of element is 1
+           data_def.first['CLASS'] = [data_def.first['CLASS']]
+         end
+         Hash[*data_def.first['CLASS'].map { |x| [x['@code'], x] }.flatten]
+       end
+
+       def get_values(data)
+         data.dig('GET_STATS_DATA',
+                  'STATISTICAL_DATA',
+                  'DATA_INF',
+                  'VALUE')
+       end
+
+       def fetch_app_id
+         EStatJapan.app_id || ENV['ESTATJAPAN_APP_ID']
+       end
+
+       def load_data
+         return if @loaded
+
+         fetch_data
+         index_data
+       end
+
+       def fetch_data
+         # MEMO:
+         # The e-stat api always returns 200 (Ok)
+         # even if an error happens, despite its error mapping.
+         # So we can't avoid caching the retrieved response from the api.
+         # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
+         download(@data_path, @url.to_s) unless @data_path.exist?
+       end
+
+       def index_data
+         # parse json
+         raw_data = File.open(@data_path) do |io|
+           JSON.parse(io.read)
+         end
+
+         # check status
+         api_status = raw_data.dig('GET_STATS_DATA', 'RESULT', 'STATUS')
+         if api_status != 0
+           # remove error response cache manually
+           FileUtils.rm(@data_path)
+           error_msg = raw_data.dig('GET_STATS_DATA', 'RESULT', 'ERROR_MSG')
+           raise APIError, "code #{api_status} : #{error_msg}"
+         end
+
+         # index data
+         ## table_def = extract_def(raw_data, "tab")
+         timetable_def = extract_def(raw_data, 'time')
+         column_def = extract_def(raw_data, 'cat01')
+         area_def = extract_def(raw_data, 'area')
+
+         @time_tables = index_def(timetable_def)
+         @columns = index_def(column_def)
+         @areas = index_def(area_def)
+
+         ## apply time_range to time_tables
+         @time_tables.select! { |k, _v| @time_tables.keys[@time_range].include? k } if @time_range.instance_of?(Range)
+
+         @indexed_data = Hash[*@time_tables.keys.map { |x| [x, {}] }.flatten]
+         get_values(raw_data).each do |row|
+           next unless @time_tables.key?(row['@time'])
+
+           data = @indexed_data.dig(row['@time'], row['@area']) || {}
+           new_data = data.merge(row['@cat01'] => row['$'].to_f)
+           @indexed_data[row['@time']][row['@area']] = new_data
+         end
+
+         skip_areas
+         skip_nil_column
+         @schema = create_header
+         @loaded = true
+       end
+
+       def skip_areas
+         # skip levels
+         @areas.reject! { |_key, x| @skip_levels.include? x['@level'].to_i }
+
+         # skip area that has children
+         if @skip_parent_area
+           # inspect hierarchy of areas
+           @areas.each do |_a_key, a_value|
+             next unless @areas.key? a_value['@parentCode']
+
+             @areas[a_value['@parentCode']][:has_children] = true
+           end
+           # filter areas without children
+           @areas.reject! { |_key, x| x[:has_children] }
+         end
+
+         # skip child area
+         @areas.reject! { |_a_key, a_value| (@areas.key? a_value['@parentCode']) } if @skip_child_area
+       end
+
+       def skip_nil_column
+         return unless @skip_nil_column
+
+         # filter time_tables and columns
+         @areas.each do |a_key, _a_value|
+           @time_tables.each do |st_key, st_value|
+             unless @indexed_data[st_key].key?(a_key)
+               st_value[:skip] = true
+               next
+             end
+             @columns.each do |c_key, c_value|
+               unless @indexed_data.dig(st_key, a_key).key?(c_key)
+                 c_value[:skip] = true
+                 next
+               end
+             end
+           end
+         end
+       end
+
+       def create_header
+         schema = []
+         @time_tables.reject { |_key, x| x[:skip] }.each do |_st_key, st_value|
+           @columns.reject { |_key, x| x[:skip] }.each do |_c_key, c_value|
+             schema << "#{st_value['@name']}_#{c_value['@name']}"
+           end
+         end
+         schema
+       end
+     end
+
+     class ArgumentError < Error
+     end
+
+     class APIError < Error
+     end
+   end
+ end
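
Putting the new e-Stat Japan wrapper together: configure an application ID once, build a StatsData accessor for a statsDataId, and iterate records whose values line up with schema. A sketch based on the @example comments above; the statsDataId and category code are the ones used there, and an ESTATJAPAN_APP_ID environment variable is assumed:

```ruby
require "datasets"

# Register the e-Stat application ID once per process.
Datasets::EStatJapan.configure do |config|
  config.app_id = ENV["ESTATJAPAN_APP_ID"]
end

# "0000020201" is the population/household table used in the docs above.
stats_data = Datasets::EStatJapan::StatsData.new(
  "0000020201",
  categories: ["A1101"],      # total population
  hierarchy_selection: "child"
)

# Each record is one area; record.values follows stats_data.schema.
stats_data.schema.first(3).each { |column| puts column }
stats_data.each do |record|
  puts "#{record.name}: #{record.values.first}"
end
```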