RubyGems - red-datasets - Versions diffs - 0.1.0 → 0.1.1 - Mend

red-datasets 0.1.0 → 0.1.1

Files changed (19) hide show

checksums.yaml +4 -4
data/README.md +6 -0
data/doc/text/news.md +25 -0
data/lib/datasets.rb +4 -0
data/lib/datasets/cldr-plurals.rb +385 -0
data/lib/datasets/communities.rb +198 -0
data/lib/datasets/dataset.rb +1 -0
data/lib/datasets/e-stat-japan.rb +320 -0
data/lib/datasets/error.rb +4 -0
data/lib/datasets/mnist.rb +0 -2
data/lib/datasets/penguins.rb +125 -0
data/lib/datasets/version.rb +1 -1
data/red-datasets.gemspec +1 -0
data/test/run-test.rb +2 -0
data/test/test-cldr-plurals.rb +180 -0
data/test/test-communities.rb +290 -0
data/test/test-e-stat-japan.rb +383 -0
data/test/test-penguins.rb +239 -0
metadata +41 -15

data/lib/datasets/communities.rb ADDED Viewed

@@ -0,0 +1,198 @@
+require "csv"
+require_relative "dataset"
+module Datasets
+  class Communities < Dataset
+    Record = Struct.new( # one member per CSV column; #each fills them positionally via Record.new(*row)
+      :state,
+      :county,
+      :community,
+      :community_name,
+      :fold,
+      :population,
+      :household_size,
+      :race_percent_black,
+      :race_percent_white,
+      :race_percent_asian,
+      :race_percent_hispanic,
+      :age_percent_12_to_21,
+      :age_percent_12_to_29,
+      :age_percent_16_to_24,
+      :age_percent_65_and_upper,
+      :n_people_urban,
+      :percent_people_urban,
+      :median_income,
+      :percent_households_with_wage,
+      :percent_households_with_farm_self,
+      :percent_households_with_investment_income,
+      :percent_households_with_social_security,
+      :percent_households_with_public_assistant,
+      :percent_households_with_retire,
+      :median_family_income,
+      :per_capita_income,
+      :per_capita_income_white,
+      :per_capita_income_black,
+      :per_capita_income_indian,
+      :per_capita_income_asian,
+      :per_capita_income_other,
+      :per_capita_income_hispanic,
+      :n_people_under_poverty,
+      :percent_people_under_poverty,
+      :percent_less_9th_grade,
+      :percent_not_high_school_graduate,
+      :percent_bachelors_or_more,
+      :percent_unemployed,
+      :percent_employed,
+      :percent_employed_manufacturing,
+      :percent_employed_professional_service,
+      :percent_occupations_manufacturing,
+      :percent_occupations_management_professional,
+      :male_percent_divorced,
+      :male_percent_never_married,
+      :female_percent_divorced,
+      :total_percent_divorced,
+      :mean_persons_per_family,
+      :percent_family_2_parents,
+      :percent_kids_2_parents,
+      :percent_young_kids_2_parents,
+      :percent_teen_2_parents,
+      :percent_work_mom_young_kids,
+      :percent_work_mom,
+      :n_illegals,
+      :percent_illegals,
+      :n_immigrants,
+      :percent_immigrants_recent,
+      :percent_immigrants_recent_5,
+      :percent_immigrants_recent_8,
+      :percent_immigrants_recent_10,
+      :percent_population_immigranted_recent,
+      :percent_population_immigranted_recent_5,
+      :percent_population_immigranted_recent_8,
+      :percent_population_immigranted_recent_10,
+      :percent_speak_english_only,
+      :percent_not_speak_english_well,
+      :percent_large_households_family,
+      :percent_large_households_occupied,
+      :mean_persons_per_occupied_household,
+      :mean_persons_per_owner_occupied_household,
+      :mean_persons_per_rental_occupied_household,
+      :percent_persons_owner_occupied_household,
+      :percent_persons_dense_housing,
+      :percent_housing_less_3_bedrooms,
+      :median_n_bedrooms,
+      :n_vacant_households,
+      :percent_housing_occupied,
+      :percent_housing_owner_occupied,
+      :percent_vacant_housing_boarded,
+      :percent_vacant_housing_more_6_months,
+      :median_year_housing_built,
+      :percent_housing_no_phone,
+      :percent_housing_without_full_plumbing,
+      :owner_occupied_housing_lower_quartile,
+      :owner_occupied_housing_median,
+      :owner_occupied_housing_higher_quartile,
+      :rental_housing_lower_quartile,
+      :rental_housing_median,
+      :rental_housing_higher_quartile,
+      :median_rent,
+      :median_rent_percent_household_income,
+      :median_owner_cost_percent_household_income,
+      :median_owner_cost_percent_household_income_no_mortgage,
+      :n_people_shelter,
+      :n_people_street,
+      :percent_foreign_born,
+      :percent_born_same_state,
+      :percent_same_house_85,
+      :percent_same_city_85,
+      :percent_same_state_85,
+      :lemas_sworn_full_time,
+      :lemas_sworn_full_time_per_population,
+      :lemas_sworn_full_time_field,
+      :lemas_sworn_full_time_field_per_population,
+      :lemas_total_requests,
+      :lemas_total_requests_per_population,
+      :total_requests_per_officer,
+      :n_officers_per_population,
+      :racial_match_community_police,
+      :percent_police_white,
+      :percent_police_black,
+      :percent_police_hispanic,
+      :percent_police_asian,
+      :percent_police_minority,
+      :n_officers_assigned_drug_units,
+      :n_kinds_drugs_seized,
+      :police_average_overtime_worked,
+      :land_area,
+      :population_density,
+      :percent_use_public_transit,
+      :n_police_cars,
+      :n_police_operating_budget,
+      :lemas_percent_police_on_patrol,
+      :lemas_gang_unit_deployed,
+      :lemas_percent_office_drug_units,
+      :police_operating_budget_per_population,
+      :total_violent_crimes_per_population
+    )
+    def initialize
+      super()
+      @metadata.id = "communities"
+      @metadata.name = "Communities"
+      @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
+      @metadata.description = lambda do
+        read_names
+      end
+    end
+    def each
+      return to_enum(__method__) unless block_given?
+      open_data do |csv|
+        csv.each do |row|
+          row = row.collect.with_index do |column, i|
+            if column == "?"
+              nil
+            else
+              case i
+              when 3 # communityname
+              # when 124 # LemasGangUnitDeploy
+              # 0 means NO, 1 means YES, 0.5 means Part Time
+              else
+                column = Float(column)
+              end
+              column
+            end
+          end
+          record = Record.new(*row)
+          yield(record)
+        end
+      end
+    end
+    private
+    def base_url # URL prefix that open_data and read_names append their file names to
+      "https://archive.ics.uci.edu/ml/machine-learning-databases/communities"
+    end
+    def open_data
+      data_path = cache_dir_path + "communities.data"
+      unless data_path.exist?
+        data_url = "#{base_url}/communities.data"
+        download(data_path, data_url)
+      end
+      CSV.open(data_path) do |csv|
+        yield(csv)
+      end
+    end
+    def read_names
+      names_path = cache_dir_path + "communities.names"
+      unless names_path.exist?
+        names_url = "#{base_url}/communities.names"
+        download(names_path, names_url)
+      end
+      names_path.read
+    end
+  end
+end

data/lib/datasets/dataset.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 require "pathname"
 require_relative "downloader"
+require_relative "error"
 require_relative "metadata"
 require_relative "table"

data/lib/datasets/e-stat-japan.rb ADDED Viewed

@@ -0,0 +1,320 @@
+# frozen_string_literal: true
+require 'digest/md5'
+require 'net/http'
+require 'uri'
+require 'json'
+module Datasets
+  module EStatJapan
+    Record = Struct.new(:id, :name, :values)
+    # configuration injection
+    module Configurable
+      attr_accessor :app_id # App ID stored on the extending object; StatsData#fetch_app_id reads EStatJapan.app_id
+      #
+      # Yields the extending object itself so that settings such as app_id can be assigned in a block.
+      # For details see https://www.e-stat.go.jp/api/api-dev/how_to_use (Japanese only).
+      # @example
+      #   Datasets::EStatJapan.configure do |config|
+      #     # put your e-Stat App ID here
+      #     config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
+      #   end
+      #
+      def configure
+        yield self
+      end
+    end
+    extend Configurable
+    # wrapper class for e-Stat API service
+    class StatsData < Dataset
+      attr_accessor :app_id, :id
+      #
+      # generate accessor instance for e-Stat API's endpoint `getStatsData`.
+      # for detail spec : https://www.e-stat.go.jp/api/api-info/e-stat-manual
+      # @param [String] id Statistical data id
+      # @param [Array<String>] areas Target areas (fetch all if omitted)
+      # @param [Array<String>] categories Category IDs (fetch all if omitted)
+      # @param [Array<String>] times Time axes (fetch all if omitted)
+      # @param [Array<Number>] skip_levels Skip levels for parsing (defaults to `[1]`)
+      # @param [String] hierarchy_selection Select target from 'child', 'parent', or 'both'. (Example: 札幌市○○区 -> 'child':札幌市○○区 only; 'parent':札幌市 only; 'both': Both selected) (defaults to `both`)
+      # @param [Boolean] skip_nil_column Skip column if contains nil
+      # @param [Boolean] skip_nil_row Skip row if contains nil
+      # @example
+      #   stats_data = Datasets::EStatJapan::StatsData.new(
+      #     "0000020201", # A Population and household (key name: Ａ　人口・世帯)
+      #     categories: ["A1101"], # Population (key name: A1101_人口総数)
+      #     areas: ["01105", "01106"], # Toyohira-ku Sapporo-shi Hokkaido, Minami-ku Sapporo-shi Hokkaido
+      #     times: ["1981100000", "1982100000"],
+      #     hierarchy_selection: 'child',
+      #     skip_child_area: true,
+      #     skip_nil_column: true,
+      #     skip_nil_row: false,
+      #   )
+      #
+      def initialize(id,
+                     app_id: nil,
+                     areas: nil, categories: nil, times: nil,
+                     skip_levels: [1],
+                     hierarchy_selection: 'child',
+                     skip_nil_column: true,
+                     skip_nil_row: false,
+                     time_range: nil)
+        @app_id = app_id || fetch_app_id
+        if @app_id.nil? || @app_id.empty?
+          raise ArgumentError, 'Please set app_id via `Datasets::EStatJapan.configure` method, environment var `ESTATJAPAN_APP_ID` or keyword argument `:app_id`'
+        end
+        super()
+        @api_version = '3.0'
+        @base_url = "https://api.e-stat.go.jp/rest/#{@api_version}/app/json/getStatsData"
+        @metadata.id = "e-stat-japan-#{@api_version}"
+        @metadata.name = "e-Stat API #{@api_version}"
+        @metadata.url = @base_url
+        @metadata.description = "e-Stat API #{@api_version}"
+        @id = id
+        @areas = areas
+        @categories = categories
+        @times = times
+        @skip_levels = skip_levels
+        case hierarchy_selection
+        when 'child' then
+          @skip_child_area = false
+          @skip_parent_area = true
+        when 'parent' then
+          @skip_child_area = true
+          @skip_parent_area = false
+        else # 'both'
+          @skip_child_area = false
+          @skip_parent_area = false
+        end
+        @skip_nil_column = skip_nil_column
+        @skip_nil_row = skip_nil_row
+        @time_range = time_range
+        @url = generate_url
+        option_hash = Digest::MD5.hexdigest(@url.to_s)
+        base_name = "e-stat-japan-#{option_hash}.json"
+        @data_path = cache_dir_path + base_name
+        @loaded = false
+      end
+      #
+      # fetch data records from Remote API
+      # @example
+      #   indices = []
+      #   rows = []
+      #   map_id_name = {}
+      #   estat.each do |record|
+      #     # Select Hokkaido prefecture only
+      #     next unless record.id.to_s.start_with? '01'
+      #     indices << record.id
+      #     rows << record.values
+      #     map_id_name[record.id] = record.name
+      #   end
+      #
+      def each
+        return to_enum(__method__) unless block_given?
+        load_data
+        # create rows
+        @areas.each do |a_key, a_value|
+          rows = []
+          @time_tables.reject { |_key, x| x[:skip] }.each do |st_key, _st_value|
+            row = @columns.reject { |_key, x| x[:skip] }.map do |c_key, _c_value|
+              @indexed_data.dig(st_key, a_key, c_key)
+            end
+            rows << row
+          end
+          next if @skip_nil_row && rows.flatten.count(nil).positive?
+          yield Record.new(a_key, a_value['@name'], rows.flatten)
+        end
+      end
+      def areas # area code => area definition Hash from the response, after skip_areas filtering
+        load_data
+        @areas
+      end
+      def time_tables # time code => time definition Hash (entries may carry a :skip flag)
+        load_data
+        @time_tables
+      end
+      def columns # cat01 code => column definition Hash (entries may carry a :skip flag)
+        load_data
+        @columns
+      end
+      def schema # Array of "<time name>_<column name>" labels built by create_header
+        load_data
+        @schema
+      end
+      private
+      def generate_url
+        # generates url for query
+        params = {
+          appId: @app_id, lang: 'J',
+          statsDataId: @id,
+          metaGetFlg: 'Y', cntGetFlg: 'N',
+          sectionHeaderFlg: '1'
+        }
+        params['cdArea'] = @areas.join(',') if @areas.instance_of?(Array)
+        params['cdCat01'] = @categories.join(',') if @categories.instance_of?(Array)
+        params['cdTime'] = @times.join(',') if @times.instance_of?(Array)
+        URI.parse("#{@base_url}?#{URI.encode_www_form(params)}")
+      end
+      def extract_def(data, id)
+        rec = data.dig('GET_STATS_DATA',
+                       'STATISTICAL_DATA',
+                       'CLASS_INF',
+                       'CLASS_OBJ')
+        rec.select { |x| x['@id'] == id }
+      end
+      def index_def(data_def)
+        unless data_def.first['CLASS'].instance_of?(Array)
+          # convert to array when number of element is 1
+          data_def.first['CLASS'] = [data_def.first['CLASS']]
+        end
+        Hash[*data_def.first['CLASS'].map { |x| [x['@code'], x] }.flatten]
+      end
+      def get_values(data) # the VALUE entries of the response; dig returns nil when a key on the path is missing
+        data.dig('GET_STATS_DATA',
+                 'STATISTICAL_DATA',
+                 'DATA_INF',
+                 'VALUE')
+      end
+      def fetch_app_id # App ID set on EStatJapan (see Configurable), else the ESTATJAPAN_APP_ID environment variable
+        EStatJapan.app_id || ENV['ESTATJAPAN_APP_ID']
+      end
+      def load_data # fetches and indexes the response once; index_data sets @loaded so later calls return early
+        return if @loaded
+        fetch_data
+        index_data
+      end
+      def fetch_data
+        # MEMO:
+        # The e-Stat API always answers with HTTP 200 (OK),
+        # even when an error occurred, despite its documented error mapping.
+        # So an error response is cached as well; index_data removes that cache file.
+        # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
+        download(@data_path, @url.to_s) unless @data_path.exist?
+      end
+      def index_data
+        # parse json
+        raw_data = File.open(@data_path) do |io|
+          JSON.parse(io.read)
+        end
+        # check status
+        api_status = raw_data.dig('GET_STATS_DATA', 'RESULT', 'STATUS')
+        if api_status != 0
+          # remove error response cache manually
+          FileUtils.rm(@data_path)
+          error_msg = raw_data.dig('GET_STATS_DATA', 'RESULT', 'ERROR_MSG')
+          raise APIError, "code #{api_status} : #{error_msg}"
+        end
+        # index data
+        ## table_def = extract_def(raw_data, "tab")
+        timetable_def = extract_def(raw_data, 'time')
+        column_def = extract_def(raw_data, 'cat01')
+        area_def = extract_def(raw_data, 'area')
+        @time_tables = index_def(timetable_def)
+        @columns = index_def(column_def)
+        @areas = index_def(area_def)
+        ## apply time_range to time_tables
+        @time_tables.select! { |k, _v| @time_tables.keys[@time_range].include? k } if @time_range.instance_of?(Range)
+        @indexed_data = Hash[*@time_tables.keys.map { |x| [x, {}] }.flatten]
+        get_values(raw_data).each do |row|
+          next unless @time_tables.key?(row['@time'])
+          data = @indexed_data.dig(row['@time'], row['@area']) || {}
+          new_data = data.merge(row['@cat01'] => row['$'].to_f)
+          @indexed_data[row['@time']][row['@area']] = new_data
+        end
+        skip_areas
+        skip_nil_column
+        @schema = create_header
+        @loaded = true
+      end
+      def skip_areas
+        # skip levels
+        @areas.reject! { |_key, x| @skip_levels.include? x['@level'].to_i }
+        # skip area that has children
+        if @skip_parent_area
+          # inspect hieralchy of areas
+          @areas.each do |_a_key, a_value|
+            next unless @areas.key? a_value['@parentCode']
+            @areas[a_value['@parentCode']][:has_children] = true
+          end
+          # filter areas without children
+          @areas.reject! { |_key, x| x[:has_children] }
+        end
+        # skip child area
+        @areas.reject! { |_a_key, a_value| (@areas.key? a_value['@parentCode']) } if @skip_child_area
+      end
+      def skip_nil_column
+        return unless @skip_nil_column
+        # filter time_tables and columns
+        @areas.each do |a_key, _a_value|
+          @time_tables.each do |st_key, st_value|
+            unless @indexed_data[st_key].key?(a_key)
+              st_value[:skip] = true
+              next
+            end
+            @columns.each do |c_key, c_value|
+              unless @indexed_data.dig(st_key, a_key).key?(c_key)
+                c_value[:skip] = true
+                next
+              end
+            end
+          end
+        end
+      end
+      def create_header
+        schema = []
+        @time_tables.reject { |_key, x| x[:skip] }.each do |_st_key, st_value|
+          @columns.reject { |_key, x| x[:skip] }.each do |_c_key, c_value|
+            schema << "#{st_value['@name']}_#{c_value['@name']}"
+          end
+        end
+        schema
+      end
+    end
+    class ArgumentError < Error # raised by StatsData#initialize when no App ID is set; inside EStatJapan this name takes precedence over ::ArgumentError
+    end
+    class APIError < Error # raised by StatsData#index_data when the response STATUS is not 0
+    end
+  end
+end