red-datasets 0.0.7 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,198 @@
+ require "csv"
+
+ require_relative "dataset"
+
+ module Datasets
+   class Communities < Dataset
+     Record = Struct.new(
+       :state,
+       :county,
+       :community,
+       :community_name,
+       :fold,
+       :population,
+       :household_size,
+       :race_percent_black,
+       :race_percent_white,
+       :race_percent_asian,
+       :race_percent_hispanic,
+       :age_percent_12_to_21,
+       :age_percent_12_to_29,
+       :age_percent_16_to_24,
+       :age_percent_65_and_upper,
+       :n_people_urban,
+       :percent_people_urban,
+       :median_income,
+       :percent_households_with_wage,
+       :percent_households_with_farm_self,
+       :percent_households_with_investment_income,
+       :percent_households_with_social_security,
+       :percent_households_with_public_assistant,
+       :percent_households_with_retire,
+       :median_family_income,
+       :per_capita_income,
+       :per_capita_income_white,
+       :per_capita_income_black,
+       :per_capita_income_indian,
+       :per_capita_income_asian,
+       :per_capita_income_other,
+       :per_capita_income_hispanic,
+       :n_people_under_poverty,
+       :percent_people_under_poverty,
+       :percent_less_9th_grade,
+       :percent_not_high_school_graduate,
+       :percent_bachelors_or_more,
+       :percent_unemployed,
+       :percent_employed,
+       :percent_employed_manufacturing,
+       :percent_employed_professional_service,
+       :percent_occupations_manufacturing,
+       :percent_occupations_management_professional,
+       :male_percent_divorced,
+       :male_percent_never_married,
+       :female_percent_divorced,
+       :total_percent_divorced,
+       :mean_persons_per_family,
+       :percent_family_2_parents,
+       :percent_kids_2_parents,
+       :percent_young_kids_2_parents,
+       :percent_teen_2_parents,
+       :percent_work_mom_young_kids,
+       :percent_work_mom,
+       :n_illegals,
+       :percent_illegals,
+       :n_immigrants,
+       :percent_immigrants_recent,
+       :percent_immigrants_recent_5,
+       :percent_immigrants_recent_8,
+       :percent_immigrants_recent_10,
+       :percent_population_immigranted_recent,
+       :percent_population_immigranted_recent_5,
+       :percent_population_immigranted_recent_8,
+       :percent_population_immigranted_recent_10,
+       :percent_speak_english_only,
+       :percent_not_speak_english_well,
+       :percent_large_households_family,
+       :percent_large_households_occupied,
+       :mean_persons_per_occupied_household,
+       :mean_persons_per_owner_occupied_household,
+       :mean_persons_per_rental_occupied_household,
+       :percent_persons_owner_occupied_household,
+       :percent_persons_dense_housing,
+       :percent_housing_less_3_bedrooms,
+       :median_n_bedrooms,
+       :n_vacant_households,
+       :percent_housing_occupied,
+       :percent_housing_owner_occupied,
+       :percent_vacant_housing_boarded,
+       :percent_vacant_housing_more_6_months,
+       :median_year_housing_built,
+       :percent_housing_no_phone,
+       :percent_housing_without_full_plumbing,
+       :owner_occupied_housing_lower_quartile,
+       :owner_occupied_housing_median,
+       :owner_occupied_housing_higher_quartile,
+       :rental_housing_lower_quartile,
+       :rental_housing_median,
+       :rental_housing_higher_quartile,
+       :median_rent,
+       :median_rent_percent_household_income,
+       :median_owner_cost_percent_household_income,
+       :median_owner_cost_percent_household_income_no_mortgage,
+       :n_people_shelter,
+       :n_people_street,
+       :percent_foreign_born,
+       :percent_born_same_state,
+       :percent_same_house_85,
+       :percent_same_city_85,
+       :percent_same_state_85,
+       :lemas_sworn_full_time,
+       :lemas_sworn_full_time_per_population,
+       :lemas_sworn_full_time_field,
+       :lemas_sworn_full_time_field_per_population,
+       :lemas_total_requests,
+       :lemas_total_requests_per_population,
+       :total_requests_per_officer,
+       :n_officers_per_population,
+       :racial_match_community_police,
+       :percent_police_white,
+       :percent_police_black,
+       :percent_police_hispanic,
+       :percent_police_asian,
+       :percent_police_minority,
+       :n_officers_assigned_drug_units,
+       :n_kinds_drugs_seized,
+       :police_average_overtime_worked,
+       :land_area,
+       :population_density,
+       :percent_use_public_transit,
+       :n_police_cars,
+       :n_police_operating_budget,
+       :lemas_percent_police_on_patrol,
+       :lemas_gang_unit_deployed,
+       :lemas_percent_office_drug_units,
+       :police_operating_budget_per_population,
+       :total_violent_crimes_per_population
+     )
+
+     def initialize
+       super()
+       @metadata.id = "communities"
+       @metadata.name = "Communities"
+       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
+       @metadata.description = lambda do
+         read_names
+       end
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |csv|
+         csv.each do |row|
+           row = row.collect.with_index do |column, i|
+             if column == "?"
+               nil
+             else
+               case i
+               when 3 # communityname
+                 # when 124 # LemasGangUnitDeploy
+                 # 0 means NO, 1 means YES, 0.5 means Part Time
+               else
+                 column = Float(column)
+               end
+               column
+             end
+           end
+           record = Record.new(*row)
+           yield(record)
+         end
+       end
+     end
+
+     private
+     def base_url
+       "https://archive.ics.uci.edu/ml/machine-learning-databases/communities"
+     end
+
+     def open_data
+       data_path = cache_dir_path + "communities.data"
+       unless data_path.exist?
+         data_url = "#{base_url}/communities.data"
+         download(data_path, data_url)
+       end
+       CSV.open(data_path) do |csv|
+         yield(csv)
+       end
+     end
+
+     def read_names
+       names_path = cache_dir_path + "communities.names"
+       unless names_path.exist?
+         names_url = "#{base_url}/communities.names"
+         download(names_path, names_url)
+       end
+       names_path.read
+     end
+   end
+ end
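
The new Communities class wraps the UCI "Communities and Crime" data: on first use it downloads communities.data into the cache directory, yields one Record struct per CSV row, maps "?" columns to nil, and converts every other column except communityname to Float. A minimal usage sketch, assuming the gem is installed and loaded via `require "datasets"`:

```ruby
# Sketch only: iterating the Communities dataset added in this release.
require "datasets"

communities = Datasets::Communities.new

# `each` yields Record structs; "?" values in the CSV arrive as nil.
communities.each do |record|
  puts "#{record.community_name}: population=#{record.population}"
end

# Without a block, `each` returns an Enumerator, so the usual helpers work.
top = communities.each.max_by { |r| r.total_violent_crimes_per_population.to_f }
puts top.community_name
```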
@@ -1,6 +1,7 @@
  require "pathname"

  require_relative "downloader"
+ require_relative "error"
  require_relative "metadata"
  require_relative "table"

@@ -17,11 +18,17 @@ module Datasets
        Table.new(self)
      end

+     def clear_cache!
+       if cache_dir_path.exist?
+         FileUtils.rmtree(cache_dir_path.to_s, secure: true)
+       end
+     end
+
      private
      def cache_dir_path
        case RUBY_PLATFORM
        when /mswin/, /mingw/
-         base_dir = ENV["LOCALAPPDATA"] || "~/AppData"
+         base_dir = ENV["LOCALAPPDATA"] || "~/AppData/Local"
        when /darwin/
          base_dir = "~/Library/Caches"
        else
@@ -34,5 +41,17 @@ module Datasets
        downloader = Downloader.new(url)
        downloader.download(output_path)
      end
+
+     def extract_bz2(path)
+       input, output = IO.pipe
+       pid = spawn("bzcat", path.to_s, {:out => output})
+       begin
+         output.close
+         yield(input)
+       ensure
+         input.close
+         Process.waitpid(pid)
+       end
+     end
    end
  end
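
Two helpers land in the Dataset base class: a public clear_cache! that removes the per-dataset cache directory, and a private extract_bz2 that pipes a cached file through bzcat for subclasses shipping bzip2-compressed data. A hedged sketch of both follows; the MyBz2Dataset class, its URL, file name, and the use of @metadata.id for the cache directory are assumptions for illustration, while clear_cache!, cache_dir_path, download, and extract_bz2 come from the diff above:

```ruby
require "datasets"

# clear_cache! is public: it deletes the dataset's cache directory if present.
communities = Datasets::Communities.new
communities.clear_cache!

# extract_bz2 is private and intended for subclasses; a hypothetical dataset
# serving a bzip2-compressed CSV could stream it through bzcat like this.
class MyBz2Dataset < Datasets::Dataset
  def initialize
    super()
    @metadata.id = "my-bz2-dataset" # assumed to name the cache directory
  end

  def each
    return to_enum(__method__) unless block_given?
    data_path = cache_dir_path + "data.csv.bz2"
    download(data_path, "https://example.com/data.csv.bz2") unless data_path.exist?
    extract_bz2(data_path) do |input|
      input.each_line { |line| yield line.chomp }
    end
  end
end
```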
@@ -8,6 +8,8 @@ require "pathname"

  module Datasets
    class Downloader
+     class TooManyRedirects < StandardError; end
+
      def initialize(url)
        if url.is_a?(URI::Generic)
          url = url.dup
@@ -31,39 +33,65 @@ module Datasets
          headers["Range"] = "bytes=#{start}-"
        end

-       Net::HTTP.start(@url.hostname,
-                       @url.port,
-                       :use_ssl => (@url.scheme == "https")) do |http|
-         request = Net::HTTP::Get.new(@url.path, headers)
+       start_http(@url, headers) do |response|
+         if response.is_a?(Net::HTTPPartialContent)
+           mode = "ab"
+         else
+           start = nil
+           mode = "wb"
+         end
+
+         base_name = @url.path.split("/").last
+         size_current = 0
+         size_max = response.content_length
+         if start
+           size_current += start
+           size_max += start
+         end
+         progress_reporter = ProgressReporter.new(base_name, size_max)
+         partial_output_path.open(mode) do |output|
+           response.read_body do |chunk|
+             size_current += chunk.bytesize
+             progress_reporter.report(size_current)
+             output.write(chunk)
+           end
+         end
+       end
+       FileUtils.mv(partial_output_path, output_path)
+     rescue TooManyRedirects => error
+       last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
+       raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
+     end
+
+     private def start_http(url, headers, limit = 10, &block)
+       if limit == 0
+         raise TooManyRedirects, "too many redirections: #{url}"
+       end
+       http = Net::HTTP.new(url.hostname, url.port)
+       # http.set_debug_output($stderr)
+       http.use_ssl = (url.scheme == "https")
+       http.start do
+         path = url.path
+         path += "?#{url.query}" if url.query
+         request = Net::HTTP::Get.new(path, headers)
          http.request(request) do |response|
            case response
-           when Net::HTTPPartialContent
-             mode = "ab"
-           when Net::HTTPSuccess
-             start = nil
-             mode = "wb"
+           when Net::HTTPSuccess, Net::HTTPPartialContent
+             return block.call(response)
+           when Net::HTTPRedirection
+             url = URI.parse(response[:location])
+             $stderr.puts "Redirect to #{url}"
+             return start_http(url, headers, limit - 1, &block)
            else
-             break
-           end
-
-           base_name = @url.path.split("/").last
-           size_current = 0
-           size_max = response.content_length
-           if start
-             size_current += start
-             size_max += start
-           end
-           progress_reporter = ProgressReporter.new(base_name, size_max)
-           partial_output_path.open(mode) do |output|
-             response.read_body do |chunk|
-               size_current += chunk.bytesize
-               progress_reporter.report(size_current)
-               output.write(chunk)
+             message = response.code
+             if response.message and not response.message.empty?
+               message += ": #{response.message}"
              end
+             message += ": #{url}"
+             raise response.error_type.new(message, response)
            end
          end
        end
-       FileUtils.mv(partial_output_path, output_path)
      end

      class ProgressReporter
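
The Downloader rework moves the HTTP round trip into a private start_http helper that follows redirects (up to 10, recursing with limit - 1), keeps Range-based resume of interrupted downloads, and raises descriptive errors: TooManyRedirects when the redirect budget is exhausted, and the matching Net::HTTP error type for any other non-success response. A sketch of calling it directly; the URL and output path are placeholders, everything else is taken from the diff:

```ruby
require "datasets"
require "pathname"

downloader = Datasets::Downloader.new("https://example.com/data/file.csv")
begin
  # Follows redirects, resumes a previously interrupted partial download
  # if one exists, and reports progress while writing to the output path.
  downloader.download(Pathname("/tmp/file.csv"))
rescue Datasets::Downloader::TooManyRedirects => e
  # Raised after ten consecutive redirects; the message carries the
  # original and last-seen URLs.
  warn e.message
end
```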
@@ -0,0 +1,320 @@
+ # frozen_string_literal: true
+ require 'digest/md5'
+ require 'net/http'
+ require 'uri'
+ require 'json'
+
+ module Datasets
+   module EStatJapan
+     Record = Struct.new(:id, :name, :values)
+     # configuration injection
+     module Configurable
+       attr_accessor :app_id
+
+       #
+       # configuration for e-Stat API
+       # See detail at https://www.e-stat.go.jp/api/api-dev/how_to_use (Japanese only).
+       # @example
+       #   Datasets::EStatJapan.configure do |config|
+       #     # put your App ID for e-Stat app_id
+       #     config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
+       #   end
+       #
+       def configure
+         yield self
+       end
+     end
+
+     extend Configurable
+
+     # wrapper class for e-Stat API service
+     class StatsData < Dataset
+       attr_accessor :app_id, :id
+
+       #
+       # generate accessor instance for e-Stat API's endpoint `getStatsData`.
+       # for detail spec : https://www.e-stat.go.jp/api/api-info/e-stat-manual
+       # @param [String] id Statistical data id
+       # @param [Array<String>] areas Target areas (fetch all if omitted)
+       # @param [Array<String>] categories Category IDs (fetch all if omitted)
+       # @param [Array<String>] times Time axes (fetch all if omitted)
+       # @param [Array<Number>] skip_levels Skip levels for parsing (defaults to `[1]`)
+       # @param [String] hierarchy_selection Select target from 'child', 'parent', or 'both'. (Example: 札幌市○○区 -> 'child':札幌市○○区 only; 'parent':札幌市 only; 'both': Both selected) (defaults to `both`)
+       # @param [Boolean] skip_nil_column Skip column if contains nil
+       # @param [Boolean] skip_nil_row Skip row if contains nil
+       # @example
+       #   stats_data = Datasets::EStatJapan::StatsData.new(
+       #     "0000020201", # A Population and household (key name: A 人口・世帯)
+       #     categories: ["A1101"], # Population (key name: A1101_人口総数)
+       #     areas: ["01105", "01106"], # Toyohira-ku Sapporo-shi Hokkaido, Minami-ku Sapporo-shi Hokkaido
+       #     times: ["1981100000", "1982100000"],
+       #     hierarchy_selection: 'child',
+       #     skip_child_area: true,
+       #     skip_nil_column: true,
+       #     skip_nil_row: false,
+       #   )
+       #
+       def initialize(id,
+                      app_id: nil,
+                      areas: nil, categories: nil, times: nil,
+                      skip_levels: [1],
+                      hierarchy_selection: 'child',
+                      skip_nil_column: true,
+                      skip_nil_row: false,
+                      time_range: nil)
+         @app_id = app_id || fetch_app_id
+         if @app_id.nil? || @app_id.empty?
+           raise ArgumentError, 'Please set app_id via `Datasets::EStatJapan.configure` method, environment var `ESTATJAPAN_APP_ID` or keyword argument `:app_id`'
+         end
+
+         super()
+
+         @api_version = '3.0'
+         @base_url = "https://api.e-stat.go.jp/rest/#{@api_version}/app/json/getStatsData"
+         @metadata.id = "e-stat-japan-#{@api_version}"
+         @metadata.name = "e-Stat API #{@api_version}"
+         @metadata.url = @base_url
+         @metadata.description = "e-Stat API #{@api_version}"
+
+         @id = id
+         @areas = areas
+         @categories = categories
+         @times = times
+         @skip_levels = skip_levels
+         case hierarchy_selection
+         when 'child' then
+           @skip_child_area = false
+           @skip_parent_area = true
+         when 'parent' then
+           @skip_child_area = true
+           @skip_parent_area = false
+         else # 'both'
+           @skip_child_area = false
+           @skip_parent_area = false
+         end
+         @skip_nil_column = skip_nil_column
+         @skip_nil_row = skip_nil_row
+         @time_range = time_range
+
+         @url = generate_url
+         option_hash = Digest::MD5.hexdigest(@url.to_s)
+         base_name = "e-stat-japan-#{option_hash}.json"
+         @data_path = cache_dir_path + base_name
+         @loaded = false
+       end
+
+       #
+       # fetch data records from Remote API
+       # @example
+       #   indices = []
+       #   rows = []
+       #   map_id_name = {}
+       #   estat.each do |record|
+       #     # Select Hokkaido prefecture only
+       #     next unless record.id.to_s.start_with? '01'
+       #     indices << record.id
+       #     rows << record.values
+       #     map_id_name[record.id] = record.name
+       #   end
+       #
+       def each
+         return to_enum(__method__) unless block_given?
+
+         load_data
+
+         # create rows
+         @areas.each do |a_key, a_value|
+           rows = []
+           @time_tables.reject { |_key, x| x[:skip] }.each do |st_key, _st_value|
+             row = @columns.reject { |_key, x| x[:skip] }.map do |c_key, _c_value|
+               @indexed_data.dig(st_key, a_key, c_key)
+             end
+             rows << row
+           end
+           next if @skip_nil_row && rows.flatten.count(nil).positive?
+
+           yield Record.new(a_key, a_value['@name'], rows.flatten)
+         end
+       end
+
+       def areas
+         load_data
+         @areas
+       end
+
+       def time_tables
+         load_data
+         @time_tables
+       end
+
+       def columns
+         load_data
+         @columns
+       end
+
+       def schema
+         load_data
+         @schema
+       end
+
+       private
+
+       def generate_url
+         # generates url for query
+         params = {
+           appId: @app_id, lang: 'J',
+           statsDataId: @id,
+           metaGetFlg: 'Y', cntGetFlg: 'N',
+           sectionHeaderFlg: '1'
+         }
+         params['cdArea'] = @areas.join(',') if @areas.instance_of?(Array)
+         params['cdCat01'] = @categories.join(',') if @categories.instance_of?(Array)
+         params['cdTime'] = @times.join(',') if @times.instance_of?(Array)
+
+         URI.parse("#{@base_url}?#{URI.encode_www_form(params)}")
+       end
+
+       def extract_def(data, id)
+         rec = data.dig('GET_STATS_DATA',
+                        'STATISTICAL_DATA',
+                        'CLASS_INF',
+                        'CLASS_OBJ')
+         rec.select { |x| x['@id'] == id }
+       end
+
+       def index_def(data_def)
+         unless data_def.first['CLASS'].instance_of?(Array)
+           # convert to array when number of element is 1
+           data_def.first['CLASS'] = [data_def.first['CLASS']]
+         end
+         Hash[*data_def.first['CLASS'].map { |x| [x['@code'], x] }.flatten]
+       end
+
+       def get_values(data)
+         data.dig('GET_STATS_DATA',
+                  'STATISTICAL_DATA',
+                  'DATA_INF',
+                  'VALUE')
+       end
+
+       def fetch_app_id
+         EStatJapan.app_id || ENV['ESTATJAPAN_APP_ID']
+       end
+
+       def load_data
+         return if @loaded
+
+         fetch_data
+         index_data
+       end
+
+       def fetch_data
+         # MEMO:
+         # The e-stat api always returns 200 (Ok)
+         # even if an error happens, despite its error mapping.
+         # So we can't avoid caching the retrieved response from the api.
+         # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
+         download(@data_path, @url.to_s) unless @data_path.exist?
+       end
+
+       def index_data
+         # parse json
+         raw_data = File.open(@data_path) do |io|
+           JSON.parse(io.read)
+         end
+
+         # check status
+         api_status = raw_data.dig('GET_STATS_DATA', 'RESULT', 'STATUS')
+         if api_status != 0
+           # remove error response cache manually
+           FileUtils.rm(@data_path)
+           error_msg = raw_data.dig('GET_STATS_DATA', 'RESULT', 'ERROR_MSG')
+           raise APIError, "code #{api_status} : #{error_msg}"
+         end
+
+         # index data
+         ## table_def = extract_def(raw_data, "tab")
+         timetable_def = extract_def(raw_data, 'time')
+         column_def = extract_def(raw_data, 'cat01')
+         area_def = extract_def(raw_data, 'area')
+
+         @time_tables = index_def(timetable_def)
+         @columns = index_def(column_def)
+         @areas = index_def(area_def)
+
+         ## apply time_range to time_tables
+         @time_tables.select! { |k, _v| @time_tables.keys[@time_range].include? k } if @time_range.instance_of?(Range)
+
+         @indexed_data = Hash[*@time_tables.keys.map { |x| [x, {}] }.flatten]
+         get_values(raw_data).each do |row|
+           next unless @time_tables.key?(row['@time'])
+
+           data = @indexed_data.dig(row['@time'], row['@area']) || {}
+           new_data = data.merge(row['@cat01'] => row['$'].to_f)
+           @indexed_data[row['@time']][row['@area']] = new_data
+         end
+
+         skip_areas
+         skip_nil_column
+         @schema = create_header
+         @loaded = true
+       end
+
+       def skip_areas
+         # skip levels
+         @areas.reject! { |_key, x| @skip_levels.include? x['@level'].to_i }
+
+         # skip area that has children
+         if @skip_parent_area
+           # inspect hierarchy of areas
+           @areas.each do |_a_key, a_value|
+             next unless @areas.key? a_value['@parentCode']
+
+             @areas[a_value['@parentCode']][:has_children] = true
+           end
+           # filter areas without children
+           @areas.reject! { |_key, x| x[:has_children] }
+         end
+
+         # skip child area
+         @areas.reject! { |_a_key, a_value| (@areas.key? a_value['@parentCode']) } if @skip_child_area
+       end
+
+       def skip_nil_column
+         return unless @skip_nil_column
+
+         # filter time_tables and columns
+         @areas.each do |a_key, _a_value|
+           @time_tables.each do |st_key, st_value|
+             unless @indexed_data[st_key].key?(a_key)
+               st_value[:skip] = true
+               next
+             end
+             @columns.each do |c_key, c_value|
+               unless @indexed_data.dig(st_key, a_key).key?(c_key)
+                 c_value[:skip] = true
+                 next
+               end
+             end
+           end
+         end
+       end
+
+       def create_header
+         schema = []
+         @time_tables.reject { |_key, x| x[:skip] }.each do |_st_key, st_value|
+           @columns.reject { |_key, x| x[:skip] }.each do |_c_key, c_value|
+             schema << "#{st_value['@name']}_#{c_value['@name']}"
+           end
+         end
+         schema
+       end
+     end
+
+     class ArgumentError < Error
+     end
+
+     class APIError < Error
+     end
+   end
+ end
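
Putting the new e-Stat Japan wrapper together: configure an application ID once, build a StatsData accessor for a statsDataId, and iterate records whose values line up with schema. A sketch based on the @example comments above; the statsDataId and category code are the ones used there, and an ESTATJAPAN_APP_ID environment variable is assumed:

```ruby
require "datasets"

# Register the e-Stat application ID once per process.
Datasets::EStatJapan.configure do |config|
  config.app_id = ENV["ESTATJAPAN_APP_ID"]
end

# "0000020201" is the population/household table used in the docs above.
stats_data = Datasets::EStatJapan::StatsData.new(
  "0000020201",
  categories: ["A1101"],      # total population
  hierarchy_selection: "child"
)

# Each record is one area; record.values follows stats_data.schema.
stats_data.schema.first(3).each { |column| puts column }
stats_data.each do |record|
  puts "#{record.name}: #{record.values.first}"
end
```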