red-datasets 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,198 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Communities < Dataset
7
# One row of communities.data.
#
# Members are listed in file column order: `each` builds records
# positionally with `Record.new(*row)`, so the order here must not change.
# Column index 3 (community_name) is the only one `each` keeps as text.
Record = Struct.new(*%i[
  state
  county
  community
  community_name
  fold
  population
  household_size
  race_percent_black
  race_percent_white
  race_percent_asian
  race_percent_hispanic
  age_percent_12_to_21
  age_percent_12_to_29
  age_percent_16_to_24
  age_percent_65_and_upper
  n_people_urban
  percent_people_urban
  median_income
  percent_households_with_wage
  percent_households_with_farm_self
  percent_households_with_investment_income
  percent_households_with_social_security
  percent_households_with_public_assistant
  percent_households_with_retire
  median_family_income
  per_capita_income
  per_capita_income_white
  per_capita_income_black
  per_capita_income_indian
  per_capita_income_asian
  per_capita_income_other
  per_capita_income_hispanic
  n_people_under_poverty
  percent_people_under_poverty
  percent_less_9th_grade
  percent_not_high_school_graduate
  percent_bachelors_or_more
  percent_unemployed
  percent_employed
  percent_employed_manufacturing
  percent_employed_professional_service
  percent_occupations_manufacturing
  percent_occupations_management_professional
  male_percent_divorced
  male_percent_never_married
  female_percent_divorced
  total_percent_divorced
  mean_persons_per_family
  percent_family_2_parents
  percent_kids_2_parents
  percent_young_kids_2_parents
  percent_teen_2_parents
  percent_work_mom_young_kids
  percent_work_mom
  n_illegals
  percent_illegals
  n_immigrants
  percent_immigrants_recent
  percent_immigrants_recent_5
  percent_immigrants_recent_8
  percent_immigrants_recent_10
  percent_population_immigranted_recent
  percent_population_immigranted_recent_5
  percent_population_immigranted_recent_8
  percent_population_immigranted_recent_10
  percent_speak_english_only
  percent_not_speak_english_well
  percent_large_households_family
  percent_large_households_occupied
  mean_persons_per_occupied_household
  mean_persons_per_owner_occupied_household
  mean_persons_per_rental_occupied_household
  percent_persons_owner_occupied_household
  percent_persons_dense_housing
  percent_housing_less_3_bedrooms
  median_n_bedrooms
  n_vacant_households
  percent_housing_occupied
  percent_housing_owner_occupied
  percent_vacant_housing_boarded
  percent_vacant_housing_more_6_months
  median_year_housing_built
  percent_housing_no_phone
  percent_housing_without_full_plumbing
  owner_occupied_housing_lower_quartile
  owner_occupied_housing_median
  owner_occupied_housing_higher_quartile
  rental_housing_lower_quartile
  rental_housing_median
  rental_housing_higher_quartile
  median_rent
  median_rent_percent_household_income
  median_owner_cost_percent_household_income
  median_owner_cost_percent_household_income_no_mortgage
  n_people_shelter
  n_people_street
  percent_foreign_born
  percent_born_same_state
  percent_same_house_85
  percent_same_city_85
  percent_same_state_85
  lemas_sworn_full_time
  lemas_sworn_full_time_per_population
  lemas_sworn_full_time_field
  lemas_sworn_full_time_field_per_population
  lemas_total_requests
  lemas_total_requests_per_population
  total_requests_per_officer
  n_officers_per_population
  racial_match_community_police
  percent_police_white
  percent_police_black
  percent_police_hispanic
  percent_police_asian
  percent_police_minority
  n_officers_assigned_drug_units
  n_kinds_drugs_seized
  police_average_overtime_worked
  land_area
  population_density
  percent_use_public_transit
  n_police_cars
  n_police_operating_budget
  lemas_percent_police_on_patrol
  lemas_gang_unit_deployed
  lemas_percent_office_drug_units
  police_operating_budget_per_population
  total_violent_crimes_per_population
])
137
+
138
# Fills in the dataset metadata.
#
# The description is stored as a lambda that calls `read_names`, so the
# names file is only read (and downloaded if missing) when that lambda
# is invoked.
def initialize
  super()
  @metadata.id = "communities"
  @metadata.name = "Communities"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
  @metadata.description = -> { read_names }
end
147
+
148
# Yields one Record per CSV row of the data file.
#
# Cell conversion:
# * "?" (the file's missing-value marker) becomes nil, in every column;
# * column 3 (communityname) is passed through as a String;
# * every other cell goes through Float(), which raises on malformed input.
#
# Column 124 (LemasGangUnitDeploy) is also converted with Float():
# 0 means NO, 1 means YES, 0.5 means Part Time.
#
# Returns an Enumerator when called without a block.
def each
  return to_enum(__method__) unless block_given?

  text_column = 3 # communityname

  open_data do |csv|
    csv.each do |cells|
      values = cells.map.with_index do |cell, index|
        next nil if cell == "?"

        index == text_column ? cell : Float(cell)
      end
      yield(Record.new(*values))
    end
  end
end
172
+
173
+ private
174
# Directory URL that `open_data` and `read_names` append file names to.
def base_url
  "https://archive.ics.uci.edu/ml/machine-learning-databases/communities"
end
177
+
178
# Opens the cached communities.data file as CSV and yields the CSV object.
# The file is downloaded into the cache directory first if it is not
# there yet.
def open_data
  path = cache_dir_path + "communities.data"
  download(path, "#{base_url}/communities.data") unless path.exist?
  CSV.open(path) { |csv| yield(csv) }
end
188
+
189
# Returns the contents of the cached communities.names file, downloading
# it into the cache directory first if it is not there yet.
def read_names
  path = cache_dir_path + "communities.names"
  download(path, "#{base_url}/communities.names") unless path.exist?
  path.read
end
197
+ end
198
+ end
@@ -1,6 +1,7 @@
1
1
  require "pathname"
2
2
 
3
3
  require_relative "downloader"
4
+ require_relative "error"
4
5
  require_relative "metadata"
5
6
  require_relative "table"
6
7
 
@@ -0,0 +1,320 @@
1
+ # frozen_string_literal: true
2
+ require 'digest/md5'
3
+ require 'net/http'
4
+ require 'uri'
5
+ require 'json'
6
+
7
+ module Datasets
8
+ module EStatJapan
9
# One yielded row of StatsData#each: the area code, the area's '@name',
# and the flat list of values for that area.
Record = Struct.new(:id, :name, :values)
10
+ # configuration injection
11
# Mixin that gives its receiver an `app_id` setting and a block-style
# `configure` entry point.
module Configurable
  # App ID used for e-Stat API requests.
  attr_accessor :app_id

  #
  # Yields the receiver so that settings can be assigned inside a block.
  # Returns whatever the block returns.
  # See https://www.e-stat.go.jp/api/api-dev/how_to_use for how to obtain
  # an App ID (Japanese only).
  # @example
  #  Datasets::EStatJapan.configure do |config|
  #    # put your App ID for e-Stat app_id
  #    config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
  #  end
  #
  def configure
    yield(self)
  end
end
27
+
28
+ extend Configurable
29
+
30
+ # wrapper class for e-Stat API service
31
+ class StatsData < Dataset
32
+ attr_accessor :app_id, :id
33
+
34
+ #
35
+ # generate accessor instance for e-Stat API's endpoint `getStatsData`.
36
+ # for detail spec : https://www.e-stat.go.jp/api/api-info/e-stat-manual
37
+ # @param [String] id Statistical data id
38
+ # @param [Array<String>] areas Target areas (fetch all if omitted)
39
+ # @param [Array<String>] categories Category IDs (fetch all if omitted)
40
+ # @param [Array<String>] times Time axes (fetch all if omitted)
41
+ # @param [Array<Number>] skip_levels Skip levels for parsing (defaults to `[1]`)
42
+ # @param [String] hierarchy_selection Select target from 'child', 'parent', or 'both'. (Example: 札幌市○○区 -> 'child':札幌市○○区 only; 'parent':札幌市 only; 'both': Both selected) (defaults to `child`)
43
+ # @param [Boolean] skip_nil_column Skip column if contains nil
44
+ # @param [Boolean] skip_nil_row Skip row if contains nil
45
+ # @example
46
+ # stats_data = Datasets::EStatJapan::StatsData.new(
47
+ # "0000020201", # A Population and household (key name: A 人口・世帯)
48
+ # categories: ["A1101"], # Population (key name: A1101_人口総数)
49
+ # areas: ["01105", "01106"], # Toyohira-ku Sapporo-shi Hokkaido, Minami-ku Sapporo-shi Hokkaido
50
+ # times: ["1981100000", "1982100000"],
51
+ # hierarchy_selection: 'child',
52
+ # skip_levels: [1],
53
+ # skip_nil_column: true,
54
+ # skip_nil_row: false,
55
+ # )
56
+ #
57
def initialize(id,
               app_id: nil,
               areas: nil, categories: nil, times: nil,
               skip_levels: [1],
               hierarchy_selection: 'child',
               skip_nil_column: true,
               skip_nil_row: false,
               time_range: nil)
  # An explicit keyword wins; otherwise fall back to fetch_app_id.
  @app_id = app_id || fetch_app_id
  app_id_missing = @app_id.nil? || @app_id.empty?
  # ArgumentError resolves to the class of that name declared in this
  # namespace next to APIError, not to Ruby's built-in ::ArgumentError.
  raise ArgumentError, 'Please set app_id via `Datasets::EStatJapan.configure` method, environment var `ESTATJAPAN_APP_ID` or keyword argument `:app_id`' if app_id_missing

  super()

  @api_version = '3.0'
  @base_url = "https://api.e-stat.go.jp/rest/#{@api_version}/app/json/getStatsData"
  @metadata.id = "e-stat-japan-#{@api_version}"
  @metadata.name = "e-Stat API #{@api_version}"
  @metadata.url = @base_url
  @metadata.description = "e-Stat API #{@api_version}"

  @id = id
  @areas = areas
  @categories = categories
  @times = times
  @skip_levels = skip_levels

  # [skip_child_area, skip_parent_area] per selection; any value other
  # than 'child' or 'parent' is treated as 'both' (nothing skipped).
  hierarchy_flags = {
    'child' => [false, true],
    'parent' => [true, false]
  }
  @skip_child_area, @skip_parent_area =
    hierarchy_flags.fetch(hierarchy_selection, [false, false])

  @skip_nil_column = skip_nil_column
  @skip_nil_row = skip_nil_row
  @time_range = time_range

  # The cache file name is derived from the full request URL, so each
  # distinct combination of query parameters gets its own cache file.
  @url = generate_url
  url_digest = Digest::MD5.hexdigest(@url.to_s)
  @data_path = cache_dir_path + "e-stat-japan-#{url_digest}.json"
  @loaded = false
end
105
+
106
+ #
107
+ # fetch data records from Remote API
108
+ # @example
109
+ # indices = []
110
+ # rows = []
111
+ # map_id_name = {}
112
+ # estat.each do |record|
113
+ # # Select Hokkaido prefecture only
114
+ # next unless record.id.to_s.start_with? '01'
115
+ # indices << record.id
116
+ # rows << record.values
117
+ # map_id_name[record.id] = record.name
118
+ # end
119
+ #
120
def each
  return to_enum(__method__) unless block_given?

  load_data

  # One record per area: its values are laid out time-major, then by
  # column, leaving out every time table and column flagged with :skip.
  @areas.each do |area_code, area|
    values = []
    @time_tables.each do |time_code, time_def|
      next if time_def[:skip]

      @columns.each do |column_code, column_def|
        next if column_def[:skip]

        values << @indexed_data.dig(time_code, area_code, column_code)
      end
    end
    next if @skip_nil_row && values.include?(nil)

    yield Record.new(area_code, area['@name'], values)
  end
end
139
+
140
# Area definitions keyed by area code, as left by index_data after area
# filtering. Triggers loading on first use.
def areas
  load_data
  @areas
end

# Time-table definitions keyed by time code. Triggers loading on first use.
def time_tables
  load_data
  @time_tables
end

# Column (cat01) definitions keyed by category code. Triggers loading on
# first use.
def columns
  load_data
  @columns
end

# Header names built by create_header, one per value in a record.
# Triggers loading on first use.
def schema
  load_data
  @schema
end
159
+
160
+ private
161
+
162
# Builds the request URI from @base_url and the query parameters.
#
# The area / category / time filters are added only when the matching
# instance variable is an Array; each is sent as a comma-joined list.
def generate_url
  query = {
    appId: @app_id,
    lang: 'J',
    statsDataId: @id,
    metaGetFlg: 'Y',
    cntGetFlg: 'N',
    sectionHeaderFlg: '1'
  }

  optional_filters = {
    'cdArea' => @areas,
    'cdCat01' => @categories,
    'cdTime' => @times
  }
  optional_filters.each do |name, codes|
    query[name] = codes.join(',') if codes.instance_of?(Array)
  end

  URI.parse("#{@base_url}?#{URI.encode_www_form(query)}")
end
176
+
177
# Returns the CLASS_OBJ entries of a parsed response whose '@id' equals
# +id+, always as an Array (empty when nothing matches).
#
# @param data [Hash] parsed getStatsData response
# @param id [String] class id, e.g. 'time', 'cat01' or 'area'
# @return [Array<Hash>]
def extract_def(data, id)
  class_objects = data.dig('GET_STATS_DATA',
                           'STATISTICAL_DATA',
                           'CLASS_INF',
                           'CLASS_OBJ')
  # NOTE(review): index_def already has to cope with one-element CLASS
  # lists arriving as a bare object; this assumes CLASS_OBJ can be
  # collapsed the same way - confirm against the API manual.
  class_objects = [class_objects] if class_objects.is_a?(Hash)
  # Array(nil) is [], so a response without CLASS_OBJ yields no matches
  # instead of raising NoMethodError.
  Array(class_objects).select { |class_object| class_object['@id'] == id }
end
184
+
185
# Indexes the CLASS entries of the first definition in +data_def+ by
# their '@code'.
#
# The API delivers a one-element CLASS list as a bare object, so a
# non-Array CLASS is wrapped before indexing. Unlike before, the input
# is left unmodified, and an empty +data_def+ (or a missing CLASS)
# gives an empty Hash instead of raising NoMethodError.
#
# @param data_def [Array<Hash>] result of extract_def
# @return [Hash{String => Hash}] '@code' => CLASS entry (same objects)
def index_def(data_def)
  definition = data_def.first
  return {} if definition.nil?

  classes = definition['CLASS']
  classes = [classes].compact unless classes.instance_of?(Array)
  # to_h on the pairs replaces Hash[*pairs.flatten], which passed the
  # whole list through an argument splat.
  classes.map { |entry| [entry['@code'], entry] }.to_h
end
192
+
193
# Returns the VALUE rows of a parsed response, always as an Array so
# that callers can iterate without type checks.
#
# @param data [Hash] parsed getStatsData response
# @return [Array<Hash>]
def get_values(data)
  values = data.dig('GET_STATS_DATA',
                    'STATISTICAL_DATA',
                    'DATA_INF',
                    'VALUE')
  # NOTE(review): assumes a lone VALUE can arrive as a bare object, the
  # way index_def handles CLASS - confirm against the API manual.
  values = [values] if values.is_a?(Hash)
  # Array(nil) is [], covering a response without VALUE.
  Array(values)
end
199
+
200
# App ID fallback used when no :app_id keyword is given: the value set
# on EStatJapan (via configure) first, then the ESTATJAPAN_APP_ID
# environment variable. Returns nil when neither is set.
def fetch_app_id
  configured = EStatJapan.app_id
  configured || ENV['ESTATJAPAN_APP_ID']
end
203
+
204
# Fetches and indexes the data once; later calls do nothing because
# index_data sets @loaded when it completes.
def load_data
  unless @loaded
    fetch_data
    index_data
  end
end
210
+
211
# Downloads the API response into the cache file unless it is already
# there.
#
# MEMO:
#  The e-Stat API always returns 200 (OK), even when an error occurred,
#  despite its documented error mapping. An error response therefore
#  ends up in the cache as well; index_data removes it again after
#  checking the status inside the payload.
#  ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
def fetch_data
  return if @data_path.exist?

  download(@data_path, @url.to_s)
end
219
+
220
# Parses the cached API response and builds the lookup tables used by
# #each and the public accessors:
#
# * @time_tables, @columns, @areas - definitions keyed by code
# * @indexed_data - time code => area code => category code => Float
# * @schema - header names from create_header
#
# Sets @loaded once everything is in place.
#
# @raise [APIError] when the payload's RESULT/STATUS is not 0; the
#   cached response is deleted first so that the next call re-fetches.
def index_data
  raw_data = JSON.parse(File.read(@data_path))

  # check status
  api_status = raw_data.dig('GET_STATS_DATA', 'RESULT', 'STATUS')
  if api_status != 0
    # Remove the cached error response. File.delete is core Ruby, so
    # this no longer depends on 'fileutils' having been loaded elsewhere.
    File.delete(@data_path)
    error_msg = raw_data.dig('GET_STATS_DATA', 'RESULT', 'ERROR_MSG')
    raise APIError, "code #{api_status} : #{error_msg}"
  end

  # index definitions
  @time_tables = index_def(extract_def(raw_data, 'time'))
  @columns = index_def(extract_def(raw_data, 'cat01'))
  @areas = index_def(extract_def(raw_data, 'area'))

  # Apply time_range to time_tables. The selected keys are captured
  # BEFORE filtering: select! deletes entries while it iterates, so
  # re-reading @time_tables.keys inside the block shifted the positions
  # and wrongly dropped entries for ranges not starting at the first
  # key. An out-of-bounds range (keys[range] is nil) now selects nothing
  # instead of raising.
  if @time_range.instance_of?(Range)
    selected_times = @time_tables.keys[@time_range] || []
    @time_tables.select! { |time_code, _definition| selected_times.include?(time_code) }
  end

  @indexed_data = @time_tables.keys.map { |time_code| [time_code, {}] }.to_h
  get_values(raw_data).each do |row|
    by_area = @indexed_data[row['@time']]
    next if by_area.nil? # row belongs to a time table that was filtered out

    # NOTE(review): String#to_f turns any non-numeric text into 0.0;
    # confirm whether the API can send such markers for missing values.
    (by_area[row['@area']] ||= {})[row['@cat01']] = row['$'].to_f
  end

  skip_areas
  skip_nil_column
  @schema = create_header
  @loaded = true
end
262
+
263
# Filters @areas in place according to @skip_levels and the hierarchy
# flags set in initialize.
#
# 1. Areas whose '@level' (as an Integer) is listed in @skip_levels go.
# 2. With @skip_parent_area, every remaining area that is the
#    '@parentCode' of another remaining area goes (areas WITH children).
# 3. With @skip_child_area, every remaining area whose '@parentCode' is
#    itself a remaining area goes.
#
# Steps 2 and 3 decide against a snapshot taken before any deletion.
# Previously step 3 looked parents up in the hash it was deleting from,
# so an area survived whenever its parent had been removed earlier in
# the same pass (e.g. the grandchild in a three-level hierarchy).
def skip_areas
  # skip levels
  @areas.reject! { |_code, area| @skip_levels.include?(area['@level'].to_i) }

  # skip areas that have children
  if @skip_parent_area
    parent_codes = {}
    @areas.each_value { |area| parent_codes[area['@parentCode']] = true }
    @areas.reject! { |code, _area| parent_codes.key?(code) }
  end

  # skip child areas
  return unless @skip_child_area

  present = @areas.dup # shallow copy: only used for key lookups
  @areas.reject! { |_code, area| present.key?(area['@parentCode']) }
end
282
+
283
# Flags time tables and columns that would produce nil cells, so that
# #each and create_header leave them out. Does nothing unless the
# @skip_nil_column option is set.
#
# For every (area, time table) pair:
# * no data at all for the area  -> the time table gets :skip
# * otherwise, each column without a value for that pair gets :skip
def skip_nil_column
  return unless @skip_nil_column

  @areas.each_key do |area_code|
    @time_tables.each do |time_code, time_def|
      per_area = @indexed_data[time_code]
      if per_area.key?(area_code)
        observed = per_area[area_code]
        @columns.each do |column_code, column_def|
          column_def[:skip] = true unless observed.key?(column_code)
        end
      else
        time_def[:skip] = true
      end
    end
  end
end
302
+
303
# Builds the header names matching the value order of #each: one
# "<time name>_<column name>" entry per non-skipped time table and
# non-skipped column, time-major.
def create_header
  @time_tables.each_value.reject { |time_def| time_def[:skip] }.flat_map do |time_def|
    @columns.each_value.reject { |column_def| column_def[:skip] }.map do |column_def|
      "#{time_def['@name']}_#{column_def['@name']}"
    end
  end
end
312
+ end
313
+
314
# Raised by StatsData#initialize when no App ID can be determined.
# Declared inside this namespace, so within it the bare name
# ArgumentError refers to this class rather than Ruby's ::ArgumentError.
class ArgumentError < Error
end
316
+
317
# Raised by StatsData when the API payload reports a non-zero
# RESULT/STATUS; the message carries the status code and ERROR_MSG.
class APIError < Error
end
319
+ end
320
+ end