red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
require "csv"

require_relative "dataset"

module Datasets
  # The "Communities and Crime" dataset from the UCI Machine Learning
  # Repository. Each record combines US census, law-enforcement (LEMAS)
  # and crime figures for one community.
  class Communities < Dataset
    # One row of communities.data. Field order matches the column order
    # of the upstream CSV file.
    Record = Struct.new(
      :state,
      :county,
      :community,
      :community_name,
      :fold,
      :population,
      :household_size,
      :race_percent_black,
      :race_percent_white,
      :race_percent_asian,
      :race_percent_hispanic,
      :age_percent_12_to_21,
      :age_percent_12_to_29,
      :age_percent_16_to_24,
      :age_percent_65_and_upper,
      :n_people_urban,
      :percent_people_urban,
      :median_income,
      :percent_households_with_wage,
      :percent_households_with_farm_self,
      :percent_households_with_investment_income,
      :percent_households_with_social_security,
      :percent_households_with_public_assistant,
      :percent_households_with_retire,
      :median_family_income,
      :per_capita_income,
      :per_capita_income_white,
      :per_capita_income_black,
      :per_capita_income_indian,
      :per_capita_income_asian,
      :per_capita_income_other,
      :per_capita_income_hispanic,
      :n_people_under_poverty,
      :percent_people_under_poverty,
      :percent_less_9th_grade,
      :percent_not_high_school_graduate,
      :percent_bachelors_or_more,
      :percent_unemployed,
      :percent_employed,
      :percent_employed_manufacturing,
      :percent_employed_professional_service,
      :percent_occupations_manufacturing,
      :percent_occupations_management_professional,
      :male_percent_divorced,
      :male_percent_never_married,
      :female_percent_divorced,
      :total_percent_divorced,
      :mean_persons_per_family,
      :percent_family_2_parents,
      :percent_kids_2_parents,
      :percent_young_kids_2_parents,
      :percent_teen_2_parents,
      :percent_work_mom_young_kids,
      :percent_work_mom,
      :n_illegals,
      :percent_illegals,
      :n_immigrants,
      :percent_immigrants_recent,
      :percent_immigrants_recent_5,
      :percent_immigrants_recent_8,
      :percent_immigrants_recent_10,
      :percent_population_immigranted_recent,
      :percent_population_immigranted_recent_5,
      :percent_population_immigranted_recent_8,
      :percent_population_immigranted_recent_10,
      :percent_speak_english_only,
      :percent_not_speak_english_well,
      :percent_large_households_family,
      :percent_large_households_occupied,
      :mean_persons_per_occupied_household,
      :mean_persons_per_owner_occupied_household,
      :mean_persons_per_rental_occupied_household,
      :percent_persons_owner_occupied_household,
      :percent_persons_dense_housing,
      :percent_housing_less_3_bedrooms,
      :median_n_bedrooms,
      :n_vacant_households,
      :percent_housing_occupied,
      :percent_housing_owner_occupied,
      :percent_vacant_housing_boarded,
      :percent_vacant_housing_more_6_months,
      :median_year_housing_built,
      :percent_housing_no_phone,
      :percent_housing_without_full_plumbing,
      :owner_occupied_housing_lower_quartile,
      :owner_occupied_housing_median,
      :owner_occupied_housing_higher_quartile,
      :rental_housing_lower_quartile,
      :rental_housing_median,
      :rental_housing_higher_quartile,
      :median_rent,
      :median_rent_percent_household_income,
      :median_owner_cost_percent_household_income,
      :median_owner_cost_percent_household_income_no_mortgage,
      :n_people_shelter,
      :n_people_street,
      :percent_foreign_born,
      :percent_born_same_state,
      :percent_same_house_85,
      :percent_same_city_85,
      :percent_same_state_85,
      :lemas_sworn_full_time,
      :lemas_sworn_full_time_per_population,
      :lemas_sworn_full_time_field,
      :lemas_sworn_full_time_field_per_population,
      :lemas_total_requests,
      :lemas_total_requests_per_population,
      :total_requests_per_officer,
      :n_officers_per_population,
      :racial_match_community_police,
      :percent_police_white,
      :percent_police_black,
      :percent_police_hispanic,
      :percent_police_asian,
      :percent_police_minority,
      :n_officers_assigned_drug_units,
      :n_kinds_drugs_seized,
      :police_average_overtime_worked,
      :land_area,
      :population_density,
      :percent_use_public_transit,
      :n_police_cars,
      :n_police_operating_budget,
      :lemas_percent_police_on_patrol,
      :lemas_gang_unit_deployed,
      :lemas_percent_office_drug_units,
      :police_operating_budget_per_population,
      :total_violent_crimes_per_population
    )

    def initialize
      super()
      @metadata.id = "communities"
      @metadata.name = "Communities"
      @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
      # The description is lazy: the upstream "communities.names" file is
      # only downloaded the first time the description is requested.
      @metadata.description = lambda do
        read_names
      end
    end

    # Yields one Record per data row. Without a block, returns an
    # Enumerator. Missing values ("?" in the upstream file) become nil;
    # every column except the community name is converted to Float.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          row = row.collect.with_index do |column, i|
            if column == "?"
              # "?" marks a missing value in the upstream data.
              nil
            else
              case i
              when 3 # communityname: the only textual column, kept as-is
              # when 124 # LemasGangUnitDeploy
              # 0 means NO, 1 means YES, 0.5 means Part Time
              else
                column = Float(column)
              end
              column
            end
          end
          record = Record.new(*row)
          yield(record)
        end
      end
    end

    private
    def base_url
      "https://archive.ics.uci.edu/ml/machine-learning-databases/communities"
    end

    # Downloads communities.data into the cache directory on first use
    # and yields an open CSV reader for it.
    def open_data
      data_path = cache_dir_path + "communities.data"
      unless data_path.exist?
        data_url = "#{base_url}/communities.data"
        download(data_path, data_url)
      end
      CSV.open(data_path) do |csv|
        yield(csv)
      end
    end

    # Downloads (once) and returns the contents of communities.names,
    # which serves as the dataset description.
    def read_names
      names_path = cache_dir_path + "communities.names"
      unless names_path.exist?
        names_url = "#{base_url}/communities.names"
        download(names_path, names_url)
      end
      names_path.read
    end
  end
end
@@ -1,6 +1,7 @@
1
1
  require "pathname"
2
2
 
3
3
  require_relative "downloader"
4
+ require_relative "error"
4
5
  require_relative "metadata"
5
6
  require_relative "table"
6
7
 
@@ -34,5 +35,17 @@ module Datasets
34
35
  downloader = Downloader.new(url)
35
36
  downloader.download(output_path)
36
37
  end
38
+
39
+ def extract_bz2(path)
40
+ input, output = IO.pipe
41
+ pid = spawn("bzcat", path.to_s, {:out => output})
42
+ begin
43
+ output.close
44
+ yield(input)
45
+ ensure
46
+ input.close
47
+ Process.waitpid(pid)
48
+ end
49
+ end
37
50
  end
38
51
  end
module Datasets
  # A bidirectional mapping between arbitrary values and sequential
  # integer IDs. IDs are assigned in first-appearance order starting at
  # 0; duplicated values keep the ID of their first occurrence.
  class Dictionary
    include Enumerable

    # @param values [Enumerable] values to index; duplicates are skipped
    def initialize(values)
      build_dictionary(values)
    end

    # Returns the integer ID assigned to value, or nil if unknown.
    def id(value)
      @value_to_id[value]
    end

    # Returns the value registered under id, or nil if unknown.
    def value(id)
      @id_to_value[id]
    end

    # All assigned IDs, in assignment order.
    def ids
      @id_to_value.keys
    end

    # All distinct values, in first-appearance order.
    def values
      @id_to_value.values
    end

    # Iterates over [id, value] pairs.
    def each(&block)
      @id_to_value.each(&block)
    end

    # Number of distinct values in the dictionary.
    def size
      @id_to_value.size
    end
    alias_method :length, :size

    # Maps each value in the given collection to its ID (nil for unknown).
    def encode(values)
      values.map { |value| id(value) }
    end

    # Maps each ID in the given collection back to its value (nil for unknown).
    def decode(ids)
      ids.map { |encoded_id| value(encoded_id) }
    end

    private
    # Builds both lookup tables. The next free ID is simply the current
    # table size, so IDs stay dense even when duplicates are skipped.
    def build_dictionary(values)
      @id_to_value = {}
      @value_to_id = {}
      values.each do |value|
        next if @value_to_id.key?(value)
        next_id = @value_to_id.size
        @id_to_value[next_id] = value
        @value_to_id[value] = next_id
      end
    end
  end
end
@@ -3,7 +3,7 @@ begin
3
3
  require "io/console"
4
4
  rescue LoadError
5
5
  end
6
- require "open-uri"
6
+ require "net/http"
7
7
  require "pathname"
8
8
 
9
9
  module Datasets
@@ -15,84 +15,59 @@ module Datasets
15
15
  url = URI.parse(url)
16
16
  end
17
17
  @url = url
18
- @url.extend(CurrentBufferReadable)
18
+ unless @url.is_a?(URI::HTTP)
19
+ raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
20
+ end
19
21
  end
20
22
 
21
23
    # Downloads @url to output_path via Net::HTTP, reporting progress and
    # resuming an interrupted transfer when a "<output_path>.partial"
    # file from a previous run exists (via an HTTP Range request).
    def download(output_path)
      output_path.parent.mkpath

      headers = {"User-Agent" => "Red Datasets/#{VERSION}"}
      start = nil
      partial_output_path = Pathname.new("#{output_path}.partial")
      if partial_output_path.exist?
        # Resume: request only the bytes we do not have yet.
        start = partial_output_path.size
        headers["Range"] = "bytes=#{start}-"
      end

      Net::HTTP.start(@url.hostname,
                      @url.port,
                      :use_ssl => (@url.scheme == "https")) do |http|
        path = @url.path
        path += "?#{@url.query}" if @url.query
        request = Net::HTTP::Get.new(path, headers)
        http.request(request) do |response|
          case response
          when Net::HTTPPartialContent
            # Server honored the Range request: append to the partial file.
            mode = "ab"
          when Net::HTTPSuccess
            # Full body (e.g. server ignored Range): restart from scratch.
            start = nil
            mode = "wb"
          else
            # NOTE(review): redirects and HTTP errors are silently ignored
            # here; when no partial file exists the FileUtils.mv below then
            # raises Errno::ENOENT — confirm whether an explicit error with
            # the response status would be preferable.
            break
          end

          base_name = @url.path.split("/").last
          size_current = 0
          size_max = response.content_length
          if start
            # Account for the already-downloaded prefix in the totals.
            size_current += start
            size_max += start
          end
          # NOTE(review): content_length is nil for chunked responses,
          # which would make size_max nil (and `size_max += start` raise) —
          # confirm the servers used here always send Content-Length.
          progress_reporter = ProgressReporter.new(base_name, size_max)
          partial_output_path.open(mode) do |output|
            response.read_body do |chunk|
              size_current += chunk.bytesize
              progress_reporter.report(size_current)
              output.write(chunk)
            end
          end
        end
      end
      FileUtils.mv(partial_output_path, output_path)
    end
70
 
65
- private
66
- def copy_stream(input, partial_output_path)
67
- if partial_output_path.exist?
68
- # TODO: It's better that we use "206 Partial Content" response
69
- # to detect partial response.
70
- partial_head = partial_output_path.open("rb") do |partial_output|
71
- partial_output.read(256)
72
- end
73
- input_head = input.read(partial_head.bytesize)
74
- input.rewind
75
- if partial_head == input_head
76
- mode = "wb"
77
- else
78
- mode = "ab"
79
- end
80
- else
81
- mode = "wb"
82
- end
83
- partial_output_path.open(mode) do |partial_output|
84
- IO.copy_stream(input, partial_output)
85
- end
86
- end
87
-
88
- module CurrentBufferReadable
89
- attr_reader :current_buffer
90
- def buffer_open(buffer, proxy, options)
91
- @current_buffer = buffer
92
- super
93
- end
94
- end
95
-
96
71
  class ProgressReporter
97
72
  def initialize(base_name, size_max)
98
73
  @base_name = base_name
@@ -0,0 +1,320 @@
1
+ # frozen_string_literal: true
2
+ require 'digest/md5'
3
+ require 'net/http'
4
+ require 'uri'
5
+ require 'json'
6
+
7
+ module Datasets
8
+ module EStatJapan
9
+ Record = Struct.new(:id, :name, :values)
10
+ # configuration injection
11
+ module Configurable
12
+ attr_accessor :app_id
13
+
14
+ #
15
+ # configuration for e-Stat API
16
+ # See detail at https://www.e-stat.go.jp/api/api-dev/how_to_use (Japanese only).
17
+ # @example
18
+ # Datasets::EStatJapan.configure do |config|
19
+ # # put your App ID for e-Stat app_id
20
+ # config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
21
+ # end
22
+ #
23
+ def configure
24
+ yield self
25
+ end
26
+ end
27
+
28
+ extend Configurable
29
+
30
    # Wrapper class for the e-Stat API service (`getStatsData` endpoint).
    # Responses are cached on disk; parsing/indexing happens lazily on
    # first access.
    class StatsData < Dataset
      attr_accessor :app_id, :id

      #
      # generate accessor instance for e-Stat API's endpoint `getStatsData`.
      # for detail spec : https://www.e-stat.go.jp/api/api-info/e-stat-manual
      # @param [String] id Statistical data id
      # @param [String] app_id Application ID (falls back to `EStatJapan.app_id`, then env var `ESTATJAPAN_APP_ID`)
      # @param [Array<String>] areas Target areas (fetch all if omitted)
      # @param [Array<String>] categories Category IDs (fetch all if omitted)
      # @param [Array<String>] times Time axes (fetch all if omitted)
      # @param [Array<Number>] skip_levels Skip levels for parsing (defaults to `[1]`)
      # @param [String] hierarchy_selection Select target from 'child', 'parent', or 'both'. (Example: 札幌市○○区 -> 'child':札幌市○○区 only; 'parent':札幌市 only; 'both': Both selected) (defaults to `both`)
      # @param [Boolean] skip_nil_column Skip column if contains nil
      # @param [Boolean] skip_nil_row Skip row if contains nil
      # @param [Range] time_range Applied to the indexed time tables to keep only a sub-range
      # @raise [ArgumentError] when no application ID can be resolved
      # @example
      #   stats_data = Datasets::EStatJapan::StatsData.new(
      #     "0000020201", # A Population and household (key name: A 人口・世帯)
      #     categories: ["A1101"], # Population (key name: A1101_人口総数)
      #     areas: ["01105", "01106"], # Toyohira-ku Sapporo-shi Hokkaido, Minami-ku Sapporo-shi Hokkaido
      #     times: ["1981100000", "1982100000"],
      #     hierarchy_selection: 'child',
      #     skip_child_area: true,
      #     skip_nil_column: true,
      #     skip_nil_row: false,
      #   )
      #
      def initialize(id,
                     app_id: nil,
                     areas: nil, categories: nil, times: nil,
                     skip_levels: [1],
                     hierarchy_selection: 'child',
                     skip_nil_column: true,
                     skip_nil_row: false,
                     time_range: nil)
        @app_id = app_id || fetch_app_id
        if @app_id.nil? || @app_id.empty?
          raise ArgumentError, 'Please set app_id via `Datasets::EStatJapan.configure` method, environment var `ESTATJAPAN_APP_ID` or keyword argument `:app_id`'
        end

        super()

        @api_version = '3.0'
        @base_url = "https://api.e-stat.go.jp/rest/#{@api_version}/app/json/getStatsData"
        @metadata.id = "e-stat-japan-#{@api_version}"
        @metadata.name = "e-Stat API #{@api_version}"
        @metadata.url = @base_url
        @metadata.description = "e-Stat API #{@api_version}"

        @id = id
        @areas = areas
        @categories = categories
        @times = times
        @skip_levels = skip_levels
        # hierarchy_selection decides which areas of the parent/child
        # hierarchy are kept after indexing.
        case hierarchy_selection
        when 'child' then
          @skip_child_area = false
          @skip_parent_area = true
        when 'parent' then
          @skip_child_area = true
          @skip_parent_area = false
        else # 'both'
          @skip_child_area = false
          @skip_parent_area = false
        end
        @skip_nil_column = skip_nil_column
        @skip_nil_row = skip_nil_row
        @time_range = time_range

        # The cache file name is derived from the full request URL so
        # different query parameters never share a cache entry.
        @url = generate_url
        option_hash = Digest::MD5.hexdigest(@url.to_s)
        base_name = "e-stat-japan-#{option_hash}.json"
        @data_path = cache_dir_path + base_name
        @loaded = false
      end

      #
      # fetch data records from Remote API
      # @example
      #   indices = []
      #   rows = []
      #   map_id_name = {}
      #   estat.each do |record|
      #     # Select Hokkaido prefecture only
      #     next unless record.id.to_s.start_with? '01'
      #     indices << record.id
      #     rows << record.values
      #     map_id_name[record.id] = record.name
      #   end
      #
      def each
        return to_enum(__method__) unless block_given?

        load_data

        # create rows: one Record per area, with values ordered by
        # (time table, column), skipping entries flagged with :skip.
        @areas.each do |a_key, a_value|
          rows = []
          @time_tables.reject { |_key, x| x[:skip] }.each do |st_key, _st_value|
            row = @columns.reject { |_key, x| x[:skip] }.map do |c_key, _c_value|
              @indexed_data.dig(st_key, a_key, c_key)
            end
            rows << row
          end
          next if @skip_nil_row && rows.flatten.count(nil).positive?

          yield Record.new(a_key, a_value['@name'], rows.flatten)
        end
      end

      # Indexed area definitions (loads data on first call).
      def areas
        load_data
        @areas
      end

      # Indexed time-table definitions (loads data on first call).
      def time_tables
        load_data
        @time_tables
      end

      # Indexed column (category) definitions (loads data on first call).
      def columns
        load_data
        @columns
      end

      # Column header names, "<time name>_<column name>" per kept column.
      def schema
        load_data
        @schema
      end

      private

      # Builds the getStatsData request URL from the configured filters.
      def generate_url
        # generates url for query
        params = {
          appId: @app_id, lang: 'J',
          statsDataId: @id,
          metaGetFlg: 'Y', cntGetFlg: 'N',
          sectionHeaderFlg: '1'
        }
        params['cdArea'] = @areas.join(',') if @areas.instance_of?(Array)
        params['cdCat01'] = @categories.join(',') if @categories.instance_of?(Array)
        params['cdTime'] = @times.join(',') if @times.instance_of?(Array)

        URI.parse("#{@base_url}?#{URI.encode_www_form(params)}")
      end

      # Selects the CLASS_OBJ entries whose @id matches (e.g. 'time',
      # 'cat01', 'area') from the parsed API response.
      def extract_def(data, id)
        rec = data.dig('GET_STATS_DATA',
                       'STATISTICAL_DATA',
                       'CLASS_INF',
                       'CLASS_OBJ')
        rec.select { |x| x['@id'] == id }
      end

      # Turns a definition's CLASS list into a Hash keyed by @code.
      def index_def(data_def)
        unless data_def.first['CLASS'].instance_of?(Array)
          # convert to array when number of element is 1
          data_def.first['CLASS'] = [data_def.first['CLASS']]
        end
        Hash[*data_def.first['CLASS'].map { |x| [x['@code'], x] }.flatten]
      end

      # Extracts the raw VALUE rows from the parsed API response.
      def get_values(data)
        data.dig('GET_STATS_DATA',
                 'STATISTICAL_DATA',
                 'DATA_INF',
                 'VALUE')
      end

      # Application ID fallback chain: module configuration, then env var.
      def fetch_app_id
        EStatJapan.app_id || ENV['ESTATJAPAN_APP_ID']
      end

      # Fetches and indexes the data once; subsequent calls are no-ops.
      def load_data
        return if @loaded

        fetch_data
        index_data
      end

      def fetch_data
        # MEMO:
        # The e-stat api always returns 200 (Ok)
        # even if an error happens, despite its error mapping.
        # So we can't avoid caching the retrieved response from the api.
        # ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
        download(@data_path, @url.to_s) unless @data_path.exist?
      end

      # Parses the cached JSON response and builds the in-memory indexes
      # (@time_tables, @columns, @areas, @indexed_data, @schema).
      def index_data
        # parse json
        raw_data = File.open(@data_path) do |io|
          JSON.parse(io.read)
        end

        # check status
        api_status = raw_data.dig('GET_STATS_DATA', 'RESULT', 'STATUS')
        if api_status != 0
          # remove error response cache manually
          FileUtils.rm(@data_path)
          error_msg = raw_data.dig('GET_STATS_DATA', 'RESULT', 'ERROR_MSG')
          raise APIError, "code #{api_status} : #{error_msg}"
        end

        # index data
        ## table_def = extract_def(raw_data, "tab")
        timetable_def = extract_def(raw_data, 'time')
        column_def = extract_def(raw_data, 'cat01')
        area_def = extract_def(raw_data, 'area')

        @time_tables = index_def(timetable_def)
        @columns = index_def(column_def)
        @areas = index_def(area_def)

        ## apply time_range to time_tables
        @time_tables.select! { |k, _v| @time_tables.keys[@time_range].include? k } if @time_range.instance_of?(Range)

        # Nested lookup: time code -> area code -> category code -> Float value.
        @indexed_data = Hash[*@time_tables.keys.map { |x| [x, {}] }.flatten]
        get_values(raw_data).each do |row|
          next unless @time_tables.key?(row['@time'])

          data = @indexed_data.dig(row['@time'], row['@area']) || {}
          new_data = data.merge(row['@cat01'] => row['$'].to_f)
          @indexed_data[row['@time']][row['@area']] = new_data
        end

        skip_areas
        skip_nil_column
        @schema = create_header
        @loaded = true
      end

      # Removes areas according to @skip_levels and the parent/child
      # hierarchy selection chosen at construction time.
      def skip_areas
        # skip levels
        @areas.reject! { |_key, x| @skip_levels.include? x['@level'].to_i }

        # skip area that has children
        if @skip_parent_area
          # inspect hierarchy of areas
          @areas.each do |_a_key, a_value|
            next unless @areas.key? a_value['@parentCode']

            @areas[a_value['@parentCode']][:has_children] = true
          end
          # filter areas without children
          @areas.reject! { |_key, x| x[:has_children] }
        end

        # skip child area
        @areas.reject! { |_a_key, a_value| (@areas.key? a_value['@parentCode']) } if @skip_child_area
      end

      # Flags time tables and columns that contain nil for any kept area
      # with :skip so they are excluded from rows and the schema.
      def skip_nil_column
        return unless @skip_nil_column

        # filter time_tables and columns
        @areas.each do |a_key, _a_value|
          @time_tables.each do |st_key, st_value|
            unless @indexed_data[st_key].key?(a_key)
              st_value[:skip] = true
              next
            end
            @columns.each do |c_key, c_value|
              unless @indexed_data.dig(st_key, a_key).key?(c_key)
                c_value[:skip] = true
                next
              end
            end
          end
        end
      end

      # Builds the column header names for every kept
      # (time table, column) pair, matching the order used by #each.
      def create_header
        schema = []
        @time_tables.reject { |_key, x| x[:skip] }.each do |_st_key, st_value|
          @columns.reject { |_key, x| x[:skip] }.each do |_c_key, c_value|
            schema << "#{st_value['@name']}_#{c_value['@name']}"
          end
        end
        schema
      end
    end
313
+
314
    # Raised when required configuration (such as the application ID) is
    # missing or invalid. NOTE: within the EStatJapan namespace this
    # constant shadows Ruby's built-in ArgumentError, so a bare
    # `raise ArgumentError, ...` inside this module raises this class.
    class ArgumentError < Error
    end

    # Raised when the e-Stat API response reports a non-zero STATUS.
    class APIError < Error
    end
319
+ end
320
+ end