red-datasets 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +10 -0
  4. data/doc/text/news.md +36 -0
  5. data/lib/datasets/california-housing.rb +1 -1
  6. data/lib/datasets/dataset.rb +2 -2
  7. data/lib/datasets/downloader.rb +51 -17
  8. data/lib/datasets/fashion-mnist.rb +6 -2
  9. data/lib/datasets/ggplot2-dataset.rb +3 -3
  10. data/lib/datasets/house-of-councillor.rb +169 -0
  11. data/lib/datasets/house-of-representative.rb +107 -0
  12. data/lib/datasets/japanese-date-parser.rb +38 -0
  13. data/lib/datasets/kuzushiji-mnist.rb +6 -2
  14. data/lib/datasets/lazy.rb +2 -0
  15. data/lib/datasets/libsvm-dataset-list.rb +1 -1
  16. data/lib/datasets/mnist.rb +12 -6
  17. data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
  18. data/lib/datasets/penguins.rb +28 -5
  19. data/lib/datasets/postal-code-japan.rb +3 -3
  20. data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
  21. data/lib/datasets/version.rb +1 -1
  22. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
  23. data/lib/datasets/wikipedia.rb +2 -2
  24. data/test/japanese-date-parser-test.rb +27 -0
  25. data/test/test-adult.rb +36 -86
  26. data/test/test-aozora-bunko.rb +5 -5
  27. data/test/test-california-housing.rb +12 -31
  28. data/test/test-cldr-plurals.rb +1 -1
  29. data/test/test-diamonds.rb +13 -33
  30. data/test/test-downloader.rb +1 -1
  31. data/test/test-geolonia.rb +17 -41
  32. data/test/test-house-of-councillor.rb +223 -0
  33. data/test/test-house-of-representative.rb +54 -0
  34. data/test/test-nagoya-university-conversation-corpus.rb +17 -69
  35. data/test/test-postal-code-japan.rb +7 -0
  36. data/test/test-quora-duplicate-question-pair.rb +7 -21
  37. data/test/test-rdataset.rb +24 -22
  38. data/test/test-sudachi-synonym-dictionary.rb +12 -31
  39. data/test/test-wikipedia.rb +5 -5
  40. metadata +12 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0231a4f9da16ad1b2cb562a360468b3450fec684e2bcf0ca500195499e8f7397
4
- data.tar.gz: c2938b6d72fea58413a743ccad111fc8c95699d9b417e64941ca1681128a1706
3
+ metadata.gz: 01ddaa57da3c64de47cfd9eb2ca9ae2ec3cbcb5d35138fdd74f74009f062358f
4
+ data.tar.gz: 431dba2c0e41bc25a4e2716ed20936ee2022c2b04c49683bb7d0d2e2aaa2f99e
5
5
  SHA512:
6
- metadata.gz: 2ea402beb78be117e28ca906490526a28a8d1e1430181a89432953d26eb0b0a8ea70d0c582d438c5e6b668b3a48eb60db336ef8174fa4dd1a52e20c19f5b4d9b
7
- data.tar.gz: 38b59c46e875ae61ab0794f2f9f474969b7e4e08e06078ca55ce4d1930d9538647e6b0152c1c48e8f0d45318ed2f71801dc99152d0799e06459937fb3d29978d
6
+ metadata.gz: ab12e9783e4a23b81f9bd1be22c31704f9095026cb27185a6fe985e106320982fb999e1cf2348f6b18b509a7f1a6a5b58d405ece5d541e8ddbe43cd08f252a80
7
+ data.tar.gz: 157df5fffd3ba8fd021cdef3933c0a9e99a0fa7f173771e2177d1a86e79788c3250b12453f06084e24afcf624ec3283e702e6bd7595f08e2cf2b5a7dd404065c
data/README.md CHANGED
@@ -29,6 +29,8 @@ You can use datasets easily because you can access each dataset with multiple wa
29
29
  * Fuel Economy Dataset
30
30
  * Geolonia Japanese Addresses
31
31
  * Hepatitis
32
+ * House of Councillors of Japan
33
+ * House of Representatives of Japan
32
34
  * Iris Dataset
33
35
  * Libsvm
34
36
  * MNIST database
data/Rakefile CHANGED
@@ -13,6 +13,16 @@ end
13
13
  helper.install
14
14
  spec = helper.gemspec
15
15
 
16
+ release_task = Rake.application["release"]
17
+ # We use Trusted Publishing.
18
+ release_task.prerequisites.delete("build")
19
+ release_task.prerequisites.delete("release:rubygem_push")
20
+ release_task_comment = release_task.comment
21
+ if release_task_comment
22
+ release_task.clear_comments
23
+ release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
24
+ end
25
+
16
26
  task default: :test
17
27
 
18
28
  desc "Run tests"
data/doc/text/news.md CHANGED
@@ -1,5 +1,41 @@
1
1
  # News
2
2
 
3
+ ## 0.1.9 - 2025-04-08
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::Penguins`: Changed to use `POST` for downloading data
8
+ from EDI.
9
+
10
+ ## 0.1.8 - 2025-02-07
11
+
12
+ ### Improvements
13
+
14
+ * Suppressed "literal string will be frozen" warnings.
15
+
16
+ * Patch by Tsutomu Katsube
17
+
18
+ * `Datasets::HouseOfCouncillor`: Added.
19
+
20
+ * [GH-143](https://github.com/red-data-tools/red-datasets/issues/143)
21
+
22
+ * [GH-181](https://github.com/red-data-tools/red-datasets/issues/181)
23
+
24
+ * Patch by Tsutomu Katsube
25
+
26
+ * `Datasets::HouseOfRepresentative`: Added.
27
+
28
+ * [GH-142](https://github.com/red-data-tools/red-datasets/issues/142)
29
+
30
+ * [GH-184](https://github.com/red-data-tools/red-datasets/issues/184)
31
+
32
+ * Patch by Tsutomu Katsube
33
+
34
+ ### Thanks
35
+
36
+ * Tsutomu Katsube
37
+
38
+
3
39
  ## 0.1.7 - 2023-05-29
4
40
 
5
41
  ### Improvements
@@ -36,7 +36,7 @@ Available from http://lib.stat.cmu.edu/datasets/.
36
36
  file_name = "cadata.txt"
37
37
  download(data_path, data_url)
38
38
  open_data(data_path, file_name) do |input|
39
- data = ""
39
+ data = +""
40
40
  input.each_line do |line|
41
41
  next unless line.start_with?(" ")
42
42
  data << line.lstrip.gsub(/ +/, ",")
@@ -33,8 +33,8 @@ module Datasets
33
33
  @cache_path ||= CachePath.new(@metadata.id)
34
34
  end
35
35
 
36
- def download(output_path, url, &block)
37
- downloader = Downloader.new(url)
36
+ def download(output_path, url, *fallback_urls, **options, &block)
37
+ downloader = Downloader.new(url, *fallback_urls, **options)
38
38
  downloader.download(output_path, &block)
39
39
  end
40
40
 
@@ -6,20 +6,17 @@ end
6
6
  require "net/http"
7
7
  require "pathname"
8
8
 
9
+ require_relative "error"
10
+
9
11
  module Datasets
10
12
  class Downloader
11
- class TooManyRedirects < StandardError; end
13
+ class TooManyRedirects < Error; end
12
14
 
13
- def initialize(url)
14
- if url.is_a?(URI::Generic)
15
- url = url.dup
16
- else
17
- url = URI.parse(url)
18
- end
19
- @url = url
20
- unless @url.is_a?(URI::HTTP)
21
- raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
22
- end
15
+ def initialize(url, *fallback_urls, http_method: nil, http_parameters: nil)
16
+ @url = normalize_url(url)
17
+ @fallback_urls = fallback_urls.collect { |fallback_url| normalize_url(fallback_url) }
18
+ @http_method = http_method
19
+ @http_parameters = http_parameters
23
20
  end
24
21
 
25
22
  def download(output_path, &block)
@@ -45,7 +42,7 @@ module Datasets
45
42
  headers["Range"] = "bytes=#{start}-"
46
43
  end
47
44
 
48
- start_http(@url, headers) do |response|
45
+ start_http(@url, @fallback_urls, headers) do |response|
49
46
  if response.is_a?(Net::HTTPPartialContent)
50
47
  mode = "ab"
51
48
  else
@@ -85,6 +82,18 @@ module Datasets
85
82
  end
86
83
  end
87
84
 
85
+ private def normalize_url(url)
86
+ if url.is_a?(URI::Generic)
87
+ url = url.dup
88
+ else
89
+ url = URI.parse(url)
90
+ end
91
+ unless url.is_a?(URI::HTTP)
92
+ raise ArgumentError, "download URL must be HTTP or HTTPS: <#{url}>"
93
+ end
94
+ url
95
+ end
96
+
88
97
  private def synchronize(output_path, partial_output_path)
89
98
  begin
90
99
  Process.getpgid(Process.pid)
@@ -104,7 +113,8 @@ module Datasets
104
113
  rescue ArgumentError
105
114
  # The process that acquired the lock may have exited before
106
115
  # it stored its process ID.
107
- valid_lock_path = (lock_path.mtime > 10)
116
+ elapsed_time = Time.now - lock_path.mtime
117
+ valid_lock_path = (elapsed_time > 10)
108
118
  else
109
119
  begin
110
120
  Process.getpgid(pid)
@@ -133,7 +143,7 @@ module Datasets
133
143
  end
134
144
  end
135
145
 
136
- private def start_http(url, headers, limit = 10, &block)
146
+ private def start_http(url, fallback_urls, headers, limit = 10, &block)
137
147
  if limit == 0
138
148
  raise TooManyRedirects, "too many redirections: #{url}"
139
149
  end
@@ -143,7 +153,21 @@ module Datasets
143
153
  http.start do
144
154
  path = url.path
145
155
  path += "?#{url.query}" if url.query
146
- request = Net::HTTP::Get.new(path, headers)
156
+ if @http_method == :post
157
+ # TODO: We may want to add @http_content_type, @http_body
158
+ # and so on.
159
+ if @http_parameters
160
+ body = URI.encode_www_form(@http_parameters)
161
+ content_type = "application/x-www-form-urlencoded"
162
+ headers = {"Content-Type" => content_type}.merge(headers)
163
+ else
164
+ body = ""
165
+ end
166
+ request = Net::HTTP::Post.new(path, headers)
167
+ request.body = body
168
+ else
169
+ request = Net::HTTP::Get.new(path, headers)
170
+ end
147
171
  http.request(request) do |response|
148
172
  case response
149
173
  when Net::HTTPSuccess, Net::HTTPPartialContent
@@ -151,8 +175,18 @@ module Datasets
151
175
  when Net::HTTPRedirection
152
176
  url = URI.parse(response[:location])
153
177
  $stderr.puts "Redirect to #{url}"
154
- return start_http(url, headers, limit - 1, &block)
178
+ return start_http(url, fallback_urls, headers, limit - 1, &block)
155
179
  else
180
+ if response.is_a?(Net::HTTPForbidden)
181
+ next_url, *rest_fallback_urls = fallback_urls
182
+ if next_url
183
+ message = "#{response.code}: #{response.message}: " +
184
+ "fallback: <#{url}> -> <#{next_url}>"
185
+ $stderr.puts(message)
186
+ return start_http(next_url, rest_fallback_urls, headers, &block)
187
+ end
188
+ end
189
+
156
190
  message = response.code
157
191
  if response.message and not response.message.empty?
158
192
  message += ": #{response.message}"
@@ -167,7 +201,7 @@ module Datasets
167
201
  private def yield_chunks(path)
168
202
  path.open("rb") do |output|
169
203
  chunk_size = 1024 * 1024
170
- chunk = ""
204
+ chunk = +""
171
205
  while output.read(chunk_size, chunk)
172
206
  yield(chunk)
173
207
  end
@@ -2,9 +2,13 @@ require_relative 'mnist'
2
2
 
3
3
  module Datasets
4
4
  class FashionMNIST < MNIST
5
- BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
6
-
7
5
  private
6
+ def base_urls
7
+ [
8
+ "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
9
+ ]
10
+ end
11
+
8
12
  def dataset_name
9
13
  "Fashion-MNIST"
10
14
  end
@@ -17,7 +17,7 @@ module Datasets
17
17
  data_path = cache_dir_path + data_base_name
18
18
  data_url = "#{download_base_url}/data-raw/#{data_base_name}"
19
19
  download(data_path, data_url)
20
- CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
20
+ CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
21
21
  record_class = self.class::Record
22
22
  csv.each do |row|
23
23
  record = record_class.new(*row.fields)
@@ -37,7 +37,7 @@ module Datasets
37
37
  data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
38
38
  download(data_r_path, data_r_url)
39
39
  descriptions = {}
40
- comment = ""
40
+ comment = +""
41
41
  File.open(data_r_path) do |data_r|
42
42
  data_r.each_line do |line|
43
43
  case line.chomp
@@ -51,7 +51,7 @@ module Datasets
51
51
  when /\A"(.+)"\z/
52
52
  name = Regexp.last_match[1]
53
53
  descriptions[name] = parse_roxygen(comment.rstrip)
54
- comment = ""
54
+ comment = +""
55
55
  end
56
56
  end
57
57
  descriptions[@ggplot2_dataset_name]
@@ -0,0 +1,169 @@
1
+ require_relative "dataset"
2
+
3
+ module Datasets
4
+ class HouseOfCouncillor < Dataset
5
+ Bill = Struct.new(:council_time,
6
+ :bill_type,
7
+ :submit_time,
8
+ :submit_number,
9
+ :title,
10
+ :bill_url,
11
+ :bill_summary_url,
12
+ :proposed_bill_url,
13
+ :proposed_on,
14
+ :proposed_on_from_house_of_representatives,
15
+ :proposed_on_to_house_of_representatives,
16
+ :prior_deliberations_type,
17
+ :continuation_type,
18
+ :proposers,
19
+ :submitter,
20
+ :submitter_type,
21
+ :progress_of_house_of_councillors_committees_etc_refer_on,
22
+ :progress_of_house_of_councillors_committees_etc_committee_etc,
23
+ :progress_of_house_of_councillors_committees_etc_pass_on,
24
+ :progress_of_house_of_councillors_committees_etc_result,
25
+ :progress_of_house_of_councillors_plenary_sitting_pass_on,
26
+ :progress_of_house_of_councillors_plenary_sitting_result,
27
+ :progress_of_house_of_councillors_plenary_sitting_committees,
28
+ :progress_of_house_of_councillors_plenary_sitting_vote_type,
29
+ :progress_of_house_of_councillors_plenary_sitting_vote_method,
30
+ :progress_of_house_of_councillors_plenary_sitting_result_url,
31
+ :progress_of_house_of_representatives_committees_etc_refer_on,
32
+ :progress_of_house_of_representatives_committees_etc_committee_etc,
33
+ :progress_of_house_of_representatives_committees_etc_pass_on,
34
+ :progress_of_house_of_representatives_committees_etc_result,
35
+ :progress_of_house_of_representatives_plenary_sitting_pass_on,
36
+ :progress_of_house_of_representatives_plenary_sitting_result,
37
+ :progress_of_house_of_representatives_plenary_sitting_committees,
38
+ :progress_of_house_of_representatives_plenary_sitting_vote_type,
39
+ :progress_of_house_of_representatives_plenary_sitting_vote_method,
40
+ :promulgated_on,
41
+ :law_number,
42
+ :entracted_law_url,
43
+ :notes)
44
+
45
+ InHouseGroup = Struct.new(:in_house_group_name_and_abbreviation_on,
46
+ :in_house_group_name,
47
+ :in_house_group_abbreviation,
48
+ :number_of_members_on,
49
+ :number_of_members,
50
+ :number_of_women_members,
51
+ :first_term_expires_on,
52
+ :first_term_proportional_representation_number_of_members,
53
+ :first_term_proportional_representation_number_of_women_members,
54
+ :first_term_election_district_number_of_members,
55
+ :first_term_election_district_number_of_women_members,
56
+ :first_term_total_number_of_members,
57
+ :first_term_total_number_of_women_members,
58
+ :second_term_expires_on,
59
+ :second_term_proportional_representation_number_of_members,
60
+ :second_term_proportional_representation_number_of_women_members,
61
+ :second_term_election_district_number_of_members,
62
+ :second_term_election_district_number_of_women_members,
63
+ :second_term_total_number_of_members,
64
+ :second_term_total_number_of_women_members)
65
+
66
+ Member = Struct.new(:professional_name,
67
+ :true_name,
68
+ :profile_url,
69
+ :professional_name_reading,
70
+ :in_house_group_abbreviation,
71
+ :constituency,
72
+ :expiration_of_term,
73
+ :photo_url,
74
+ :elected_years,
75
+ :elected_number,
76
+ :responsibilities,
77
+ :responsibility_on,
78
+ :career,
79
+ :career_on)
80
+
81
+ Question = Struct.new(:submit_time,
82
+ :submit_number,
83
+ :title,
84
+ :submitter,
85
+ :number_of_submissions,
86
+ :question_for_text_html_url,
87
+ :answer_for_text_html_url,
88
+ :question_for_text_pdf_url,
89
+ :answer_for_text_pdf_url,
90
+ :question_url,
91
+ :submitted_on,
92
+ :transfered_on,
93
+ :received_answer_on,
94
+ :notes)
95
+
96
+ VALID_TYPES = [
97
+ :bill,
98
+ :in_house_group,
99
+ :member,
100
+ :question
101
+ ]
102
+
103
+ def initialize(type: :bill)
104
+ super()
105
+ @type = type
106
+ unless VALID_TYPES.include?(type)
107
+ message = +":type must be one of ["
108
+ message << VALID_TYPES.collect(&:inspect).join(", ")
109
+ message << "]: #{@type.inspect}"
110
+ raise ArgumentError, message
111
+ end
112
+
113
+ @metadata.id = "house-of-councillor"
114
+ @metadata.name = "Bill, in-House group, member and question of the House of Councillors of Japan"
115
+ @metadata.url = "https://smartnews-smri.github.io/house-of-councillors"
116
+ @metadata.licenses = ["MIT"]
117
+ @metadata.description = "The House of Councillors of Japan (type: #{@type})"
118
+ end
119
+
120
+ def each
121
+ return to_enum(__method__) unless block_given?
122
+
123
+ open_data do |csv|
124
+ csv.each do |row|
125
+ case @type
126
+ when :bill
127
+ record = Bill.new(*row.fields)
128
+ when :in_house_group
129
+ record = InHouseGroup.new(*row.fields)
130
+ when :member
131
+ %w(当選年).each do |ints_column_name|
132
+ row[ints_column_name] = parse_ints(row[ints_column_name])
133
+ end
134
+ record = Member.new(*row.fields)
135
+ when :question
136
+ record = Question.new(*row.fields)
137
+ end
138
+ yield(record)
139
+ end
140
+ end
141
+ end
142
+
143
+ private
144
+
145
+ def open_data
146
+ data_url = +"https://smartnews-smri.github.io/house-of-councillors/data"
147
+ case @type
148
+ when :bill
149
+ data_url << "/gian.csv"
150
+ when :in_house_group
151
+ data_url << "/kaiha.csv"
152
+ when :member
153
+ data_url << "/giin.csv"
154
+ when :question
155
+ data_url << "/syuisyo.csv"
156
+ end
157
+ data_path = cache_dir_path + "#{@type}.csv"
158
+ download(data_path, data_url)
159
+
160
+ CSV.open(data_path, col_sep: ",", headers: true, converters: %i(date integer)) do |csv|
161
+ yield(csv)
162
+ end
163
+ end
164
+
165
+ def parse_ints(column_value)
166
+ column_value.to_s.split("、").collect(&:to_i)
167
+ end
168
+ end
169
+ end
@@ -0,0 +1,107 @@
1
+ require_relative "dataset"
2
+ require_relative "japanese-date-parser"
3
+
4
+ module Datasets
5
+ class HouseOfRepresentative < Dataset
6
+ Record = Struct.new(:carry_time,
7
+ :caption,
8
+ :type,
9
+ :submit_time,
10
+ :submit_number,
11
+ :title,
12
+ :discussion_status,
13
+ :progress,
14
+ :progress_url,
15
+ :text,
16
+ :text_url,
17
+ :bill_type,
18
+ :submitter,
19
+ :submitter_in_house_groups,
20
+ :house_of_representatives_of_accepted_bill_on_preliminary_consideration,
21
+ :house_of_representatives_of_preliminary_refer_on,
22
+ :house_of_representatives_of_preliminary_refer_commission,
23
+ :house_of_representatives_of_accepted_bill_on,
24
+ :house_of_representatives_of_refer_on,
25
+ :house_of_representatives_of_refer_commission,
26
+ :house_of_representatives_of_finished_consideration_on,
27
+ :house_of_representatives_of_consideration_result,
28
+ :house_of_representatives_of_finished_deliberation_on,
29
+ :house_of_representatives_of_deliberation_result,
30
+ :house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
31
+ :house_of_representatives_of_support_in_house_group_during_deliberation,
32
+ :house_of_representatives_of_opposition_in_house_group_during_deliberation,
33
+ :house_of_councillors_of_accepted_bill_on_preliminary_consideration,
34
+ :house_of_councillors_of_preliminary_refer_on,
35
+ :house_of_councillors_of_preliminary_refer_commission,
36
+ :house_of_councillors_of_accepted_bill_on,
37
+ :house_of_councillors_of_refer_on,
38
+ :house_of_councillors_of_refer_commission,
39
+ :house_of_councillors_of_finished_consideration_on,
40
+ :house_of_councillors_of_consideration_result,
41
+ :house_of_councillors_of_finished_deliberation_on,
42
+ :house_of_councillors_of_deliberation_result,
43
+ :promulgated_on,
44
+ :law_number,
45
+ :submitters,
46
+ :supporters_of_submitted_bill)
47
+
48
+ def initialize
49
+ super()
50
+
51
+ @metadata.id = "house-of-representative"
52
+ @metadata.name = "Bill of the House of Representatives of Japan"
53
+ @metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
54
+ @metadata.licenses = ["MIT"]
55
+ @metadata.description = "Bill of the House of Representatives of Japan"
56
+ end
57
+
58
+ def each
59
+ return to_enum(__method__) unless block_given?
60
+
61
+ open_data do |csv|
62
+ csv.each do |row|
63
+ record = Record.new(*row.fields)
64
+ yield(record)
65
+ end
66
+ end
67
+ end
68
+
69
+ private
70
+
71
+ def open_data
72
+ data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
73
+ data_path = cache_dir_path + "gian.csv"
74
+ download(data_path, data_url)
75
+
76
+ parser = JapaneseDateParser.new
77
+ japanese_date_converter = lambda do |field, info|
78
+ if info.header.end_with?("年月日")
79
+ parser.parse(field)
80
+ else
81
+ field
82
+ end
83
+ end
84
+ array_converter = lambda do |field, info|
85
+ case info.header
86
+ when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
87
+ parse_array(field)
88
+ else
89
+ field
90
+ end
91
+ end
92
+ File.open(data_path) do |data_file|
93
+ options = {
94
+ col_sep: ",",
95
+ headers: true,
96
+ converters: [:integer, japanese_date_converter, array_converter],
97
+ }
98
+ # Two columns are packed into one column, joined by "/". Replace "/" with "," so that CSV splits them into two columns.
99
+ yield(CSV.new(data_file.read.gsub("/", ","), **options))
100
+ end
101
+ end
102
+
103
+ def parse_array(column_value)
104
+ column_value&.split("; ")
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,38 @@
1
+ module Datasets
2
+ class JapaneseDateParser
3
+ class UnsupportedEraInitialRange < Error; end
4
+
5
+ ERA_INITIALS = {
6
+ "平成" => "H",
7
+ "令和" => "R",
8
+ }.freeze
9
+
10
+ def parse(string)
11
+ case string
12
+ when nil
13
+ nil
14
+ when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
15
+ match_data = Regexp.last_match
16
+ era_initial = ERA_INITIALS[match_data[1]]
17
+ if era_initial.nil?
18
+ message = +"era must be one of ["
19
+ message << ERA_INITIALS.keys.join(", ")
20
+ message << "]: #{match_data[1]}"
21
+ raise UnsupportedEraInitialRange, message
22
+ end
23
+
24
+ year = match_data[2]
25
+ if year == "元"
26
+ year = "01"
27
+ else
28
+ year = year.rjust(2, "0")
29
+ end
30
+ month = match_data[3].rjust(2, "0")
31
+ day = match_data[4].rjust(2, "0")
32
+ Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
33
+ else
34
+ string
35
+ end
36
+ end
37
+ end
38
+ end
@@ -2,9 +2,13 @@ require_relative 'mnist'
2
2
 
3
3
  module Datasets
4
4
  class KuzushijiMNIST < MNIST
5
- BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
6
-
7
5
  private
6
+ def base_urls
7
+ [
8
+ "http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
9
+ ]
10
+ end
11
+
8
12
  def dataset_name
9
13
  "Kuzushiji-MNIST"
10
14
  end
data/lib/datasets/lazy.rb CHANGED
@@ -57,6 +57,8 @@ module Datasets
57
57
  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
58
58
  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
59
59
  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
60
+ LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
61
+ LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
60
62
  LAZY_LOADER.register(:Iris, "datasets/iris")
61
63
  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
62
64
  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
@@ -110,7 +110,7 @@ module Datasets
110
110
  @row = []
111
111
  when "td"
112
112
  @in_td = true
113
- @row << {:text => ""}
113
+ @row << {:text => +""}
114
114
  when "a"
115
115
  @row.last[:href] = attributes["href"] if @in_td
116
116
  end
@@ -4,8 +4,6 @@ require_relative "dataset"
4
4
 
5
5
  module Datasets
6
6
  class MNIST < Dataset
7
- BASE_URL = "http://yann.lecun.com/exdb/mnist/"
8
-
9
7
  class Record < Struct.new(:data, :label)
10
8
  def pixels
11
9
  data.unpack("C*")
@@ -27,7 +25,7 @@ module Datasets
27
25
 
28
26
  @metadata.id = "#{dataset_name.downcase}-#{type}"
29
27
  @metadata.name = "#{dataset_name}: #{type}"
30
- @metadata.url = self.class::BASE_URL
28
+ @metadata.url = base_urls.first
31
29
  @metadata.licenses = licenses
32
30
  @type = type
33
31
 
@@ -44,15 +42,23 @@ module Datasets
44
42
 
45
43
  image_path = cache_dir_path + target_file(:image)
46
44
  label_path = cache_dir_path + target_file(:label)
47
- base_url = self.class::BASE_URL
48
45
 
49
- download(image_path, base_url + target_file(:image))
50
- download(label_path, base_url + target_file(:label))
46
+ download(image_path,
47
+ *base_urls.collect { |base_url| base_url + target_file(:image) })
48
+ download(label_path,
49
+ *base_urls.collect { |base_url| base_url + target_file(:label) })
51
50
 
52
51
  open_data(image_path, label_path, &block)
53
52
  end
54
53
 
55
54
  private
55
+ def base_urls
56
+ [
57
+ "http://yann.lecun.com/exdb/mnist/",
58
+ "https://ossci-datasets.s3.amazonaws.com/mnist/",
59
+ ]
60
+ end
61
+
56
62
  def licenses
57
63
  []
58
64
  end