red-datasets 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +10 -0
  4. data/doc/text/news.md +29 -0
  5. data/lib/datasets/california-housing.rb +1 -1
  6. data/lib/datasets/dataset.rb +2 -2
  7. data/lib/datasets/downloader.rb +34 -16
  8. data/lib/datasets/fashion-mnist.rb +6 -2
  9. data/lib/datasets/ggplot2-dataset.rb +3 -3
  10. data/lib/datasets/house-of-councillor.rb +169 -0
  11. data/lib/datasets/house-of-representative.rb +107 -0
  12. data/lib/datasets/japanese-date-parser.rb +38 -0
  13. data/lib/datasets/kuzushiji-mnist.rb +6 -2
  14. data/lib/datasets/lazy.rb +2 -0
  15. data/lib/datasets/libsvm-dataset-list.rb +1 -1
  16. data/lib/datasets/mnist.rb +12 -6
  17. data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
  18. data/lib/datasets/postal-code-japan.rb +3 -3
  19. data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
  20. data/lib/datasets/version.rb +1 -1
  21. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
  22. data/lib/datasets/wikipedia.rb +2 -2
  23. data/test/japanese-date-parser-test.rb +27 -0
  24. data/test/test-adult.rb +36 -86
  25. data/test/test-aozora-bunko.rb +5 -5
  26. data/test/test-california-housing.rb +12 -31
  27. data/test/test-cldr-plurals.rb +1 -1
  28. data/test/test-diamonds.rb +13 -33
  29. data/test/test-downloader.rb +1 -1
  30. data/test/test-geolonia.rb +17 -41
  31. data/test/test-house-of-councillor.rb +223 -0
  32. data/test/test-house-of-representative.rb +54 -0
  33. data/test/test-nagoya-university-conversation-corpus.rb +17 -69
  34. data/test/test-postal-code-japan.rb +7 -0
  35. data/test/test-quora-duplicate-question-pair.rb +7 -21
  36. data/test/test-rdataset.rb +24 -22
  37. data/test/test-sudachi-synonym-dictionary.rb +12 -31
  38. data/test/test-wikipedia.rb +5 -5
  39. metadata +12 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0231a4f9da16ad1b2cb562a360468b3450fec684e2bcf0ca500195499e8f7397
4
- data.tar.gz: c2938b6d72fea58413a743ccad111fc8c95699d9b417e64941ca1681128a1706
3
+ metadata.gz: deff1c4d13294030c25c06691e272881eb049274decd9e2a958d0486c792ef26
4
+ data.tar.gz: e48151e045fbf343291a5f5770409dea7672ff39c75bcc617152b52a39890f31
5
5
  SHA512:
6
- metadata.gz: 2ea402beb78be117e28ca906490526a28a8d1e1430181a89432953d26eb0b0a8ea70d0c582d438c5e6b668b3a48eb60db336ef8174fa4dd1a52e20c19f5b4d9b
7
- data.tar.gz: 38b59c46e875ae61ab0794f2f9f474969b7e4e08e06078ca55ce4d1930d9538647e6b0152c1c48e8f0d45318ed2f71801dc99152d0799e06459937fb3d29978d
6
+ metadata.gz: 7ae5a39a716bfa719f937c6ff139bd90ccf625e8a7ce72298507506f1a2c6a100e81c1273bf395a40d0ae8f1a13c1b1fb554e123e49dff718607a734ffd6c67b
7
+ data.tar.gz: 72e664ba5f2a569a92d9d4237588f92f7398621642486e4c9e6c930bade4c17e4a68d30da14cb277fcd602025f959c42284462bf4370872f9826c9634ff78aca
data/README.md CHANGED
@@ -29,6 +29,8 @@ You can use datasets easily because you can access each dataset with multiple wa
29
29
  * Fuel Economy Dataset
30
30
  * Geolonia Japanese Addresses
31
31
  * Hepatitis
32
+ * House of Councillors of Japan
33
+ * House of Representatives of Japan
32
34
  * Iris Dataset
33
35
  * Libsvm
34
36
  * MNIST database
data/Rakefile CHANGED
@@ -13,6 +13,16 @@ end
13
13
  helper.install
14
14
  spec = helper.gemspec
15
15
 
16
+ release_task = Rake.application["release"]
17
+ # We use Trusted Publishing.
18
+ release_task.prerequisites.delete("build")
19
+ release_task.prerequisites.delete("release:rubygem_push")
20
+ release_task_comment = release_task.comment
21
+ if release_task_comment
22
+ release_task.clear_comments
23
+ release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
24
+ end
25
+
16
26
  task default: :test
17
27
 
18
28
  desc "Run tests"
data/doc/text/news.md CHANGED
@@ -1,5 +1,34 @@
1
1
  # News
2
2
 
3
+ ## 0.1.8 - 2025-02-07
4
+
5
+ ### Improvements
6
+
7
+ * Suppressed "literal string will be frozen" warnings.
8
+
9
+ * Patch by Tsutomu Katsube
10
+
11
+ * `Datasets::HouseOfCouncillor`: Added.
12
+
13
+ * [GH-143](https://github.com/red-data-tools/red-datasets/issues/143)
14
+
15
+ * [GH-181](https://github.com/red-data-tools/red-datasets/issues/181)
16
+
17
+ * Patch by Tsutomu Katsube
18
+
19
+ * `Datasets::HouseOfRepresentative`: Added.
20
+
21
+ * [GH-142](https://github.com/red-data-tools/red-datasets/issues/142)
22
+
23
+ * [GH-184](https://github.com/red-data-tools/red-datasets/issues/184)
24
+
25
+ * Patch by Tsutomu Katsube
26
+
27
+ ### Thanks
28
+
29
+ * Tsutomu Katsube
30
+
31
+
3
32
  ## 0.1.7 - 2023-05-29
4
33
 
5
34
  ### Improvements
@@ -36,7 +36,7 @@ Available from http://lib.stat.cmu.edu/datasets/.
36
36
  file_name = "cadata.txt"
37
37
  download(data_path, data_url)
38
38
  open_data(data_path, file_name) do |input|
39
- data = ""
39
+ data = +""
40
40
  input.each_line do |line|
41
41
  next unless line.start_with?(" ")
42
42
  data << line.lstrip.gsub(/ +/, ",")
@@ -33,8 +33,8 @@ module Datasets
33
33
  @cache_path ||= CachePath.new(@metadata.id)
34
34
  end
35
35
 
36
- def download(output_path, url, &block)
37
- downloader = Downloader.new(url)
36
+ def download(output_path, url, *fallback_urls, &block)
37
+ downloader = Downloader.new(url, *fallback_urls)
38
38
  downloader.download(output_path, &block)
39
39
  end
40
40
 
@@ -6,20 +6,15 @@ end
6
6
  require "net/http"
7
7
  require "pathname"
8
8
 
9
+ require_relative "error"
10
+
9
11
  module Datasets
10
12
  class Downloader
11
- class TooManyRedirects < StandardError; end
13
+ class TooManyRedirects < Error; end
12
14
 
13
- def initialize(url)
14
- if url.is_a?(URI::Generic)
15
- url = url.dup
16
- else
17
- url = URI.parse(url)
18
- end
19
- @url = url
20
- unless @url.is_a?(URI::HTTP)
21
- raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
22
- end
15
+ def initialize(url, *fallback_urls)
16
+ @url = normalize_url(url)
17
+ @fallback_urls = fallback_urls.collect { |fallback_url| normalize_url(fallback_url) }
23
18
  end
24
19
 
25
20
  def download(output_path, &block)
@@ -45,7 +40,7 @@ module Datasets
45
40
  headers["Range"] = "bytes=#{start}-"
46
41
  end
47
42
 
48
- start_http(@url, headers) do |response|
43
+ start_http(@url, @fallback_urls, headers) do |response|
49
44
  if response.is_a?(Net::HTTPPartialContent)
50
45
  mode = "ab"
51
46
  else
@@ -85,6 +80,18 @@ module Datasets
85
80
  end
86
81
  end
87
82
 
83
+ private def normalize_url(url)
84
+ if url.is_a?(URI::Generic)
85
+ url = url.dup
86
+ else
87
+ url = URI.parse(url)
88
+ end
89
+ unless url.is_a?(URI::HTTP)
90
+ raise ArgumentError, "download URL must be HTTP or HTTPS: <#{url}>"
91
+ end
92
+ url
93
+ end
94
+
88
95
  private def synchronize(output_path, partial_output_path)
89
96
  begin
90
97
  Process.getpgid(Process.pid)
@@ -104,7 +111,8 @@ module Datasets
104
111
  rescue ArgumentError
105
112
  # The process that acquired the lock will be exited before
106
113
  # it stores its process ID.
107
- valid_lock_path = (lock_path.mtime > 10)
114
+ elapsed_time = Time.now - lock_path.mtime
115
+ valid_lock_path = (elapsed_time > 10)
108
116
  else
109
117
  begin
110
118
  Process.getpgid(pid)
@@ -133,7 +141,7 @@ module Datasets
133
141
  end
134
142
  end
135
143
 
136
- private def start_http(url, headers, limit = 10, &block)
144
+ private def start_http(url, fallback_urls, headers, limit = 10, &block)
137
145
  if limit == 0
138
146
  raise TooManyRedirects, "too many redirections: #{url}"
139
147
  end
@@ -151,8 +159,18 @@ module Datasets
151
159
  when Net::HTTPRedirection
152
160
  url = URI.parse(response[:location])
153
161
  $stderr.puts "Redirect to #{url}"
154
- return start_http(url, headers, limit - 1, &block)
162
+ return start_http(url, fallback_urls, headers, limit - 1, &block)
155
163
  else
164
+ if response.is_a?(Net::HTTPForbidden)
165
+ next_url, *rest_fallback_urls = fallback_urls
166
+ if next_url
167
+ message = "#{response.code}: #{response.message}: " +
168
+ "fallback: <#{url}> -> <#{next_url}>"
169
+ $stderr.puts(message)
170
+ return start_http(next_url, rest_fallback_urls, headers, &block)
171
+ end
172
+ end
173
+
156
174
  message = response.code
157
175
  if response.message and not response.message.empty?
158
176
  message += ": #{response.message}"
@@ -167,7 +185,7 @@ module Datasets
167
185
  private def yield_chunks(path)
168
186
  path.open("rb") do |output|
169
187
  chunk_size = 1024 * 1024
170
- chunk = ""
188
+ chunk = +""
171
189
  while output.read(chunk_size, chunk)
172
190
  yield(chunk)
173
191
  end
@@ -2,9 +2,13 @@ require_relative 'mnist'
2
2
 
3
3
  module Datasets
4
4
  class FashionMNIST < MNIST
5
- BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
6
-
7
5
  private
6
+ def base_urls
7
+ [
8
+ "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
9
+ ]
10
+ end
11
+
8
12
  def dataset_name
9
13
  "Fashion-MNIST"
10
14
  end
@@ -17,7 +17,7 @@ module Datasets
17
17
  data_path = cache_dir_path + data_base_name
18
18
  data_url = "#{download_base_url}/data-raw/#{data_base_name}"
19
19
  download(data_path, data_url)
20
- CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
20
+ CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
21
21
  record_class = self.class::Record
22
22
  csv.each do |row|
23
23
  record = record_class.new(*row.fields)
@@ -37,7 +37,7 @@ module Datasets
37
37
  data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
38
38
  download(data_r_path, data_r_url)
39
39
  descriptions = {}
40
- comment = ""
40
+ comment = +""
41
41
  File.open(data_r_path) do |data_r|
42
42
  data_r.each_line do |line|
43
43
  case line.chomp
@@ -51,7 +51,7 @@ module Datasets
51
51
  when /\A"(.+)"\z/
52
52
  name = Regexp.last_match[1]
53
53
  descriptions[name] = parse_roxygen(comment.rstrip)
54
- comment = ""
54
+ comment = +""
55
55
  end
56
56
  end
57
57
  descriptions[@ggplot2_dataset_name]
@@ -0,0 +1,169 @@
1
+ require_relative "dataset"
2
+
3
+ module Datasets
4
+ class HouseOfCouncillor < Dataset
5
+ Bill = Struct.new(:council_time,
6
+ :bill_type,
7
+ :submit_time,
8
+ :submit_number,
9
+ :title,
10
+ :bill_url,
11
+ :bill_summary_url,
12
+ :proposed_bill_url,
13
+ :proposed_on,
14
+ :proposed_on_from_house_of_representatives,
15
+ :proposed_on_to_house_of_representatives,
16
+ :prior_deliberations_type,
17
+ :continuation_type,
18
+ :proposers,
19
+ :submitter,
20
+ :submitter_type,
21
+ :progress_of_house_of_councillors_committees_etc_refer_on,
22
+ :progress_of_house_of_councillors_committees_etc_committee_etc,
23
+ :progress_of_house_of_councillors_committees_etc_pass_on,
24
+ :progress_of_house_of_councillors_committees_etc_result,
25
+ :progress_of_house_of_councillors_plenary_sitting_pass_on,
26
+ :progress_of_house_of_councillors_plenary_sitting_result,
27
+ :progress_of_house_of_councillors_plenary_sitting_committees,
28
+ :progress_of_house_of_councillors_plenary_sitting_vote_type,
29
+ :progress_of_house_of_councillors_plenary_sitting_vote_method,
30
+ :progress_of_house_of_councillors_plenary_sitting_result_url,
31
+ :progress_of_house_of_representatives_committees_etc_refer_on,
32
+ :progress_of_house_of_representatives_committees_etc_committee_etc,
33
+ :progress_of_house_of_representatives_committees_etc_pass_on,
34
+ :progress_of_house_of_representatives_committees_etc_result,
35
+ :progress_of_house_of_representatives_plenary_sitting_pass_on,
36
+ :progress_of_house_of_representatives_plenary_sitting_result,
37
+ :progress_of_house_of_representatives_plenary_sitting_committees,
38
+ :progress_of_house_of_representatives_plenary_sitting_vote_type,
39
+ :progress_of_house_of_representatives_plenary_sitting_vote_method,
40
+ :promulgated_on,
41
+ :law_number,
42
+ :entracted_law_url,
43
+ :notes)
44
+
45
+ InHouseGroup = Struct.new(:in_house_group_name_and_abbreviation_on,
46
+ :in_house_group_name,
47
+ :in_house_group_abbreviation,
48
+ :number_of_members_on,
49
+ :number_of_members,
50
+ :number_of_women_members,
51
+ :first_term_expires_on,
52
+ :first_term_proportional_representation_number_of_members,
53
+ :first_term_proportional_representation_number_of_women_members,
54
+ :first_term_election_district_number_of_members,
55
+ :first_term_election_district_number_of_women_members,
56
+ :first_term_total_number_of_members,
57
+ :first_term_total_number_of_women_members,
58
+ :second_term_expires_on,
59
+ :second_term_proportional_representation_number_of_members,
60
+ :second_term_proportional_representation_number_of_women_members,
61
+ :second_term_election_district_number_of_members,
62
+ :second_term_election_district_number_of_women_members,
63
+ :second_term_total_number_of_members,
64
+ :second_term_total_number_of_women_members)
65
+
66
+ Member = Struct.new(:professional_name,
67
+ :true_name,
68
+ :profile_url,
69
+ :professional_name_reading,
70
+ :in_house_group_abbreviation,
71
+ :constituency,
72
+ :expiration_of_term,
73
+ :photo_url,
74
+ :elected_years,
75
+ :elected_number,
76
+ :responsibilities,
77
+ :responsibility_on,
78
+ :career,
79
+ :career_on)
80
+
81
+ Question = Struct.new(:submit_time,
82
+ :submit_number,
83
+ :title,
84
+ :submitter,
85
+ :number_of_submissions,
86
+ :question_for_text_html_url,
87
+ :answer_for_text_html_url,
88
+ :question_for_text_pdf_url,
89
+ :answer_for_text_pdf_url,
90
+ :question_url,
91
+ :submitted_on,
92
+ :transfered_on,
93
+ :received_answer_on,
94
+ :notes)
95
+
96
+ VALID_TYPES = [
97
+ :bill,
98
+ :in_house_group,
99
+ :member,
100
+ :question
101
+ ]
102
+
103
+ def initialize(type: :bill)
104
+ super()
105
+ @type = type
106
+ unless VALID_TYPES.include?(type)
107
+ message = +":type must be one of ["
108
+ message << VALID_TYPES.collect(&:inspect).join(", ")
109
+ message << "]: #{@type.inspect}"
110
+ raise ArgumentError, message
111
+ end
112
+
113
+ @metadata.id = "house-of-councillor"
114
+ @metadata.name = "Bill, in-House group, member and question of the House of Councillors of Japan"
115
+ @metadata.url = "https://smartnews-smri.github.io/house-of-councillors"
116
+ @metadata.licenses = ["MIT"]
117
+ @metadata.description = "The House of Councillors of Japan (type: #{@type})"
118
+ end
119
+
120
+ def each
121
+ return to_enum(__method__) unless block_given?
122
+
123
+ open_data do |csv|
124
+ csv.each do |row|
125
+ case @type
126
+ when :bill
127
+ record = Bill.new(*row.fields)
128
+ when :in_house_group
129
+ record = InHouseGroup.new(*row.fields)
130
+ when :member
131
+ %w(当選年).each do |ints_column_name|
132
+ row[ints_column_name] = parse_ints(row[ints_column_name])
133
+ end
134
+ record = Member.new(*row.fields)
135
+ when :question
136
+ record = Question.new(*row.fields)
137
+ end
138
+ yield(record)
139
+ end
140
+ end
141
+ end
142
+
143
+ private
144
+
145
+ def open_data
146
+ data_url = +"https://smartnews-smri.github.io/house-of-councillors/data"
147
+ case @type
148
+ when :bill
149
+ data_url << "/gian.csv"
150
+ when :in_house_group
151
+ data_url << "/kaiha.csv"
152
+ when :member
153
+ data_url << "/giin.csv"
154
+ when :question
155
+ data_url << "/syuisyo.csv"
156
+ end
157
+ data_path = cache_dir_path + "#{@type}.csv"
158
+ download(data_path, data_url)
159
+
160
+ CSV.open(data_path, col_sep: ",", headers: true, converters: %i(date integer)) do |csv|
161
+ yield(csv)
162
+ end
163
+ end
164
+
165
+ def parse_ints(column_value)
166
+ column_value.to_s.split("、").collect(&:to_i)
167
+ end
168
+ end
169
+ end
@@ -0,0 +1,107 @@
1
+ require_relative "dataset"
2
+ require_relative "japanese-date-parser"
3
+
4
+ module Datasets
5
+ class HouseOfRepresentative < Dataset
6
+ Record = Struct.new(:carry_time,
7
+ :caption,
8
+ :type,
9
+ :submit_time,
10
+ :submit_number,
11
+ :title,
12
+ :discussion_status,
13
+ :progress,
14
+ :progress_url,
15
+ :text,
16
+ :text_url,
17
+ :bill_type,
18
+ :submitter,
19
+ :submitter_in_house_groups,
20
+ :house_of_representatives_of_accepted_bill_on_preliminary_consideration,
21
+ :house_of_representatives_of_preliminary_refer_on,
22
+ :house_of_representatives_of_preliminary_refer_commission,
23
+ :house_of_representatives_of_accepted_bill_on,
24
+ :house_of_representatives_of_refer_on,
25
+ :house_of_representatives_of_refer_commission,
26
+ :house_of_representatives_of_finished_consideration_on,
27
+ :house_of_representatives_of_consideration_result,
28
+ :house_of_representatives_of_finished_deliberation_on,
29
+ :house_of_representatives_of_deliberation_result,
30
+ :house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
31
+ :house_of_representatives_of_support_in_house_group_during_deliberation,
32
+ :house_of_representatives_of_opposition_in_house_group_during_deliberation,
33
+ :house_of_councillors_of_accepted_bill_on_preliminary_consideration,
34
+ :house_of_councillors_of_preliminary_refer_on,
35
+ :house_of_councillors_of_preliminary_refer_commission,
36
+ :house_of_councillors_of_accepted_bill_on,
37
+ :house_of_councillors_of_refer_on,
38
+ :house_of_councillors_of_refer_commission,
39
+ :house_of_councillors_of_finished_consideration_on,
40
+ :house_of_councillors_of_consideration_result,
41
+ :house_of_councillors_of_finished_deliberation_on,
42
+ :house_of_councillors_of_deliberation_result,
43
+ :promulgated_on,
44
+ :law_number,
45
+ :submitters,
46
+ :supporters_of_submitted_bill)
47
+
48
+ def initialize
49
+ super()
50
+
51
+ @metadata.id = "house-of-representative"
52
+ @metadata.name = "Bill of the House of Representatives of Japan"
53
+ @metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
54
+ @metadata.licenses = ["MIT"]
55
+ @metadata.description = "Bill of the House of Representatives of Japan"
56
+ end
57
+
58
+ def each
59
+ return to_enum(__method__) unless block_given?
60
+
61
+ open_data do |csv|
62
+ csv.each do |row|
63
+ record = Record.new(*row.fields)
64
+ yield(record)
65
+ end
66
+ end
67
+ end
68
+
69
+ private
70
+
71
+ def open_data
72
+ data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
73
+ data_path = cache_dir_path + "gian.csv"
74
+ download(data_path, data_url)
75
+
76
+ parser = JapaneseDateParser.new
77
+ japanese_date_converter = lambda do |field, info|
78
+ if info.header.end_with?("年月日")
79
+ parser.parse(field)
80
+ else
81
+ field
82
+ end
83
+ end
84
+ array_converter = lambda do |field, info|
85
+ case info.header
86
+ when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
87
+ parse_array(field)
88
+ else
89
+ field
90
+ end
91
+ end
92
+ File.open(data_path) do |data_file|
93
+ options = {
94
+ col_sep: ",",
95
+ headers: true,
96
+ converters: [:integer, japanese_date_converter, array_converter],
97
+ }
98
+ # There are two columns within one column. To split into two columns, `#gsub` is necessary.
99
+ yield(CSV.new(data_file.read.gsub("/", ","), **options))
100
+ end
101
+ end
102
+
103
+ def parse_array(column_value)
104
+ column_value&.split("; ")
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,38 @@
1
+ module Datasets
2
+ class JapaneseDateParser
3
+ class UnsupportedEraInitialRange < Error; end
4
+
5
+ ERA_INITIALS = {
6
+ "平成" => "H",
7
+ "令和" => "R",
8
+ }.freeze
9
+
10
+ def parse(string)
11
+ case string
12
+ when nil
13
+ nil
14
+ when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
15
+ match_data = Regexp.last_match
16
+ era_initial = ERA_INITIALS[match_data[1]]
17
+ if era_initial.nil?
18
+ message = +"era must be one of ["
19
+ message << ERA_INITIALS.keys.join(", ")
20
+ message << "]: #{match_data[1]}"
21
+ raise UnsupportedEraInitialRange, message
22
+ end
23
+
24
+ year = match_data[2]
25
+ if year == "元"
26
+ year = "01"
27
+ else
28
+ year = year.rjust(2, "0")
29
+ end
30
+ month = match_data[3].rjust(2, "0")
31
+ day = match_data[4].rjust(2, "0")
32
+ Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
33
+ else
34
+ string
35
+ end
36
+ end
37
+ end
38
+ end
@@ -2,9 +2,13 @@ require_relative 'mnist'
2
2
 
3
3
  module Datasets
4
4
  class KuzushijiMNIST < MNIST
5
- BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
6
-
7
5
  private
6
+ def base_urls
7
+ [
8
+ "http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
9
+ ]
10
+ end
11
+
8
12
  def dataset_name
9
13
  "Kuzushiji-MNIST"
10
14
  end
data/lib/datasets/lazy.rb CHANGED
@@ -57,6 +57,8 @@ module Datasets
57
57
  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
58
58
  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
59
59
  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
60
+ LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
61
+ LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
60
62
  LAZY_LOADER.register(:Iris, "datasets/iris")
61
63
  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
62
64
  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
@@ -110,7 +110,7 @@ module Datasets
110
110
  @row = []
111
111
  when "td"
112
112
  @in_td = true
113
- @row << {:text => ""}
113
+ @row << {:text => +""}
114
114
  when "a"
115
115
  @row.last[:href] = attributes["href"] if @in_td
116
116
  end
@@ -4,8 +4,6 @@ require_relative "dataset"
4
4
 
5
5
  module Datasets
6
6
  class MNIST < Dataset
7
- BASE_URL = "http://yann.lecun.com/exdb/mnist/"
8
-
9
7
  class Record < Struct.new(:data, :label)
10
8
  def pixels
11
9
  data.unpack("C*")
@@ -27,7 +25,7 @@ module Datasets
27
25
 
28
26
  @metadata.id = "#{dataset_name.downcase}-#{type}"
29
27
  @metadata.name = "#{dataset_name}: #{type}"
30
- @metadata.url = self.class::BASE_URL
28
+ @metadata.url = base_urls.first
31
29
  @metadata.licenses = licenses
32
30
  @type = type
33
31
 
@@ -44,15 +42,23 @@ module Datasets
44
42
 
45
43
  image_path = cache_dir_path + target_file(:image)
46
44
  label_path = cache_dir_path + target_file(:label)
47
- base_url = self.class::BASE_URL
48
45
 
49
- download(image_path, base_url + target_file(:image))
50
- download(label_path, base_url + target_file(:label))
46
+ download(image_path,
47
+ *base_urls.collect { |base_url| base_url + target_file(:image) })
48
+ download(label_path,
49
+ *base_urls.collect { |base_url| base_url + target_file(:label) })
51
50
 
52
51
  open_data(image_path, label_path, &block)
53
52
  end
54
53
 
55
54
  private
55
+ def base_urls
56
+ [
57
+ "http://yann.lecun.com/exdb/mnist/",
58
+ "https://ossci-datasets.s3.amazonaws.com/mnist/",
59
+ ]
60
+ end
61
+
56
62
  def licenses
57
63
  []
58
64
  end
@@ -28,8 +28,8 @@ module Datasets
28
28
 
29
29
  def initialize
30
30
  super()
31
- @metadata.id = 'nagoya-university-conversation-curpus'
32
- @metadata.name = 'Nagoya University Conversation Curpus'
31
+ @metadata.id = 'nagoya-university-conversation-corpus'
32
+ @metadata.name = 'Nagoya University Conversation Corpus'
33
33
  @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
34
34
  @metadata.licenses = ['CC-BY-NC-ND-4.0']
35
35
  @metadata.description = <<~DESCRIPTION
@@ -41,7 +41,7 @@ module Datasets
41
41
  super()
42
42
  @reading = reading
43
43
  unless VALID_READINGS.include?(@reading)
44
- message = ":reading must be one of ["
44
+ message = +":reading must be one of ["
45
45
  message << VALID_READINGS.collect(&:inspect).join(", ")
46
46
  message << "]: #{@reading.inspect}"
47
47
  raise ArgumentError, message
@@ -104,14 +104,14 @@ module Datasets
104
104
 
105
105
  private
106
106
  def open_data
107
- data_url = "https://www.post.japanpost.jp/zipcode/dl"
107
+ data_url = +"https://www.post.japanpost.jp/zipcode/dl"
108
108
  case @reading
109
109
  when :lowercase
110
110
  data_url << "/kogaki/zip/ken_all.zip"
111
111
  when :uppercase
112
112
  data_url << "/oogaki/zip/ken_all.zip"
113
113
  when :romaji
114
- data_url << "/roman/ken_all_rome.zip"
114
+ data_url << "/roman/KEN_ALL_ROME.zip"
115
115
  end
116
116
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
117
117
  download(data_path, data_url)