red-datasets 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +10 -0
  4. data/doc/text/news.md +29 -0
  5. data/lib/datasets/california-housing.rb +1 -1
  6. data/lib/datasets/dataset.rb +2 -2
  7. data/lib/datasets/downloader.rb +34 -16
  8. data/lib/datasets/fashion-mnist.rb +6 -2
  9. data/lib/datasets/ggplot2-dataset.rb +3 -3
  10. data/lib/datasets/house-of-councillor.rb +169 -0
  11. data/lib/datasets/house-of-representative.rb +107 -0
  12. data/lib/datasets/japanese-date-parser.rb +38 -0
  13. data/lib/datasets/kuzushiji-mnist.rb +6 -2
  14. data/lib/datasets/lazy.rb +2 -0
  15. data/lib/datasets/libsvm-dataset-list.rb +1 -1
  16. data/lib/datasets/mnist.rb +12 -6
  17. data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
  18. data/lib/datasets/postal-code-japan.rb +3 -3
  19. data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
  20. data/lib/datasets/version.rb +1 -1
  21. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
  22. data/lib/datasets/wikipedia.rb +2 -2
  23. data/test/japanese-date-parser-test.rb +27 -0
  24. data/test/test-adult.rb +36 -86
  25. data/test/test-aozora-bunko.rb +5 -5
  26. data/test/test-california-housing.rb +12 -31
  27. data/test/test-cldr-plurals.rb +1 -1
  28. data/test/test-diamonds.rb +13 -33
  29. data/test/test-downloader.rb +1 -1
  30. data/test/test-geolonia.rb +17 -41
  31. data/test/test-house-of-councillor.rb +223 -0
  32. data/test/test-house-of-representative.rb +54 -0
  33. data/test/test-nagoya-university-conversation-corpus.rb +17 -69
  34. data/test/test-postal-code-japan.rb +7 -0
  35. data/test/test-quora-duplicate-question-pair.rb +7 -21
  36. data/test/test-rdataset.rb +24 -22
  37. data/test/test-sudachi-synonym-dictionary.rb +12 -31
  38. data/test/test-wikipedia.rb +5 -5
  39. metadata +12 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0231a4f9da16ad1b2cb562a360468b3450fec684e2bcf0ca500195499e8f7397
4
- data.tar.gz: c2938b6d72fea58413a743ccad111fc8c95699d9b417e64941ca1681128a1706
3
+ metadata.gz: deff1c4d13294030c25c06691e272881eb049274decd9e2a958d0486c792ef26
4
+ data.tar.gz: e48151e045fbf343291a5f5770409dea7672ff39c75bcc617152b52a39890f31
5
5
  SHA512:
6
- metadata.gz: 2ea402beb78be117e28ca906490526a28a8d1e1430181a89432953d26eb0b0a8ea70d0c582d438c5e6b668b3a48eb60db336ef8174fa4dd1a52e20c19f5b4d9b
7
- data.tar.gz: 38b59c46e875ae61ab0794f2f9f474969b7e4e08e06078ca55ce4d1930d9538647e6b0152c1c48e8f0d45318ed2f71801dc99152d0799e06459937fb3d29978d
6
+ metadata.gz: 7ae5a39a716bfa719f937c6ff139bd90ccf625e8a7ce72298507506f1a2c6a100e81c1273bf395a40d0ae8f1a13c1b1fb554e123e49dff718607a734ffd6c67b
7
+ data.tar.gz: 72e664ba5f2a569a92d9d4237588f92f7398621642486e4c9e6c930bade4c17e4a68d30da14cb277fcd602025f959c42284462bf4370872f9826c9634ff78aca
data/README.md CHANGED
@@ -29,6 +29,8 @@ You can use datasets easily because you can access each dataset with multiple wa
29
29
  * Fuel Economy Dataset
30
30
  * Geolonia Japanese Addresses
31
31
  * Hepatitis
32
+ * House of Councillors of Japan
33
+ * House of Representatives of Japan
32
34
  * Iris Dataset
33
35
  * Libsvm
34
36
  * MNIST database
data/Rakefile CHANGED
@@ -13,6 +13,16 @@ end
13
13
  helper.install
14
14
  spec = helper.gemspec
15
15
 
16
+ release_task = Rake.application["release"]
17
+ # We use Trusted Publishing.
18
+ release_task.prerequisites.delete("build")
19
+ release_task.prerequisites.delete("release:rubygem_push")
20
+ release_task_comment = release_task.comment
21
+ if release_task_comment
22
+ release_task.clear_comments
23
+ release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
24
+ end
25
+
16
26
  task default: :test
17
27
 
18
28
  desc "Run tests"
data/doc/text/news.md CHANGED
@@ -1,5 +1,34 @@
1
1
  # News
2
2
 
3
+ ## 0.1.8 - 2025-02-07
4
+
5
+ ### Improvements
6
+
7
+ * Suppressed "literal string will be frozen" warnings.
8
+
9
+ * Patch by Tsutomu Katsube
10
+
11
+ * `Datasets::HouseOfCouncillor`: Added.
12
+
13
+ * [GH-143](https://github.com/red-data-tools/red-datasets/issues/143)
14
+
15
+ * [GH-181](https://github.com/red-data-tools/red-datasets/issues/181)
16
+
17
+ * Patch by Tsutomu Katsube
18
+
19
+ * `Datasets::HouseOfRepresentative`: Added.
20
+
21
+ * [GH-142](https://github.com/red-data-tools/red-datasets/issues/142)
22
+
23
+ * [GH-184](https://github.com/red-data-tools/red-datasets/issues/184)
24
+
25
+ * Patch by Tsutomu Katsube
26
+
27
+ ### Thanks
28
+
29
+ * Tsutomu Katsube
30
+
31
+
3
32
  ## 0.1.7 - 2023-05-29
4
33
 
5
34
  ### Improvements
@@ -36,7 +36,7 @@ Available from http://lib.stat.cmu.edu/datasets/.
36
36
  file_name = "cadata.txt"
37
37
  download(data_path, data_url)
38
38
  open_data(data_path, file_name) do |input|
39
- data = ""
39
+ data = +""
40
40
  input.each_line do |line|
41
41
  next unless line.start_with?(" ")
42
42
  data << line.lstrip.gsub(/ +/, ",")
@@ -33,8 +33,8 @@ module Datasets
33
33
  @cache_path ||= CachePath.new(@metadata.id)
34
34
  end
35
35
 
36
- def download(output_path, url, &block)
37
- downloader = Downloader.new(url)
36
+ def download(output_path, url, *fallback_urls, &block)
37
+ downloader = Downloader.new(url, *fallback_urls)
38
38
  downloader.download(output_path, &block)
39
39
  end
40
40
 
@@ -6,20 +6,15 @@ end
6
6
  require "net/http"
7
7
  require "pathname"
8
8
 
9
+ require_relative "error"
10
+
9
11
  module Datasets
10
12
  class Downloader
11
- class TooManyRedirects < StandardError; end
13
+ class TooManyRedirects < Error; end
12
14
 
13
- def initialize(url)
14
- if url.is_a?(URI::Generic)
15
- url = url.dup
16
- else
17
- url = URI.parse(url)
18
- end
19
- @url = url
20
- unless @url.is_a?(URI::HTTP)
21
- raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
22
- end
15
+ def initialize(url, *fallback_urls)
16
+ @url = normalize_url(url)
17
+ @fallback_urls = fallback_urls.collect { |fallback_url| normalize_url(fallback_url) }
23
18
  end
24
19
 
25
20
  def download(output_path, &block)
@@ -45,7 +40,7 @@ module Datasets
45
40
  headers["Range"] = "bytes=#{start}-"
46
41
  end
47
42
 
48
- start_http(@url, headers) do |response|
43
+ start_http(@url, @fallback_urls, headers) do |response|
49
44
  if response.is_a?(Net::HTTPPartialContent)
50
45
  mode = "ab"
51
46
  else
@@ -85,6 +80,18 @@ module Datasets
85
80
  end
86
81
  end
87
82
 
83
+ private def normalize_url(url)
84
+ if url.is_a?(URI::Generic)
85
+ url = url.dup
86
+ else
87
+ url = URI.parse(url)
88
+ end
89
+ unless url.is_a?(URI::HTTP)
90
+ raise ArgumentError, "download URL must be HTTP or HTTPS: <#{url}>"
91
+ end
92
+ url
93
+ end
94
+
88
95
  private def synchronize(output_path, partial_output_path)
89
96
  begin
90
97
  Process.getpgid(Process.pid)
@@ -104,7 +111,8 @@ module Datasets
104
111
  rescue ArgumentError
105
112
  # The process that acquired the lock will be exited before
106
113
  # it stores its process ID.
107
- valid_lock_path = (lock_path.mtime > 10)
114
+ elapsed_time = Time.now - lock_path.mtime
115
+ valid_lock_path = (elapsed_time > 10)
108
116
  else
109
117
  begin
110
118
  Process.getpgid(pid)
@@ -133,7 +141,7 @@ module Datasets
133
141
  end
134
142
  end
135
143
 
136
- private def start_http(url, headers, limit = 10, &block)
144
+ private def start_http(url, fallback_urls, headers, limit = 10, &block)
137
145
  if limit == 0
138
146
  raise TooManyRedirects, "too many redirections: #{url}"
139
147
  end
@@ -151,8 +159,18 @@ module Datasets
151
159
  when Net::HTTPRedirection
152
160
  url = URI.parse(response[:location])
153
161
  $stderr.puts "Redirect to #{url}"
154
- return start_http(url, headers, limit - 1, &block)
162
+ return start_http(url, fallback_urls, headers, limit - 1, &block)
155
163
  else
164
+ if response.is_a?(Net::HTTPForbidden)
165
+ next_url, *rest_fallback_urls = fallback_urls
166
+ if next_url
167
+ message = "#{response.code}: #{response.message}: " +
168
+ "fallback: <#{url}> -> <#{next_url}>"
169
+ $stderr.puts(message)
170
+ return start_http(next_url, rest_fallback_urls, headers, &block)
171
+ end
172
+ end
173
+
156
174
  message = response.code
157
175
  if response.message and not response.message.empty?
158
176
  message += ": #{response.message}"
@@ -167,7 +185,7 @@ module Datasets
167
185
  private def yield_chunks(path)
168
186
  path.open("rb") do |output|
169
187
  chunk_size = 1024 * 1024
170
- chunk = ""
188
+ chunk = +""
171
189
  while output.read(chunk_size, chunk)
172
190
  yield(chunk)
173
191
  end
@@ -2,9 +2,13 @@ require_relative 'mnist'
2
2
 
3
3
  module Datasets
4
4
  class FashionMNIST < MNIST
5
- BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
6
-
7
5
  private
6
+ def base_urls
7
+ [
8
+ "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
9
+ ]
10
+ end
11
+
8
12
  def dataset_name
9
13
  "Fashion-MNIST"
10
14
  end
@@ -17,7 +17,7 @@ module Datasets
17
17
  data_path = cache_dir_path + data_base_name
18
18
  data_url = "#{download_base_url}/data-raw/#{data_base_name}"
19
19
  download(data_path, data_url)
20
- CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
20
+ CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
21
21
  record_class = self.class::Record
22
22
  csv.each do |row|
23
23
  record = record_class.new(*row.fields)
@@ -37,7 +37,7 @@ module Datasets
37
37
  data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
38
38
  download(data_r_path, data_r_url)
39
39
  descriptions = {}
40
- comment = ""
40
+ comment = +""
41
41
  File.open(data_r_path) do |data_r|
42
42
  data_r.each_line do |line|
43
43
  case line.chomp
@@ -51,7 +51,7 @@ module Datasets
51
51
  when /\A"(.+)"\z/
52
52
  name = Regexp.last_match[1]
53
53
  descriptions[name] = parse_roxygen(comment.rstrip)
54
- comment = ""
54
+ comment = +""
55
55
  end
56
56
  end
57
57
  descriptions[@ggplot2_dataset_name]
@@ -0,0 +1,169 @@
1
+ require_relative "dataset"
2
+
3
+ module Datasets
4
+ class HouseOfCouncillor < Dataset
5
+ Bill = Struct.new(:council_time,
6
+ :bill_type,
7
+ :submit_time,
8
+ :submit_number,
9
+ :title,
10
+ :bill_url,
11
+ :bill_summary_url,
12
+ :proposed_bill_url,
13
+ :proposed_on,
14
+ :proposed_on_from_house_of_representatives,
15
+ :proposed_on_to_house_of_representatives,
16
+ :prior_deliberations_type,
17
+ :continuation_type,
18
+ :proposers,
19
+ :submitter,
20
+ :submitter_type,
21
+ :progress_of_house_of_councillors_committees_etc_refer_on,
22
+ :progress_of_house_of_councillors_committees_etc_committee_etc,
23
+ :progress_of_house_of_councillors_committees_etc_pass_on,
24
+ :progress_of_house_of_councillors_committees_etc_result,
25
+ :progress_of_house_of_councillors_plenary_sitting_pass_on,
26
+ :progress_of_house_of_councillors_plenary_sitting_result,
27
+ :progress_of_house_of_councillors_plenary_sitting_committees,
28
+ :progress_of_house_of_councillors_plenary_sitting_vote_type,
29
+ :progress_of_house_of_councillors_plenary_sitting_vote_method,
30
+ :progress_of_house_of_councillors_plenary_sitting_result_url,
31
+ :progress_of_house_of_representatives_committees_etc_refer_on,
32
+ :progress_of_house_of_representatives_committees_etc_committee_etc,
33
+ :progress_of_house_of_representatives_committees_etc_pass_on,
34
+ :progress_of_house_of_representatives_committees_etc_result,
35
+ :progress_of_house_of_representatives_plenary_sitting_pass_on,
36
+ :progress_of_house_of_representatives_plenary_sitting_result,
37
+ :progress_of_house_of_representatives_plenary_sitting_committees,
38
+ :progress_of_house_of_representatives_plenary_sitting_vote_type,
39
+ :progress_of_house_of_representatives_plenary_sitting_vote_method,
40
+ :promulgated_on,
41
+ :law_number,
42
+ :entracted_law_url,
43
+ :notes)
44
+
45
+ InHouseGroup = Struct.new(:in_house_group_name_and_abbreviation_on,
46
+ :in_house_group_name,
47
+ :in_house_group_abbreviation,
48
+ :number_of_members_on,
49
+ :number_of_members,
50
+ :number_of_women_members,
51
+ :first_term_expires_on,
52
+ :first_term_proportional_representation_number_of_members,
53
+ :first_term_proportional_representation_number_of_women_members,
54
+ :first_term_election_district_number_of_members,
55
+ :first_term_election_district_number_of_women_members,
56
+ :first_term_total_number_of_members,
57
+ :first_term_total_number_of_women_members,
58
+ :second_term_expires_on,
59
+ :second_term_proportional_representation_number_of_members,
60
+ :second_term_proportional_representation_number_of_women_members,
61
+ :second_term_election_district_number_of_members,
62
+ :second_term_election_district_number_of_women_members,
63
+ :second_term_total_number_of_members,
64
+ :second_term_total_number_of_women_members)
65
+
66
+ Member = Struct.new(:professional_name,
67
+ :true_name,
68
+ :profile_url,
69
+ :professional_name_reading,
70
+ :in_house_group_abbreviation,
71
+ :constituency,
72
+ :expiration_of_term,
73
+ :photo_url,
74
+ :elected_years,
75
+ :elected_number,
76
+ :responsibilities,
77
+ :responsibility_on,
78
+ :career,
79
+ :career_on)
80
+
81
+ Question = Struct.new(:submit_time,
82
+ :submit_number,
83
+ :title,
84
+ :submitter,
85
+ :number_of_submissions,
86
+ :question_for_text_html_url,
87
+ :answer_for_text_html_url,
88
+ :question_for_text_pdf_url,
89
+ :answer_for_text_pdf_url,
90
+ :question_url,
91
+ :submitted_on,
92
+ :transfered_on,
93
+ :received_answer_on,
94
+ :notes)
95
+
96
+ VALID_TYPES = [
97
+ :bill,
98
+ :in_house_group,
99
+ :member,
100
+ :question
101
+ ]
102
+
103
+ def initialize(type: :bill)
104
+ super()
105
+ @type = type
106
+ unless VALID_TYPES.include?(type)
107
+ message = +":type must be one of ["
108
+ message << VALID_TYPES.collect(&:inspect).join(", ")
109
+ message << "]: #{@type.inspect}"
110
+ raise ArgumentError, message
111
+ end
112
+
113
+ @metadata.id = "house-of-councillor"
114
+ @metadata.name = "Bill, in-House group, member and question of the House of Councillors of Japan"
115
+ @metadata.url = "https://smartnews-smri.github.io/house-of-councillors"
116
+ @metadata.licenses = ["MIT"]
117
+ @metadata.description = "The House of Councillors of Japan (type: #{@type})"
118
+ end
119
+
120
+ def each
121
+ return to_enum(__method__) unless block_given?
122
+
123
+ open_data do |csv|
124
+ csv.each do |row|
125
+ case @type
126
+ when :bill
127
+ record = Bill.new(*row.fields)
128
+ when :in_house_group
129
+ record = InHouseGroup.new(*row.fields)
130
+ when :member
131
+ %w(当選年).each do |ints_column_name|
132
+ row[ints_column_name] = parse_ints(row[ints_column_name])
133
+ end
134
+ record = Member.new(*row.fields)
135
+ when :question
136
+ record = Question.new(*row.fields)
137
+ end
138
+ yield(record)
139
+ end
140
+ end
141
+ end
142
+
143
+ private
144
+
145
+ def open_data
146
+ data_url = +"https://smartnews-smri.github.io/house-of-councillors/data"
147
+ case @type
148
+ when :bill
149
+ data_url << "/gian.csv"
150
+ when :in_house_group
151
+ data_url << "/kaiha.csv"
152
+ when :member
153
+ data_url << "/giin.csv"
154
+ when :question
155
+ data_url << "/syuisyo.csv"
156
+ end
157
+ data_path = cache_dir_path + "#{@type}.csv"
158
+ download(data_path, data_url)
159
+
160
+ CSV.open(data_path, col_sep: ",", headers: true, converters: %i(date integer)) do |csv|
161
+ yield(csv)
162
+ end
163
+ end
164
+
165
+ def parse_ints(column_value)
166
+ column_value.to_s.split("、").collect(&:to_i)
167
+ end
168
+ end
169
+ end
@@ -0,0 +1,107 @@
1
+ require_relative "dataset"
2
+ require_relative "japanese-date-parser"
3
+
4
+ module Datasets
5
+ class HouseOfRepresentative < Dataset
6
+ Record = Struct.new(:carry_time,
7
+ :caption,
8
+ :type,
9
+ :submit_time,
10
+ :submit_number,
11
+ :title,
12
+ :discussion_status,
13
+ :progress,
14
+ :progress_url,
15
+ :text,
16
+ :text_url,
17
+ :bill_type,
18
+ :submitter,
19
+ :submitter_in_house_groups,
20
+ :house_of_representatives_of_accepted_bill_on_preliminary_consideration,
21
+ :house_of_representatives_of_preliminary_refer_on,
22
+ :house_of_representatives_of_preliminary_refer_commission,
23
+ :house_of_representatives_of_accepted_bill_on,
24
+ :house_of_representatives_of_refer_on,
25
+ :house_of_representatives_of_refer_commission,
26
+ :house_of_representatives_of_finished_consideration_on,
27
+ :house_of_representatives_of_consideration_result,
28
+ :house_of_representatives_of_finished_deliberation_on,
29
+ :house_of_representatives_of_deliberation_result,
30
+ :house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
31
+ :house_of_representatives_of_support_in_house_group_during_deliberation,
32
+ :house_of_representatives_of_opposition_in_house_group_during_deliberation,
33
+ :house_of_councillors_of_accepted_bill_on_preliminary_consideration,
34
+ :house_of_councillors_of_preliminary_refer_on,
35
+ :house_of_councillors_of_preliminary_refer_commission,
36
+ :house_of_councillors_of_accepted_bill_on,
37
+ :house_of_councillors_of_refer_on,
38
+ :house_of_councillors_of_refer_commission,
39
+ :house_of_councillors_of_finished_consideration_on,
40
+ :house_of_councillors_of_consideration_result,
41
+ :house_of_councillors_of_finished_deliberation_on,
42
+ :house_of_councillors_of_deliberation_result,
43
+ :promulgated_on,
44
+ :law_number,
45
+ :submitters,
46
+ :supporters_of_submitted_bill)
47
+
48
+ def initialize
49
+ super()
50
+
51
+ @metadata.id = "house-of-representative"
52
+ @metadata.name = "Bill of the House of Representatives of Japan"
53
+ @metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
54
+ @metadata.licenses = ["MIT"]
55
+ @metadata.description = "Bill of the House of Representatives of Japan"
56
+ end
57
+
58
+ def each
59
+ return to_enum(__method__) unless block_given?
60
+
61
+ open_data do |csv|
62
+ csv.each do |row|
63
+ record = Record.new(*row.fields)
64
+ yield(record)
65
+ end
66
+ end
67
+ end
68
+
69
+ private
70
+
71
+ def open_data
72
+ data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
73
+ data_path = cache_dir_path + "gian.csv"
74
+ download(data_path, data_url)
75
+
76
+ parser = JapaneseDateParser.new
77
+ japanese_date_converter = lambda do |field, info|
78
+ if info.header.end_with?("年月日")
79
+ parser.parse(field)
80
+ else
81
+ field
82
+ end
83
+ end
84
+ array_converter = lambda do |field, info|
85
+ case info.header
86
+ when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
87
+ parse_array(field)
88
+ else
89
+ field
90
+ end
91
+ end
92
+ File.open(data_path) do |data_file|
93
+ options = {
94
+ col_sep: ",",
95
+ headers: true,
96
+ converters: [:integer, japanese_date_converter, array_converter],
97
+ }
98
+ # There are two columns within one column. To split into two columns, `#gsub` is necessary.
99
+ yield(CSV.new(data_file.read.gsub("/", ","), **options))
100
+ end
101
+ end
102
+
103
+ def parse_array(column_value)
104
+ column_value&.split("; ")
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,38 @@
1
+ module Datasets
2
+ class JapaneseDateParser
3
+ class UnsupportedEraInitialRange < Error; end
4
+
5
+ ERA_INITIALS = {
6
+ "平成" => "H",
7
+ "令和" => "R",
8
+ }.freeze
9
+
10
+ def parse(string)
11
+ case string
12
+ when nil
13
+ nil
14
+ when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
15
+ match_data = Regexp.last_match
16
+ era_initial = ERA_INITIALS[match_data[1]]
17
+ if era_initial.nil?
18
+ message = +"era must be one of ["
19
+ message << ERA_INITIALS.keys.join(", ")
20
+ message << "]: #{match_data[1]}"
21
+ raise UnsupportedEraInitialRange, message
22
+ end
23
+
24
+ year = match_data[2]
25
+ if year == "元"
26
+ year = "01"
27
+ else
28
+ year = year.rjust(2, "0")
29
+ end
30
+ month = match_data[3].rjust(2, "0")
31
+ day = match_data[4].rjust(2, "0")
32
+ Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
33
+ else
34
+ string
35
+ end
36
+ end
37
+ end
38
+ end
@@ -2,9 +2,13 @@ require_relative 'mnist'
2
2
 
3
3
  module Datasets
4
4
  class KuzushijiMNIST < MNIST
5
- BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
6
-
7
5
  private
6
+ def base_urls
7
+ [
8
+ "http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
9
+ ]
10
+ end
11
+
8
12
  def dataset_name
9
13
  "Kuzushiji-MNIST"
10
14
  end
data/lib/datasets/lazy.rb CHANGED
@@ -57,6 +57,8 @@ module Datasets
57
57
  LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
58
58
  LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
59
59
  LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
60
+ LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
61
+ LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
60
62
  LAZY_LOADER.register(:Iris, "datasets/iris")
61
63
  LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
62
64
  LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
@@ -110,7 +110,7 @@ module Datasets
110
110
  @row = []
111
111
  when "td"
112
112
  @in_td = true
113
- @row << {:text => ""}
113
+ @row << {:text => +""}
114
114
  when "a"
115
115
  @row.last[:href] = attributes["href"] if @in_td
116
116
  end
@@ -4,8 +4,6 @@ require_relative "dataset"
4
4
 
5
5
  module Datasets
6
6
  class MNIST < Dataset
7
- BASE_URL = "http://yann.lecun.com/exdb/mnist/"
8
-
9
7
  class Record < Struct.new(:data, :label)
10
8
  def pixels
11
9
  data.unpack("C*")
@@ -27,7 +25,7 @@ module Datasets
27
25
 
28
26
  @metadata.id = "#{dataset_name.downcase}-#{type}"
29
27
  @metadata.name = "#{dataset_name}: #{type}"
30
- @metadata.url = self.class::BASE_URL
28
+ @metadata.url = base_urls.first
31
29
  @metadata.licenses = licenses
32
30
  @type = type
33
31
 
@@ -44,15 +42,23 @@ module Datasets
44
42
 
45
43
  image_path = cache_dir_path + target_file(:image)
46
44
  label_path = cache_dir_path + target_file(:label)
47
- base_url = self.class::BASE_URL
48
45
 
49
- download(image_path, base_url + target_file(:image))
50
- download(label_path, base_url + target_file(:label))
46
+ download(image_path,
47
+ *base_urls.collect { |base_url| base_url + target_file(:image) })
48
+ download(label_path,
49
+ *base_urls.collect { |base_url| base_url + target_file(:label) })
51
50
 
52
51
  open_data(image_path, label_path, &block)
53
52
  end
54
53
 
55
54
  private
55
+ def base_urls
56
+ [
57
+ "http://yann.lecun.com/exdb/mnist/",
58
+ "https://ossci-datasets.s3.amazonaws.com/mnist/",
59
+ ]
60
+ end
61
+
56
62
  def licenses
57
63
  []
58
64
  end
@@ -28,8 +28,8 @@ module Datasets
28
28
 
29
29
  def initialize
30
30
  super()
31
- @metadata.id = 'nagoya-university-conversation-curpus'
32
- @metadata.name = 'Nagoya University Conversation Curpus'
31
+ @metadata.id = 'nagoya-university-conversation-corpus'
32
+ @metadata.name = 'Nagoya University Conversation Corpus'
33
33
  @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
34
34
  @metadata.licenses = ['CC-BY-NC-ND-4.0']
35
35
  @metadata.description = <<~DESCRIPTION
@@ -41,7 +41,7 @@ module Datasets
41
41
  super()
42
42
  @reading = reading
43
43
  unless VALID_READINGS.include?(@reading)
44
- message = ":reading must be one of ["
44
+ message = +":reading must be one of ["
45
45
  message << VALID_READINGS.collect(&:inspect).join(", ")
46
46
  message << "]: #{@reading.inspect}"
47
47
  raise ArgumentError, message
@@ -104,14 +104,14 @@ module Datasets
104
104
 
105
105
  private
106
106
  def open_data
107
- data_url = "https://www.post.japanpost.jp/zipcode/dl"
107
+ data_url = +"https://www.post.japanpost.jp/zipcode/dl"
108
108
  case @reading
109
109
  when :lowercase
110
110
  data_url << "/kogaki/zip/ken_all.zip"
111
111
  when :uppercase
112
112
  data_url << "/oogaki/zip/ken_all.zip"
113
113
  when :romaji
114
- data_url << "/roman/ken_all_rome.zip"
114
+ data_url << "/roman/KEN_ALL_ROME.zip"
115
115
  end
116
116
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
117
117
  download(data_path, data_url)