red-datasets 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/Rakefile +10 -0
- data/doc/text/news.md +36 -0
- data/lib/datasets/california-housing.rb +1 -1
- data/lib/datasets/dataset.rb +2 -2
- data/lib/datasets/downloader.rb +51 -17
- data/lib/datasets/fashion-mnist.rb +6 -2
- data/lib/datasets/ggplot2-dataset.rb +3 -3
- data/lib/datasets/house-of-councillor.rb +169 -0
- data/lib/datasets/house-of-representative.rb +107 -0
- data/lib/datasets/japanese-date-parser.rb +38 -0
- data/lib/datasets/kuzushiji-mnist.rb +6 -2
- data/lib/datasets/lazy.rb +2 -0
- data/lib/datasets/libsvm-dataset-list.rb +1 -1
- data/lib/datasets/mnist.rb +12 -6
- data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
- data/lib/datasets/penguins.rb +28 -5
- data/lib/datasets/postal-code-japan.rb +3 -3
- data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
- data/lib/datasets/wikipedia.rb +2 -2
- data/test/japanese-date-parser-test.rb +27 -0
- data/test/test-adult.rb +36 -86
- data/test/test-aozora-bunko.rb +5 -5
- data/test/test-california-housing.rb +12 -31
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-diamonds.rb +13 -33
- data/test/test-downloader.rb +1 -1
- data/test/test-geolonia.rb +17 -41
- data/test/test-house-of-councillor.rb +223 -0
- data/test/test-house-of-representative.rb +54 -0
- data/test/test-nagoya-university-conversation-corpus.rb +17 -69
- data/test/test-postal-code-japan.rb +7 -0
- data/test/test-quora-duplicate-question-pair.rb +7 -21
- data/test/test-rdataset.rb +24 -22
- data/test/test-sudachi-synonym-dictionary.rb +12 -31
- data/test/test-wikipedia.rb +5 -5
- metadata +12 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 01ddaa57da3c64de47cfd9eb2ca9ae2ec3cbcb5d35138fdd74f74009f062358f
|
4
|
+
data.tar.gz: 431dba2c0e41bc25a4e2716ed20936ee2022c2b04c49683bb7d0d2e2aaa2f99e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ab12e9783e4a23b81f9bd1be22c31704f9095026cb27185a6fe985e106320982fb999e1cf2348f6b18b509a7f1a6a5b58d405ece5d541e8ddbe43cd08f252a80
|
7
|
+
data.tar.gz: 157df5fffd3ba8fd021cdef3933c0a9e99a0fa7f173771e2177d1a86e79788c3250b12453f06084e24afcf624ec3283e702e6bd7595f08e2cf2b5a7dd404065c
|
data/README.md
CHANGED
@@ -29,6 +29,8 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
29
29
|
* Fuel Economy Dataset
|
30
30
|
* Geolonia Japanese Addresses
|
31
31
|
* Hepatitis
|
32
|
+
* House of Councillors of Japan
|
33
|
+
* House of Representatives of Japan
|
32
34
|
* Iris Dataset
|
33
35
|
* Libsvm
|
34
36
|
* MNIST database
|
data/Rakefile
CHANGED
@@ -13,6 +13,16 @@ end
|
|
13
13
|
helper.install
|
14
14
|
spec = helper.gemspec
|
15
15
|
|
16
|
+
release_task = Rake.application["release"]
|
17
|
+
# We use Trusted Publishing.
|
18
|
+
release_task.prerequisites.delete("build")
|
19
|
+
release_task.prerequisites.delete("release:rubygem_push")
|
20
|
+
release_task_comment = release_task.comment
|
21
|
+
if release_task_comment
|
22
|
+
release_task.clear_comments
|
23
|
+
release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
|
24
|
+
end
|
25
|
+
|
16
26
|
task default: :test
|
17
27
|
|
18
28
|
desc "Run tests"
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,41 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.9 - 2025-04-08
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::Penguins`: Changed to use `POST` for downloading data
|
8
|
+
from EDI.
|
9
|
+
|
10
|
+
## 0.1.8 - 2025-02-07
|
11
|
+
|
12
|
+
### Improvements
|
13
|
+
|
14
|
+
* Suppressed "literal string will be frozen" warnings.
|
15
|
+
|
16
|
+
* Patch by Tsutomu Katsube
|
17
|
+
|
18
|
+
* `Datasets::HouseOfCouncillor`: Added.
|
19
|
+
|
20
|
+
* [GH-143](https://github.com/red-data-tools/red-datasets/issues/143)
|
21
|
+
|
22
|
+
* [GH-181](https://github.com/red-data-tools/red-datasets/issues/181)
|
23
|
+
|
24
|
+
* Patch by Tsutomu Katsube
|
25
|
+
|
26
|
+
* `Datasets::HouseOfRepresentative`: Added.
|
27
|
+
|
28
|
+
* [GH-142](https://github.com/red-data-tools/red-datasets/issues/142)
|
29
|
+
|
30
|
+
* [GH-184](https://github.com/red-data-tools/red-datasets/issues/184)
|
31
|
+
|
32
|
+
* Patch by Tsutomu Katsube
|
33
|
+
|
34
|
+
### Thanks
|
35
|
+
|
36
|
+
* Tsutomu Katsube
|
37
|
+
|
38
|
+
|
3
39
|
## 0.1.7 - 2023-05-29
|
4
40
|
|
5
41
|
### Improvements
|
@@ -36,7 +36,7 @@ Available from http://lib.stat.cmu.edu/datasets/.
|
|
36
36
|
file_name = "cadata.txt"
|
37
37
|
download(data_path, data_url)
|
38
38
|
open_data(data_path, file_name) do |input|
|
39
|
-
data = ""
|
39
|
+
data = +""
|
40
40
|
input.each_line do |line|
|
41
41
|
next unless line.start_with?(" ")
|
42
42
|
data << line.lstrip.gsub(/ +/, ",")
|
data/lib/datasets/dataset.rb
CHANGED
@@ -33,8 +33,8 @@ module Datasets
|
|
33
33
|
@cache_path ||= CachePath.new(@metadata.id)
|
34
34
|
end
|
35
35
|
|
36
|
-
def download(output_path, url, &block)
|
37
|
-
downloader = Downloader.new(url)
|
36
|
+
def download(output_path, url, *fallback_urls, **options, &block)
|
37
|
+
downloader = Downloader.new(url, *fallback_urls, **options)
|
38
38
|
downloader.download(output_path, &block)
|
39
39
|
end
|
40
40
|
|
data/lib/datasets/downloader.rb
CHANGED
@@ -6,20 +6,17 @@ end
|
|
6
6
|
require "net/http"
|
7
7
|
require "pathname"
|
8
8
|
|
9
|
+
require_relative "error"
|
10
|
+
|
9
11
|
module Datasets
|
10
12
|
class Downloader
|
11
|
-
class TooManyRedirects <
|
13
|
+
class TooManyRedirects < Error; end
|
12
14
|
|
13
|
-
def initialize(url)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
end
|
19
|
-
@url = url
|
20
|
-
unless @url.is_a?(URI::HTTP)
|
21
|
-
raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
|
22
|
-
end
|
15
|
+
def initialize(url, *fallback_urls, http_method: nil, http_parameters: nil)
|
16
|
+
@url = normalize_url(url)
|
17
|
+
@fallback_urls = fallback_urls.collect { |fallback_url| normalize_url(fallback_url) }
|
18
|
+
@http_method = http_method
|
19
|
+
@http_parameters = http_parameters
|
23
20
|
end
|
24
21
|
|
25
22
|
def download(output_path, &block)
|
@@ -45,7 +42,7 @@ module Datasets
|
|
45
42
|
headers["Range"] = "bytes=#{start}-"
|
46
43
|
end
|
47
44
|
|
48
|
-
start_http(@url, headers) do |response|
|
45
|
+
start_http(@url, @fallback_urls, headers) do |response|
|
49
46
|
if response.is_a?(Net::HTTPPartialContent)
|
50
47
|
mode = "ab"
|
51
48
|
else
|
@@ -85,6 +82,18 @@ module Datasets
|
|
85
82
|
end
|
86
83
|
end
|
87
84
|
|
85
|
+
private def normalize_url(url)
|
86
|
+
if url.is_a?(URI::Generic)
|
87
|
+
url = url.dup
|
88
|
+
else
|
89
|
+
url = URI.parse(url)
|
90
|
+
end
|
91
|
+
unless url.is_a?(URI::HTTP)
|
92
|
+
raise ArgumentError, "download URL must be HTTP or HTTPS: <#{url}>"
|
93
|
+
end
|
94
|
+
url
|
95
|
+
end
|
96
|
+
|
88
97
|
private def synchronize(output_path, partial_output_path)
|
89
98
|
begin
|
90
99
|
Process.getpgid(Process.pid)
|
@@ -104,7 +113,8 @@ module Datasets
|
|
104
113
|
rescue ArgumentError
|
105
114
|
# The process that acquired the lock may have exited before
|
106
115
|
# it stored its process ID.
|
107
|
-
|
116
|
+
elapsed_time = Time.now - lock_path.mtime
|
117
|
+
valid_lock_path = (elapsed_time > 10)
|
108
118
|
else
|
109
119
|
begin
|
110
120
|
Process.getpgid(pid)
|
@@ -133,7 +143,7 @@ module Datasets
|
|
133
143
|
end
|
134
144
|
end
|
135
145
|
|
136
|
-
private def start_http(url, headers, limit = 10, &block)
|
146
|
+
private def start_http(url, fallback_urls, headers, limit = 10, &block)
|
137
147
|
if limit == 0
|
138
148
|
raise TooManyRedirects, "too many redirections: #{url}"
|
139
149
|
end
|
@@ -143,7 +153,21 @@ module Datasets
|
|
143
153
|
http.start do
|
144
154
|
path = url.path
|
145
155
|
path += "?#{url.query}" if url.query
|
146
|
-
|
156
|
+
if @http_method == :post
|
157
|
+
# TODO: We may want to add @http_content_type, @http_body
|
158
|
+
# and so on.
|
159
|
+
if @http_parameters
|
160
|
+
body = URI.encode_www_form(@http_parameters)
|
161
|
+
content_type = "application/x-www-form-urlencoded"
|
162
|
+
headers = {"Content-Type" => content_type}.merge(headers)
|
163
|
+
else
|
164
|
+
body = ""
|
165
|
+
end
|
166
|
+
request = Net::HTTP::Post.new(path, headers)
|
167
|
+
request.body = body
|
168
|
+
else
|
169
|
+
request = Net::HTTP::Get.new(path, headers)
|
170
|
+
end
|
147
171
|
http.request(request) do |response|
|
148
172
|
case response
|
149
173
|
when Net::HTTPSuccess, Net::HTTPPartialContent
|
@@ -151,8 +175,18 @@ module Datasets
|
|
151
175
|
when Net::HTTPRedirection
|
152
176
|
url = URI.parse(response[:location])
|
153
177
|
$stderr.puts "Redirect to #{url}"
|
154
|
-
return start_http(url, headers, limit - 1, &block)
|
178
|
+
return start_http(url, fallback_urls, headers, limit - 1, &block)
|
155
179
|
else
|
180
|
+
if response.is_a?(Net::HTTPForbidden)
|
181
|
+
next_url, *rest_fallback_urls = fallback_urls
|
182
|
+
if next_url
|
183
|
+
message = "#{response.code}: #{response.message}: " +
|
184
|
+
"fallback: <#{url}> -> <#{next_url}>"
|
185
|
+
$stderr.puts(message)
|
186
|
+
return start_http(next_url, rest_fallback_urls, headers, &block)
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
156
190
|
message = response.code
|
157
191
|
if response.message and not response.message.empty?
|
158
192
|
message += ": #{response.message}"
|
@@ -167,7 +201,7 @@ module Datasets
|
|
167
201
|
private def yield_chunks(path)
|
168
202
|
path.open("rb") do |output|
|
169
203
|
chunk_size = 1024 * 1024
|
170
|
-
chunk = ""
|
204
|
+
chunk = +""
|
171
205
|
while output.read(chunk_size, chunk)
|
172
206
|
yield(chunk)
|
173
207
|
end
|
@@ -2,9 +2,13 @@ require_relative 'mnist'
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class FashionMNIST < MNIST
|
5
|
-
BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
|
6
|
-
|
7
5
|
private
|
6
|
+
def base_urls
|
7
|
+
[
|
8
|
+
"http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
|
9
|
+
]
|
10
|
+
end
|
11
|
+
|
8
12
|
def dataset_name
|
9
13
|
"Fashion-MNIST"
|
10
14
|
end
|
@@ -17,7 +17,7 @@ module Datasets
|
|
17
17
|
data_path = cache_dir_path + data_base_name
|
18
18
|
data_url = "#{download_base_url}/data-raw/#{data_base_name}"
|
19
19
|
download(data_path, data_url)
|
20
|
-
CSV.open(data_path, headers: :first_row, converters: :
|
20
|
+
CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
|
21
21
|
record_class = self.class::Record
|
22
22
|
csv.each do |row|
|
23
23
|
record = record_class.new(*row.fields)
|
@@ -37,7 +37,7 @@ module Datasets
|
|
37
37
|
data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
|
38
38
|
download(data_r_path, data_r_url)
|
39
39
|
descriptions = {}
|
40
|
-
comment = ""
|
40
|
+
comment = +""
|
41
41
|
File.open(data_r_path) do |data_r|
|
42
42
|
data_r.each_line do |line|
|
43
43
|
case line.chomp
|
@@ -51,7 +51,7 @@ module Datasets
|
|
51
51
|
when /\A"(.+)"\z/
|
52
52
|
name = Regexp.last_match[1]
|
53
53
|
descriptions[name] = parse_roxygen(comment.rstrip)
|
54
|
-
comment = ""
|
54
|
+
comment = +""
|
55
55
|
end
|
56
56
|
end
|
57
57
|
descriptions[@ggplot2_dataset_name]
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
class HouseOfCouncillor < Dataset
|
5
|
+
Bill = Struct.new(:council_time,
|
6
|
+
:bill_type,
|
7
|
+
:submit_time,
|
8
|
+
:submit_number,
|
9
|
+
:title,
|
10
|
+
:bill_url,
|
11
|
+
:bill_summary_url,
|
12
|
+
:proposed_bill_url,
|
13
|
+
:proposed_on,
|
14
|
+
:proposed_on_from_house_of_representatives,
|
15
|
+
:proposed_on_to_house_of_representatives,
|
16
|
+
:prior_deliberations_type,
|
17
|
+
:continuation_type,
|
18
|
+
:proposers,
|
19
|
+
:submitter,
|
20
|
+
:submitter_type,
|
21
|
+
:progress_of_house_of_councillors_committees_etc_refer_on,
|
22
|
+
:progress_of_house_of_councillors_committees_etc_committee_etc,
|
23
|
+
:progress_of_house_of_councillors_committees_etc_pass_on,
|
24
|
+
:progress_of_house_of_councillors_committees_etc_result,
|
25
|
+
:progress_of_house_of_councillors_plenary_sitting_pass_on,
|
26
|
+
:progress_of_house_of_councillors_plenary_sitting_result,
|
27
|
+
:progress_of_house_of_councillors_plenary_sitting_committees,
|
28
|
+
:progress_of_house_of_councillors_plenary_sitting_vote_type,
|
29
|
+
:progress_of_house_of_councillors_plenary_sitting_vote_method,
|
30
|
+
:progress_of_house_of_councillors_plenary_sitting_result_url,
|
31
|
+
:progress_of_house_of_representatives_committees_etc_refer_on,
|
32
|
+
:progress_of_house_of_representatives_committees_etc_committee_etc,
|
33
|
+
:progress_of_house_of_representatives_committees_etc_pass_on,
|
34
|
+
:progress_of_house_of_representatives_committees_etc_result,
|
35
|
+
:progress_of_house_of_representatives_plenary_sitting_pass_on,
|
36
|
+
:progress_of_house_of_representatives_plenary_sitting_result,
|
37
|
+
:progress_of_house_of_representatives_plenary_sitting_committees,
|
38
|
+
:progress_of_house_of_representatives_plenary_sitting_vote_type,
|
39
|
+
:progress_of_house_of_representatives_plenary_sitting_vote_method,
|
40
|
+
:promulgated_on,
|
41
|
+
:law_number,
|
42
|
+
:entracted_law_url,
|
43
|
+
:notes)
|
44
|
+
|
45
|
+
InHouseGroup = Struct.new(:in_house_group_name_and_abbreviation_on,
|
46
|
+
:in_house_group_name,
|
47
|
+
:in_house_group_abbreviation,
|
48
|
+
:number_of_members_on,
|
49
|
+
:number_of_members,
|
50
|
+
:number_of_women_members,
|
51
|
+
:first_term_expires_on,
|
52
|
+
:first_term_proportional_representation_number_of_members,
|
53
|
+
:first_term_proportional_representation_number_of_women_members,
|
54
|
+
:first_term_election_district_number_of_members,
|
55
|
+
:first_term_election_district_number_of_women_members,
|
56
|
+
:first_term_total_number_of_members,
|
57
|
+
:first_term_total_number_of_women_members,
|
58
|
+
:second_term_expires_on,
|
59
|
+
:second_term_proportional_representation_number_of_members,
|
60
|
+
:second_term_proportional_representation_number_of_women_members,
|
61
|
+
:second_term_election_district_number_of_members,
|
62
|
+
:second_term_election_district_number_of_women_members,
|
63
|
+
:second_term_total_number_of_members,
|
64
|
+
:second_term_total_number_of_women_members)
|
65
|
+
|
66
|
+
Member = Struct.new(:professional_name,
|
67
|
+
:true_name,
|
68
|
+
:profile_url,
|
69
|
+
:professional_name_reading,
|
70
|
+
:in_house_group_abbreviation,
|
71
|
+
:constituency,
|
72
|
+
:expiration_of_term,
|
73
|
+
:photo_url,
|
74
|
+
:elected_years,
|
75
|
+
:elected_number,
|
76
|
+
:responsibilities,
|
77
|
+
:responsibility_on,
|
78
|
+
:career,
|
79
|
+
:career_on)
|
80
|
+
|
81
|
+
Question = Struct.new(:submit_time,
|
82
|
+
:submit_number,
|
83
|
+
:title,
|
84
|
+
:submitter,
|
85
|
+
:number_of_submissions,
|
86
|
+
:question_for_text_html_url,
|
87
|
+
:answer_for_text_html_url,
|
88
|
+
:question_for_text_pdf_url,
|
89
|
+
:answer_for_text_pdf_url,
|
90
|
+
:question_url,
|
91
|
+
:submitted_on,
|
92
|
+
:transfered_on,
|
93
|
+
:received_answer_on,
|
94
|
+
:notes)
|
95
|
+
|
96
|
+
VALID_TYPES = [
|
97
|
+
:bill,
|
98
|
+
:in_house_group,
|
99
|
+
:member,
|
100
|
+
:question
|
101
|
+
]
|
102
|
+
|
103
|
+
def initialize(type: :bill)
|
104
|
+
super()
|
105
|
+
@type = type
|
106
|
+
unless VALID_TYPES.include?(type)
|
107
|
+
message = +":type must be one of ["
|
108
|
+
message << VALID_TYPES.collect(&:inspect).join(", ")
|
109
|
+
message << "]: #{@type.inspect}"
|
110
|
+
raise ArgumentError, message
|
111
|
+
end
|
112
|
+
|
113
|
+
@metadata.id = "house-of-councillor"
|
114
|
+
@metadata.name = "Bill, in-House group, member and question of the House of Councillors of Japan"
|
115
|
+
@metadata.url = "https://smartnews-smri.github.io/house-of-councillors"
|
116
|
+
@metadata.licenses = ["MIT"]
|
117
|
+
@metadata.description = "The House of Councillors of Japan (type: #{@type})"
|
118
|
+
end
|
119
|
+
|
120
|
+
def each
|
121
|
+
return to_enum(__method__) unless block_given?
|
122
|
+
|
123
|
+
open_data do |csv|
|
124
|
+
csv.each do |row|
|
125
|
+
case @type
|
126
|
+
when :bill
|
127
|
+
record = Bill.new(*row.fields)
|
128
|
+
when :in_house_group
|
129
|
+
record = InHouseGroup.new(*row.fields)
|
130
|
+
when :member
|
131
|
+
%w(当選年).each do |ints_column_name|
|
132
|
+
row[ints_column_name] = parse_ints(row[ints_column_name])
|
133
|
+
end
|
134
|
+
record = Member.new(*row.fields)
|
135
|
+
when :question
|
136
|
+
record = Question.new(*row.fields)
|
137
|
+
end
|
138
|
+
yield(record)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
def open_data
|
146
|
+
data_url = +"https://smartnews-smri.github.io/house-of-councillors/data"
|
147
|
+
case @type
|
148
|
+
when :bill
|
149
|
+
data_url << "/gian.csv"
|
150
|
+
when :in_house_group
|
151
|
+
data_url << "/kaiha.csv"
|
152
|
+
when :member
|
153
|
+
data_url << "/giin.csv"
|
154
|
+
when :question
|
155
|
+
data_url << "/syuisyo.csv"
|
156
|
+
end
|
157
|
+
data_path = cache_dir_path + "#{@type}.csv"
|
158
|
+
download(data_path, data_url)
|
159
|
+
|
160
|
+
CSV.open(data_path, col_sep: ",", headers: true, converters: %i(date integer)) do |csv|
|
161
|
+
yield(csv)
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def parse_ints(column_value)
|
166
|
+
column_value.to_s.split("、").collect(&:to_i)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "japanese-date-parser"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class HouseOfRepresentative < Dataset
|
6
|
+
Record = Struct.new(:carry_time,
|
7
|
+
:caption,
|
8
|
+
:type,
|
9
|
+
:submit_time,
|
10
|
+
:submit_number,
|
11
|
+
:title,
|
12
|
+
:discussion_status,
|
13
|
+
:progress,
|
14
|
+
:progress_url,
|
15
|
+
:text,
|
16
|
+
:text_url,
|
17
|
+
:bill_type,
|
18
|
+
:submitter,
|
19
|
+
:submitter_in_house_groups,
|
20
|
+
:house_of_representatives_of_accepted_bill_on_preliminary_consideration,
|
21
|
+
:house_of_representatives_of_preliminary_refer_on,
|
22
|
+
:house_of_representatives_of_preliminary_refer_commission,
|
23
|
+
:house_of_representatives_of_accepted_bill_on,
|
24
|
+
:house_of_representatives_of_refer_on,
|
25
|
+
:house_of_representatives_of_refer_commission,
|
26
|
+
:house_of_representatives_of_finished_consideration_on,
|
27
|
+
:house_of_representatives_of_consideration_result,
|
28
|
+
:house_of_representatives_of_finished_deliberation_on,
|
29
|
+
:house_of_representatives_of_deliberation_result,
|
30
|
+
:house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
|
31
|
+
:house_of_representatives_of_support_in_house_group_during_deliberation,
|
32
|
+
:house_of_representatives_of_opposition_in_house_group_during_deliberation,
|
33
|
+
:house_of_councillors_of_accepted_bill_on_preliminary_consideration,
|
34
|
+
:house_of_councillors_of_preliminary_refer_on,
|
35
|
+
:house_of_councillors_of_preliminary_refer_commission,
|
36
|
+
:house_of_councillors_of_accepted_bill_on,
|
37
|
+
:house_of_councillors_of_refer_on,
|
38
|
+
:house_of_councillors_of_refer_commission,
|
39
|
+
:house_of_councillors_of_finished_consideration_on,
|
40
|
+
:house_of_councillors_of_consideration_result,
|
41
|
+
:house_of_councillors_of_finished_deliberation_on,
|
42
|
+
:house_of_councillors_of_deliberation_result,
|
43
|
+
:promulgated_on,
|
44
|
+
:law_number,
|
45
|
+
:submitters,
|
46
|
+
:supporters_of_submitted_bill)
|
47
|
+
|
48
|
+
def initialize
|
49
|
+
super()
|
50
|
+
|
51
|
+
@metadata.id = "house-of-representative"
|
52
|
+
@metadata.name = "Bill of the House of Representatives of Japan"
|
53
|
+
@metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
|
54
|
+
@metadata.licenses = ["MIT"]
|
55
|
+
@metadata.description = "Bill of the House of Representatives of Japan"
|
56
|
+
end
|
57
|
+
|
58
|
+
def each
|
59
|
+
return to_enum(__method__) unless block_given?
|
60
|
+
|
61
|
+
open_data do |csv|
|
62
|
+
csv.each do |row|
|
63
|
+
record = Record.new(*row.fields)
|
64
|
+
yield(record)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def open_data
|
72
|
+
data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
|
73
|
+
data_path = cache_dir_path + "gian.csv"
|
74
|
+
download(data_path, data_url)
|
75
|
+
|
76
|
+
parser = JapaneseDateParser.new
|
77
|
+
japanese_date_converter = lambda do |field, info|
|
78
|
+
if info.header.end_with?("年月日")
|
79
|
+
parser.parse(field)
|
80
|
+
else
|
81
|
+
field
|
82
|
+
end
|
83
|
+
end
|
84
|
+
array_converter = lambda do |field, info|
|
85
|
+
case info.header
|
86
|
+
when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
|
87
|
+
parse_array(field)
|
88
|
+
else
|
89
|
+
field
|
90
|
+
end
|
91
|
+
end
|
92
|
+
File.open(data_path) do |data_file|
|
93
|
+
options = {
|
94
|
+
col_sep: ",",
|
95
|
+
headers: true,
|
96
|
+
converters: [:integer, japanese_date_converter, array_converter],
|
97
|
+
}
|
98
|
+
# There are two columns within one column. To split into two columns, `#gsub` is necessary.
|
99
|
+
yield(CSV.new(data_file.read.gsub("/", ","), **options))
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def parse_array(column_value)
|
104
|
+
column_value&.split("; ")
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Datasets
|
2
|
+
class JapaneseDateParser
|
3
|
+
class UnsupportedEraInitialRange < Error; end
|
4
|
+
|
5
|
+
ERA_INITIALS = {
|
6
|
+
"平成" => "H",
|
7
|
+
"令和" => "R",
|
8
|
+
}.freeze
|
9
|
+
|
10
|
+
def parse(string)
|
11
|
+
case string
|
12
|
+
when nil
|
13
|
+
nil
|
14
|
+
when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
|
15
|
+
match_data = Regexp.last_match
|
16
|
+
era_initial = ERA_INITIALS[match_data[1]]
|
17
|
+
if era_initial.nil?
|
18
|
+
message = +"era must be one of ["
|
19
|
+
message << ERA_INITIALS.keys.join(", ")
|
20
|
+
message << "]: #{match_data[1]}"
|
21
|
+
raise UnsupportedEraInitialRange, message
|
22
|
+
end
|
23
|
+
|
24
|
+
year = match_data[2]
|
25
|
+
if year == "元"
|
26
|
+
year = "01"
|
27
|
+
else
|
28
|
+
year = year.rjust(2, "0")
|
29
|
+
end
|
30
|
+
month = match_data[3].rjust(2, "0")
|
31
|
+
day = match_data[4].rjust(2, "0")
|
32
|
+
Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
|
33
|
+
else
|
34
|
+
string
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -2,9 +2,13 @@ require_relative 'mnist'
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class KuzushijiMNIST < MNIST
|
5
|
-
BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
|
6
|
-
|
7
5
|
private
|
6
|
+
def base_urls
|
7
|
+
[
|
8
|
+
"http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
|
9
|
+
]
|
10
|
+
end
|
11
|
+
|
8
12
|
def dataset_name
|
9
13
|
"Kuzushiji-MNIST"
|
10
14
|
end
|
data/lib/datasets/lazy.rb
CHANGED
@@ -57,6 +57,8 @@ module Datasets
|
|
57
57
|
LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
|
58
58
|
LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
|
59
59
|
LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
|
60
|
+
LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
|
61
|
+
LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
|
60
62
|
LAZY_LOADER.register(:Iris, "datasets/iris")
|
61
63
|
LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
|
62
64
|
LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
|
data/lib/datasets/mnist.rb
CHANGED
@@ -4,8 +4,6 @@ require_relative "dataset"
|
|
4
4
|
|
5
5
|
module Datasets
|
6
6
|
class MNIST < Dataset
|
7
|
-
BASE_URL = "http://yann.lecun.com/exdb/mnist/"
|
8
|
-
|
9
7
|
class Record < Struct.new(:data, :label)
|
10
8
|
def pixels
|
11
9
|
data.unpack("C*")
|
@@ -27,7 +25,7 @@ module Datasets
|
|
27
25
|
|
28
26
|
@metadata.id = "#{dataset_name.downcase}-#{type}"
|
29
27
|
@metadata.name = "#{dataset_name}: #{type}"
|
30
|
-
@metadata.url =
|
28
|
+
@metadata.url = base_urls.first
|
31
29
|
@metadata.licenses = licenses
|
32
30
|
@type = type
|
33
31
|
|
@@ -44,15 +42,23 @@ module Datasets
|
|
44
42
|
|
45
43
|
image_path = cache_dir_path + target_file(:image)
|
46
44
|
label_path = cache_dir_path + target_file(:label)
|
47
|
-
base_url = self.class::BASE_URL
|
48
45
|
|
49
|
-
download(image_path,
|
50
|
-
|
46
|
+
download(image_path,
|
47
|
+
*base_urls.collect { |base_url| base_url + target_file(:image) })
|
48
|
+
download(label_path,
|
49
|
+
*base_urls.collect { |base_url| base_url + target_file(:label) })
|
51
50
|
|
52
51
|
open_data(image_path, label_path, &block)
|
53
52
|
end
|
54
53
|
|
55
54
|
private
|
55
|
+
def base_urls
|
56
|
+
[
|
57
|
+
"http://yann.lecun.com/exdb/mnist/",
|
58
|
+
"https://ossci-datasets.s3.amazonaws.com/mnist/",
|
59
|
+
]
|
60
|
+
end
|
61
|
+
|
56
62
|
def licenses
|
57
63
|
[]
|
58
64
|
end
|