red-datasets 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/Rakefile +10 -0
- data/doc/text/news.md +29 -0
- data/lib/datasets/california-housing.rb +1 -1
- data/lib/datasets/dataset.rb +2 -2
- data/lib/datasets/downloader.rb +34 -16
- data/lib/datasets/fashion-mnist.rb +6 -2
- data/lib/datasets/ggplot2-dataset.rb +3 -3
- data/lib/datasets/house-of-councillor.rb +169 -0
- data/lib/datasets/house-of-representative.rb +107 -0
- data/lib/datasets/japanese-date-parser.rb +38 -0
- data/lib/datasets/kuzushiji-mnist.rb +6 -2
- data/lib/datasets/lazy.rb +2 -0
- data/lib/datasets/libsvm-dataset-list.rb +1 -1
- data/lib/datasets/mnist.rb +12 -6
- data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
- data/lib/datasets/postal-code-japan.rb +3 -3
- data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
- data/lib/datasets/wikipedia.rb +2 -2
- data/test/japanese-date-parser-test.rb +27 -0
- data/test/test-adult.rb +36 -86
- data/test/test-aozora-bunko.rb +5 -5
- data/test/test-california-housing.rb +12 -31
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-diamonds.rb +13 -33
- data/test/test-downloader.rb +1 -1
- data/test/test-geolonia.rb +17 -41
- data/test/test-house-of-councillor.rb +223 -0
- data/test/test-house-of-representative.rb +54 -0
- data/test/test-nagoya-university-conversation-corpus.rb +17 -69
- data/test/test-postal-code-japan.rb +7 -0
- data/test/test-quora-duplicate-question-pair.rb +7 -21
- data/test/test-rdataset.rb +24 -22
- data/test/test-sudachi-synonym-dictionary.rb +12 -31
- data/test/test-wikipedia.rb +5 -5
- metadata +12 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: deff1c4d13294030c25c06691e272881eb049274decd9e2a958d0486c792ef26
|
4
|
+
data.tar.gz: e48151e045fbf343291a5f5770409dea7672ff39c75bcc617152b52a39890f31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ae5a39a716bfa719f937c6ff139bd90ccf625e8a7ce72298507506f1a2c6a100e81c1273bf395a40d0ae8f1a13c1b1fb554e123e49dff718607a734ffd6c67b
|
7
|
+
data.tar.gz: 72e664ba5f2a569a92d9d4237588f92f7398621642486e4c9e6c930bade4c17e4a68d30da14cb277fcd602025f959c42284462bf4370872f9826c9634ff78aca
|
data/README.md
CHANGED
@@ -29,6 +29,8 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
29
29
|
* Fuel Economy Dataset
|
30
30
|
* Geolonia Japanese Addresses
|
31
31
|
* Hepatitis
|
32
|
+
* House of Councillors of Japan
|
33
|
+
* House of Representatives of Japan
|
32
34
|
* Iris Dataset
|
33
35
|
* Libsvm
|
34
36
|
* MNIST database
|
data/Rakefile
CHANGED
@@ -13,6 +13,16 @@ end
|
|
13
13
|
helper.install
|
14
14
|
spec = helper.gemspec
|
15
15
|
|
16
|
+
release_task = Rake.application["release"]
|
17
|
+
# We use Trusted Publishing.
|
18
|
+
release_task.prerequisites.delete("build")
|
19
|
+
release_task.prerequisites.delete("release:rubygem_push")
|
20
|
+
release_task_comment = release_task.comment
|
21
|
+
if release_task_comment
|
22
|
+
release_task.clear_comments
|
23
|
+
release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
|
24
|
+
end
|
25
|
+
|
16
26
|
task default: :test
|
17
27
|
|
18
28
|
desc "Run tests"
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,34 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.8 - 2025-02-07
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Suppressed "literal string will be frozen" warnings.
|
8
|
+
|
9
|
+
* Patch by Tsutomu Katsube
|
10
|
+
|
11
|
+
* `Datasets::HouseOfCouncillor`: Added.
|
12
|
+
|
13
|
+
* [GH-143](https://github.com/red-data-tools/red-datasets/issues/143)
|
14
|
+
|
15
|
+
* [GH-181](https://github.com/red-data-tools/red-datasets/issues/181)
|
16
|
+
|
17
|
+
* Patch by Tsutomu Katsube
|
18
|
+
|
19
|
+
* `Datasets::HouseOfRepresentative`: Added.
|
20
|
+
|
21
|
+
* [GH-142](https://github.com/red-data-tools/red-datasets/issues/142)
|
22
|
+
|
23
|
+
* [GH-184](https://github.com/red-data-tools/red-datasets/issues/184)
|
24
|
+
|
25
|
+
* Patch by Tsutomu Katsube
|
26
|
+
|
27
|
+
### Thanks
|
28
|
+
|
29
|
+
* Tsutomu Katsube
|
30
|
+
|
31
|
+
|
3
32
|
## 0.1.7 - 2023-05-29
|
4
33
|
|
5
34
|
### Improvements
|
@@ -36,7 +36,7 @@ Available from http://lib.stat.cmu.edu/datasets/.
|
|
36
36
|
file_name = "cadata.txt"
|
37
37
|
download(data_path, data_url)
|
38
38
|
open_data(data_path, file_name) do |input|
|
39
|
-
data = ""
|
39
|
+
data = +""
|
40
40
|
input.each_line do |line|
|
41
41
|
next unless line.start_with?(" ")
|
42
42
|
data << line.lstrip.gsub(/ +/, ",")
|
data/lib/datasets/dataset.rb
CHANGED
@@ -33,8 +33,8 @@ module Datasets
|
|
33
33
|
@cache_path ||= CachePath.new(@metadata.id)
|
34
34
|
end
|
35
35
|
|
36
|
-
def download(output_path, url, &block)
|
37
|
-
downloader = Downloader.new(url)
|
36
|
+
def download(output_path, url, *fallback_urls, &block)
|
37
|
+
downloader = Downloader.new(url, *fallback_urls)
|
38
38
|
downloader.download(output_path, &block)
|
39
39
|
end
|
40
40
|
|
data/lib/datasets/downloader.rb
CHANGED
@@ -6,20 +6,15 @@ end
|
|
6
6
|
require "net/http"
|
7
7
|
require "pathname"
|
8
8
|
|
9
|
+
require_relative "error"
|
10
|
+
|
9
11
|
module Datasets
|
10
12
|
class Downloader
|
11
|
-
class TooManyRedirects <
|
13
|
+
class TooManyRedirects < Error; end
|
12
14
|
|
13
|
-
def initialize(url)
|
14
|
-
|
15
|
-
|
16
|
-
else
|
17
|
-
url = URI.parse(url)
|
18
|
-
end
|
19
|
-
@url = url
|
20
|
-
unless @url.is_a?(URI::HTTP)
|
21
|
-
raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
|
22
|
-
end
|
15
|
+
def initialize(url, *fallback_urls)
|
16
|
+
@url = normalize_url(url)
|
17
|
+
@fallback_urls = fallback_urls.collect { |fallback_url| normalize_url(fallback_url) }
|
23
18
|
end
|
24
19
|
|
25
20
|
def download(output_path, &block)
|
@@ -45,7 +40,7 @@ module Datasets
|
|
45
40
|
headers["Range"] = "bytes=#{start}-"
|
46
41
|
end
|
47
42
|
|
48
|
-
start_http(@url, headers) do |response|
|
43
|
+
start_http(@url, @fallback_urls, headers) do |response|
|
49
44
|
if response.is_a?(Net::HTTPPartialContent)
|
50
45
|
mode = "ab"
|
51
46
|
else
|
@@ -85,6 +80,18 @@ module Datasets
|
|
85
80
|
end
|
86
81
|
end
|
87
82
|
|
83
|
+
private def normalize_url(url)
|
84
|
+
if url.is_a?(URI::Generic)
|
85
|
+
url = url.dup
|
86
|
+
else
|
87
|
+
url = URI.parse(url)
|
88
|
+
end
|
89
|
+
unless url.is_a?(URI::HTTP)
|
90
|
+
raise ArgumentError, "download URL must be HTTP or HTTPS: <#{url}>"
|
91
|
+
end
|
92
|
+
url
|
93
|
+
end
|
94
|
+
|
88
95
|
private def synchronize(output_path, partial_output_path)
|
89
96
|
begin
|
90
97
|
Process.getpgid(Process.pid)
|
@@ -104,7 +111,8 @@ module Datasets
|
|
104
111
|
rescue ArgumentError
|
105
112
|
# The process that acquired the lock will be exited before
|
106
113
|
# it stores its process ID.
|
107
|
-
|
114
|
+
elapsed_time = Time.now - lock_path.mtime
|
115
|
+
valid_lock_path = (elapsed_time > 10)
|
108
116
|
else
|
109
117
|
begin
|
110
118
|
Process.getpgid(pid)
|
@@ -133,7 +141,7 @@ module Datasets
|
|
133
141
|
end
|
134
142
|
end
|
135
143
|
|
136
|
-
private def start_http(url, headers, limit = 10, &block)
|
144
|
+
private def start_http(url, fallback_urls, headers, limit = 10, &block)
|
137
145
|
if limit == 0
|
138
146
|
raise TooManyRedirects, "too many redirections: #{url}"
|
139
147
|
end
|
@@ -151,8 +159,18 @@ module Datasets
|
|
151
159
|
when Net::HTTPRedirection
|
152
160
|
url = URI.parse(response[:location])
|
153
161
|
$stderr.puts "Redirect to #{url}"
|
154
|
-
return start_http(url, headers, limit - 1, &block)
|
162
|
+
return start_http(url, fallback_urls, headers, limit - 1, &block)
|
155
163
|
else
|
164
|
+
if response.is_a?(Net::HTTPForbidden)
|
165
|
+
next_url, *rest_fallback_urls = fallback_urls
|
166
|
+
if next_url
|
167
|
+
message = "#{response.code}: #{response.message}: " +
|
168
|
+
"fallback: <#{url}> -> <#{next_url}>"
|
169
|
+
$stderr.puts(message)
|
170
|
+
return start_http(next_url, rest_fallback_urls, headers, &block)
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
156
174
|
message = response.code
|
157
175
|
if response.message and not response.message.empty?
|
158
176
|
message += ": #{response.message}"
|
@@ -167,7 +185,7 @@ module Datasets
|
|
167
185
|
private def yield_chunks(path)
|
168
186
|
path.open("rb") do |output|
|
169
187
|
chunk_size = 1024 * 1024
|
170
|
-
chunk = ""
|
188
|
+
chunk = +""
|
171
189
|
while output.read(chunk_size, chunk)
|
172
190
|
yield(chunk)
|
173
191
|
end
|
@@ -2,9 +2,13 @@ require_relative 'mnist'
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class FashionMNIST < MNIST
|
5
|
-
BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
|
6
|
-
|
7
5
|
private
|
6
|
+
def base_urls
|
7
|
+
[
|
8
|
+
"http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
|
9
|
+
]
|
10
|
+
end
|
11
|
+
|
8
12
|
def dataset_name
|
9
13
|
"Fashion-MNIST"
|
10
14
|
end
|
@@ -17,7 +17,7 @@ module Datasets
|
|
17
17
|
data_path = cache_dir_path + data_base_name
|
18
18
|
data_url = "#{download_base_url}/data-raw/#{data_base_name}"
|
19
19
|
download(data_path, data_url)
|
20
|
-
CSV.open(data_path, headers: :first_row, converters: :
|
20
|
+
CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
|
21
21
|
record_class = self.class::Record
|
22
22
|
csv.each do |row|
|
23
23
|
record = record_class.new(*row.fields)
|
@@ -37,7 +37,7 @@ module Datasets
|
|
37
37
|
data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
|
38
38
|
download(data_r_path, data_r_url)
|
39
39
|
descriptions = {}
|
40
|
-
comment = ""
|
40
|
+
comment = +""
|
41
41
|
File.open(data_r_path) do |data_r|
|
42
42
|
data_r.each_line do |line|
|
43
43
|
case line.chomp
|
@@ -51,7 +51,7 @@ module Datasets
|
|
51
51
|
when /\A"(.+)"\z/
|
52
52
|
name = Regexp.last_match[1]
|
53
53
|
descriptions[name] = parse_roxygen(comment.rstrip)
|
54
|
-
comment = ""
|
54
|
+
comment = +""
|
55
55
|
end
|
56
56
|
end
|
57
57
|
descriptions[@ggplot2_dataset_name]
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
class HouseOfCouncillor < Dataset
|
5
|
+
Bill = Struct.new(:council_time,
|
6
|
+
:bill_type,
|
7
|
+
:submit_time,
|
8
|
+
:submit_number,
|
9
|
+
:title,
|
10
|
+
:bill_url,
|
11
|
+
:bill_summary_url,
|
12
|
+
:proposed_bill_url,
|
13
|
+
:proposed_on,
|
14
|
+
:proposed_on_from_house_of_representatives,
|
15
|
+
:proposed_on_to_house_of_representatives,
|
16
|
+
:prior_deliberations_type,
|
17
|
+
:continuation_type,
|
18
|
+
:proposers,
|
19
|
+
:submitter,
|
20
|
+
:submitter_type,
|
21
|
+
:progress_of_house_of_councillors_committees_etc_refer_on,
|
22
|
+
:progress_of_house_of_councillors_committees_etc_committee_etc,
|
23
|
+
:progress_of_house_of_councillors_committees_etc_pass_on,
|
24
|
+
:progress_of_house_of_councillors_committees_etc_result,
|
25
|
+
:progress_of_house_of_councillors_plenary_sitting_pass_on,
|
26
|
+
:progress_of_house_of_councillors_plenary_sitting_result,
|
27
|
+
:progress_of_house_of_councillors_plenary_sitting_committees,
|
28
|
+
:progress_of_house_of_councillors_plenary_sitting_vote_type,
|
29
|
+
:progress_of_house_of_councillors_plenary_sitting_vote_method,
|
30
|
+
:progress_of_house_of_councillors_plenary_sitting_result_url,
|
31
|
+
:progress_of_house_of_representatives_committees_etc_refer_on,
|
32
|
+
:progress_of_house_of_representatives_committees_etc_committee_etc,
|
33
|
+
:progress_of_house_of_representatives_committees_etc_pass_on,
|
34
|
+
:progress_of_house_of_representatives_committees_etc_result,
|
35
|
+
:progress_of_house_of_representatives_plenary_sitting_pass_on,
|
36
|
+
:progress_of_house_of_representatives_plenary_sitting_result,
|
37
|
+
:progress_of_house_of_representatives_plenary_sitting_committees,
|
38
|
+
:progress_of_house_of_representatives_plenary_sitting_vote_type,
|
39
|
+
:progress_of_house_of_representatives_plenary_sitting_vote_method,
|
40
|
+
:promulgated_on,
|
41
|
+
:law_number,
|
42
|
+
:entracted_law_url,
|
43
|
+
:notes)
|
44
|
+
|
45
|
+
InHouseGroup = Struct.new(:in_house_group_name_and_abbreviation_on,
|
46
|
+
:in_house_group_name,
|
47
|
+
:in_house_group_abbreviation,
|
48
|
+
:number_of_members_on,
|
49
|
+
:number_of_members,
|
50
|
+
:number_of_women_members,
|
51
|
+
:first_term_expires_on,
|
52
|
+
:first_term_proportional_representation_number_of_members,
|
53
|
+
:first_term_proportional_representation_number_of_women_members,
|
54
|
+
:first_term_election_district_number_of_members,
|
55
|
+
:first_term_election_district_number_of_women_members,
|
56
|
+
:first_term_total_number_of_members,
|
57
|
+
:first_term_total_number_of_women_members,
|
58
|
+
:second_term_expires_on,
|
59
|
+
:second_term_proportional_representation_number_of_members,
|
60
|
+
:second_term_proportional_representation_number_of_women_members,
|
61
|
+
:second_term_election_district_number_of_members,
|
62
|
+
:second_term_election_district_number_of_women_members,
|
63
|
+
:second_term_total_number_of_members,
|
64
|
+
:second_term_total_number_of_women_members)
|
65
|
+
|
66
|
+
Member = Struct.new(:professional_name,
|
67
|
+
:true_name,
|
68
|
+
:profile_url,
|
69
|
+
:professional_name_reading,
|
70
|
+
:in_house_group_abbreviation,
|
71
|
+
:constituency,
|
72
|
+
:expiration_of_term,
|
73
|
+
:photo_url,
|
74
|
+
:elected_years,
|
75
|
+
:elected_number,
|
76
|
+
:responsibilities,
|
77
|
+
:responsibility_on,
|
78
|
+
:career,
|
79
|
+
:career_on)
|
80
|
+
|
81
|
+
Question = Struct.new(:submit_time,
|
82
|
+
:submit_number,
|
83
|
+
:title,
|
84
|
+
:submitter,
|
85
|
+
:number_of_submissions,
|
86
|
+
:question_for_text_html_url,
|
87
|
+
:answer_for_text_html_url,
|
88
|
+
:question_for_text_pdf_url,
|
89
|
+
:answer_for_text_pdf_url,
|
90
|
+
:question_url,
|
91
|
+
:submitted_on,
|
92
|
+
:transfered_on,
|
93
|
+
:received_answer_on,
|
94
|
+
:notes)
|
95
|
+
|
96
|
+
VALID_TYPES = [
|
97
|
+
:bill,
|
98
|
+
:in_house_group,
|
99
|
+
:member,
|
100
|
+
:question
|
101
|
+
]
|
102
|
+
|
103
|
+
def initialize(type: :bill)
|
104
|
+
super()
|
105
|
+
@type = type
|
106
|
+
unless VALID_TYPES.include?(type)
|
107
|
+
message = +":type must be one of ["
|
108
|
+
message << VALID_TYPES.collect(&:inspect).join(", ")
|
109
|
+
message << "]: #{@type.inspect}"
|
110
|
+
raise ArgumentError, message
|
111
|
+
end
|
112
|
+
|
113
|
+
@metadata.id = "house-of-councillor"
|
114
|
+
@metadata.name = "Bill, in-House group, member and question of the House of Councillors of Japan"
|
115
|
+
@metadata.url = "https://smartnews-smri.github.io/house-of-councillors"
|
116
|
+
@metadata.licenses = ["MIT"]
|
117
|
+
@metadata.description = "The House of Councillors of Japan (type: #{@type})"
|
118
|
+
end
|
119
|
+
|
120
|
+
def each
|
121
|
+
return to_enum(__method__) unless block_given?
|
122
|
+
|
123
|
+
open_data do |csv|
|
124
|
+
csv.each do |row|
|
125
|
+
case @type
|
126
|
+
when :bill
|
127
|
+
record = Bill.new(*row.fields)
|
128
|
+
when :in_house_group
|
129
|
+
record = InHouseGroup.new(*row.fields)
|
130
|
+
when :member
|
131
|
+
%w(当選年).each do |ints_column_name|
|
132
|
+
row[ints_column_name] = parse_ints(row[ints_column_name])
|
133
|
+
end
|
134
|
+
record = Member.new(*row.fields)
|
135
|
+
when :question
|
136
|
+
record = Question.new(*row.fields)
|
137
|
+
end
|
138
|
+
yield(record)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
def open_data
|
146
|
+
data_url = +"https://smartnews-smri.github.io/house-of-councillors/data"
|
147
|
+
case @type
|
148
|
+
when :bill
|
149
|
+
data_url << "/gian.csv"
|
150
|
+
when :in_house_group
|
151
|
+
data_url << "/kaiha.csv"
|
152
|
+
when :member
|
153
|
+
data_url << "/giin.csv"
|
154
|
+
when :question
|
155
|
+
data_url << "/syuisyo.csv"
|
156
|
+
end
|
157
|
+
data_path = cache_dir_path + "#{@type}.csv"
|
158
|
+
download(data_path, data_url)
|
159
|
+
|
160
|
+
CSV.open(data_path, col_sep: ",", headers: true, converters: %i(date integer)) do |csv|
|
161
|
+
yield(csv)
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def parse_ints(column_value)
|
166
|
+
column_value.to_s.split("、").collect(&:to_i)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "japanese-date-parser"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class HouseOfRepresentative < Dataset
|
6
|
+
Record = Struct.new(:carry_time,
|
7
|
+
:caption,
|
8
|
+
:type,
|
9
|
+
:submit_time,
|
10
|
+
:submit_number,
|
11
|
+
:title,
|
12
|
+
:discussion_status,
|
13
|
+
:progress,
|
14
|
+
:progress_url,
|
15
|
+
:text,
|
16
|
+
:text_url,
|
17
|
+
:bill_type,
|
18
|
+
:submitter,
|
19
|
+
:submitter_in_house_groups,
|
20
|
+
:house_of_representatives_of_accepted_bill_on_preliminary_consideration,
|
21
|
+
:house_of_representatives_of_preliminary_refer_on,
|
22
|
+
:house_of_representatives_of_preliminary_refer_commission,
|
23
|
+
:house_of_representatives_of_accepted_bill_on,
|
24
|
+
:house_of_representatives_of_refer_on,
|
25
|
+
:house_of_representatives_of_refer_commission,
|
26
|
+
:house_of_representatives_of_finished_consideration_on,
|
27
|
+
:house_of_representatives_of_consideration_result,
|
28
|
+
:house_of_representatives_of_finished_deliberation_on,
|
29
|
+
:house_of_representatives_of_deliberation_result,
|
30
|
+
:house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
|
31
|
+
:house_of_representatives_of_support_in_house_group_during_deliberation,
|
32
|
+
:house_of_representatives_of_opposition_in_house_group_during_deliberation,
|
33
|
+
:house_of_councillors_of_accepted_bill_on_preliminary_consideration,
|
34
|
+
:house_of_councillors_of_preliminary_refer_on,
|
35
|
+
:house_of_councillors_of_preliminary_refer_commission,
|
36
|
+
:house_of_councillors_of_accepted_bill_on,
|
37
|
+
:house_of_councillors_of_refer_on,
|
38
|
+
:house_of_councillors_of_refer_commission,
|
39
|
+
:house_of_councillors_of_finished_consideration_on,
|
40
|
+
:house_of_councillors_of_consideration_result,
|
41
|
+
:house_of_councillors_of_finished_deliberation_on,
|
42
|
+
:house_of_councillors_of_deliberation_result,
|
43
|
+
:promulgated_on,
|
44
|
+
:law_number,
|
45
|
+
:submitters,
|
46
|
+
:supporters_of_submitted_bill)
|
47
|
+
|
48
|
+
def initialize
|
49
|
+
super()
|
50
|
+
|
51
|
+
@metadata.id = "house-of-representative"
|
52
|
+
@metadata.name = "Bill of the House of Representatives of Japan"
|
53
|
+
@metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
|
54
|
+
@metadata.licenses = ["MIT"]
|
55
|
+
@metadata.description = "Bill of the House of Representatives of Japan"
|
56
|
+
end
|
57
|
+
|
58
|
+
def each
|
59
|
+
return to_enum(__method__) unless block_given?
|
60
|
+
|
61
|
+
open_data do |csv|
|
62
|
+
csv.each do |row|
|
63
|
+
record = Record.new(*row.fields)
|
64
|
+
yield(record)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def open_data
|
72
|
+
data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
|
73
|
+
data_path = cache_dir_path + "gian.csv"
|
74
|
+
download(data_path, data_url)
|
75
|
+
|
76
|
+
parser = JapaneseDateParser.new
|
77
|
+
japanese_date_converter = lambda do |field, info|
|
78
|
+
if info.header.end_with?("年月日")
|
79
|
+
parser.parse(field)
|
80
|
+
else
|
81
|
+
field
|
82
|
+
end
|
83
|
+
end
|
84
|
+
array_converter = lambda do |field, info|
|
85
|
+
case info.header
|
86
|
+
when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
|
87
|
+
parse_array(field)
|
88
|
+
else
|
89
|
+
field
|
90
|
+
end
|
91
|
+
end
|
92
|
+
File.open(data_path) do |data_file|
|
93
|
+
options = {
|
94
|
+
col_sep: ",",
|
95
|
+
headers: true,
|
96
|
+
converters: [:integer, japanese_date_converter, array_converter],
|
97
|
+
}
|
98
|
+
# There are two columns within one column. To split into two columns, `#gsub` is necessary.
|
99
|
+
yield(CSV.new(data_file.read.gsub("/", ","), **options))
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def parse_array(column_value)
|
104
|
+
column_value&.split("; ")
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Datasets
|
2
|
+
class JapaneseDateParser
|
3
|
+
class UnsupportedEraInitialRange < Error; end
|
4
|
+
|
5
|
+
ERA_INITIALS = {
|
6
|
+
"平成" => "H",
|
7
|
+
"令和" => "R",
|
8
|
+
}.freeze
|
9
|
+
|
10
|
+
def parse(string)
|
11
|
+
case string
|
12
|
+
when nil
|
13
|
+
nil
|
14
|
+
when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
|
15
|
+
match_data = Regexp.last_match
|
16
|
+
era_initial = ERA_INITIALS[match_data[1]]
|
17
|
+
if era_initial.nil?
|
18
|
+
message = +"era must be one of ["
|
19
|
+
message << ERA_INITIALS.keys.join(", ")
|
20
|
+
message << "]: #{match_data[1]}"
|
21
|
+
raise UnsupportedEraInitialRange, message
|
22
|
+
end
|
23
|
+
|
24
|
+
year = match_data[2]
|
25
|
+
if year == "元"
|
26
|
+
year = "01"
|
27
|
+
else
|
28
|
+
year = year.rjust(2, "0")
|
29
|
+
end
|
30
|
+
month = match_data[3].rjust(2, "0")
|
31
|
+
day = match_data[4].rjust(2, "0")
|
32
|
+
Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
|
33
|
+
else
|
34
|
+
string
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -2,9 +2,13 @@ require_relative 'mnist'
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class KuzushijiMNIST < MNIST
|
5
|
-
BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
|
6
|
-
|
7
5
|
private
|
6
|
+
def base_urls
|
7
|
+
[
|
8
|
+
"http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
|
9
|
+
]
|
10
|
+
end
|
11
|
+
|
8
12
|
def dataset_name
|
9
13
|
"Kuzushiji-MNIST"
|
10
14
|
end
|
data/lib/datasets/lazy.rb
CHANGED
@@ -57,6 +57,8 @@ module Datasets
|
|
57
57
|
LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
|
58
58
|
LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
|
59
59
|
LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
|
60
|
+
LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
|
61
|
+
LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
|
60
62
|
LAZY_LOADER.register(:Iris, "datasets/iris")
|
61
63
|
LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
|
62
64
|
LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
|
data/lib/datasets/mnist.rb
CHANGED
@@ -4,8 +4,6 @@ require_relative "dataset"
|
|
4
4
|
|
5
5
|
module Datasets
|
6
6
|
class MNIST < Dataset
|
7
|
-
BASE_URL = "http://yann.lecun.com/exdb/mnist/"
|
8
|
-
|
9
7
|
class Record < Struct.new(:data, :label)
|
10
8
|
def pixels
|
11
9
|
data.unpack("C*")
|
@@ -27,7 +25,7 @@ module Datasets
|
|
27
25
|
|
28
26
|
@metadata.id = "#{dataset_name.downcase}-#{type}"
|
29
27
|
@metadata.name = "#{dataset_name}: #{type}"
|
30
|
-
@metadata.url =
|
28
|
+
@metadata.url = base_urls.first
|
31
29
|
@metadata.licenses = licenses
|
32
30
|
@type = type
|
33
31
|
|
@@ -44,15 +42,23 @@ module Datasets
|
|
44
42
|
|
45
43
|
image_path = cache_dir_path + target_file(:image)
|
46
44
|
label_path = cache_dir_path + target_file(:label)
|
47
|
-
base_url = self.class::BASE_URL
|
48
45
|
|
49
|
-
download(image_path,
|
50
|
-
|
46
|
+
download(image_path,
|
47
|
+
*base_urls.collect { |base_url| base_url + target_file(:image) })
|
48
|
+
download(label_path,
|
49
|
+
*base_urls.collect { |base_url| base_url + target_file(:label) })
|
51
50
|
|
52
51
|
open_data(image_path, label_path, &block)
|
53
52
|
end
|
54
53
|
|
55
54
|
private
|
55
|
+
def base_urls
|
56
|
+
[
|
57
|
+
"http://yann.lecun.com/exdb/mnist/",
|
58
|
+
"https://ossci-datasets.s3.amazonaws.com/mnist/",
|
59
|
+
]
|
60
|
+
end
|
61
|
+
|
56
62
|
def licenses
|
57
63
|
[]
|
58
64
|
end
|
@@ -28,8 +28,8 @@ module Datasets
|
|
28
28
|
|
29
29
|
def initialize
|
30
30
|
super()
|
31
|
-
@metadata.id = 'nagoya-university-conversation-
|
32
|
-
@metadata.name = 'Nagoya University Conversation
|
31
|
+
@metadata.id = 'nagoya-university-conversation-corpus'
|
32
|
+
@metadata.name = 'Nagoya University Conversation Corpus'
|
33
33
|
@metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
|
34
34
|
@metadata.licenses = ['CC-BY-NC-ND-4.0']
|
35
35
|
@metadata.description = <<~DESCRIPTION
|
@@ -41,7 +41,7 @@ module Datasets
|
|
41
41
|
super()
|
42
42
|
@reading = reading
|
43
43
|
unless VALID_READINGS.include?(@reading)
|
44
|
-
message = ":reading must be one of ["
|
44
|
+
message = +":reading must be one of ["
|
45
45
|
message << VALID_READINGS.collect(&:inspect).join(", ")
|
46
46
|
message << "]: #{@reading.inspect}"
|
47
47
|
raise ArgumentError, message
|
@@ -104,14 +104,14 @@ module Datasets
|
|
104
104
|
|
105
105
|
private
|
106
106
|
def open_data
|
107
|
-
data_url = "https://www.post.japanpost.jp/zipcode/dl"
|
107
|
+
data_url = +"https://www.post.japanpost.jp/zipcode/dl"
|
108
108
|
case @reading
|
109
109
|
when :lowercase
|
110
110
|
data_url << "/kogaki/zip/ken_all.zip"
|
111
111
|
when :uppercase
|
112
112
|
data_url << "/oogaki/zip/ken_all.zip"
|
113
113
|
when :romaji
|
114
|
-
data_url << "/roman/
|
114
|
+
data_url << "/roman/KEN_ALL_ROME.zip"
|
115
115
|
end
|
116
116
|
data_path = cache_dir_path + "#{@reading}-ken-all.zip"
|
117
117
|
download(data_path, data_url)
|