red-datasets 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/Rakefile +10 -0
- data/doc/text/news.md +29 -0
- data/lib/datasets/california-housing.rb +1 -1
- data/lib/datasets/dataset.rb +2 -2
- data/lib/datasets/downloader.rb +34 -16
- data/lib/datasets/fashion-mnist.rb +6 -2
- data/lib/datasets/ggplot2-dataset.rb +3 -3
- data/lib/datasets/house-of-councillor.rb +169 -0
- data/lib/datasets/house-of-representative.rb +107 -0
- data/lib/datasets/japanese-date-parser.rb +38 -0
- data/lib/datasets/kuzushiji-mnist.rb +6 -2
- data/lib/datasets/lazy.rb +2 -0
- data/lib/datasets/libsvm-dataset-list.rb +1 -1
- data/lib/datasets/mnist.rb +12 -6
- data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
- data/lib/datasets/postal-code-japan.rb +3 -3
- data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
- data/lib/datasets/wikipedia.rb +2 -2
- data/test/japanese-date-parser-test.rb +27 -0
- data/test/test-adult.rb +36 -86
- data/test/test-aozora-bunko.rb +5 -5
- data/test/test-california-housing.rb +12 -31
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-diamonds.rb +13 -33
- data/test/test-downloader.rb +1 -1
- data/test/test-geolonia.rb +17 -41
- data/test/test-house-of-councillor.rb +223 -0
- data/test/test-house-of-representative.rb +54 -0
- data/test/test-nagoya-university-conversation-corpus.rb +17 -69
- data/test/test-postal-code-japan.rb +7 -0
- data/test/test-quora-duplicate-question-pair.rb +7 -21
- data/test/test-rdataset.rb +24 -22
- data/test/test-sudachi-synonym-dictionary.rb +12 -31
- data/test/test-wikipedia.rb +5 -5
- metadata +12 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: deff1c4d13294030c25c06691e272881eb049274decd9e2a958d0486c792ef26
|
4
|
+
data.tar.gz: e48151e045fbf343291a5f5770409dea7672ff39c75bcc617152b52a39890f31
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ae5a39a716bfa719f937c6ff139bd90ccf625e8a7ce72298507506f1a2c6a100e81c1273bf395a40d0ae8f1a13c1b1fb554e123e49dff718607a734ffd6c67b
|
7
|
+
data.tar.gz: 72e664ba5f2a569a92d9d4237588f92f7398621642486e4c9e6c930bade4c17e4a68d30da14cb277fcd602025f959c42284462bf4370872f9826c9634ff78aca
|
data/README.md
CHANGED
@@ -29,6 +29,8 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
29
29
|
* Fuel Economy Dataset
|
30
30
|
* Geolonia Japanese Addresses
|
31
31
|
* Hepatitis
|
32
|
+
* House of Councillors of Japan
|
33
|
+
* House of Representatives of Japan
|
32
34
|
* Iris Dataset
|
33
35
|
* Libsvm
|
34
36
|
* MNIST database
|
data/Rakefile
CHANGED
@@ -13,6 +13,16 @@ end
|
|
13
13
|
helper.install
|
14
14
|
spec = helper.gemspec
|
15
15
|
|
16
|
+
release_task = Rake.application["release"]
|
17
|
+
# We use Trusted Publishing.
|
18
|
+
release_task.prerequisites.delete("build")
|
19
|
+
release_task.prerequisites.delete("release:rubygem_push")
|
20
|
+
release_task_comment = release_task.comment
|
21
|
+
if release_task_comment
|
22
|
+
release_task.clear_comments
|
23
|
+
release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
|
24
|
+
end
|
25
|
+
|
16
26
|
task default: :test
|
17
27
|
|
18
28
|
desc "Run tests"
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,34 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.8 - 2025-02-07
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Suppressed "literal string will be frozen" warnings.
|
8
|
+
|
9
|
+
* Patch by Tsutomu Katsube
|
10
|
+
|
11
|
+
* `Datasets::HouseOfCouncillor`: Added.
|
12
|
+
|
13
|
+
* [GH-143](https://github.com/red-data-tools/red-datasets/issues/143)
|
14
|
+
|
15
|
+
* [GH-181](https://github.com/red-data-tools/red-datasets/issues/181)
|
16
|
+
|
17
|
+
* Patch by Tsutomu Katsube
|
18
|
+
|
19
|
+
* `Datasets::HouseOfRepresentative`: Added.
|
20
|
+
|
21
|
+
* [GH-142](https://github.com/red-data-tools/red-datasets/issues/142)
|
22
|
+
|
23
|
+
* [GH-184](https://github.com/red-data-tools/red-datasets/issues/184)
|
24
|
+
|
25
|
+
* Patch by Tsutomu Katsube
|
26
|
+
|
27
|
+
### Thanks
|
28
|
+
|
29
|
+
* Tsutomu Katsube
|
30
|
+
|
31
|
+
|
3
32
|
## 0.1.7 - 2023-05-29
|
4
33
|
|
5
34
|
### Improvements
|
@@ -36,7 +36,7 @@ Available from http://lib.stat.cmu.edu/datasets/.
|
|
36
36
|
file_name = "cadata.txt"
|
37
37
|
download(data_path, data_url)
|
38
38
|
open_data(data_path, file_name) do |input|
|
39
|
-
data = ""
|
39
|
+
data = +""
|
40
40
|
input.each_line do |line|
|
41
41
|
next unless line.start_with?(" ")
|
42
42
|
data << line.lstrip.gsub(/ +/, ",")
|
data/lib/datasets/dataset.rb
CHANGED
@@ -33,8 +33,8 @@ module Datasets
|
|
33
33
|
@cache_path ||= CachePath.new(@metadata.id)
|
34
34
|
end
|
35
35
|
|
36
|
-
def download(output_path, url, &block)
|
37
|
-
downloader = Downloader.new(url)
|
36
|
+
def download(output_path, url, *fallback_urls, &block)
|
37
|
+
downloader = Downloader.new(url, *fallback_urls)
|
38
38
|
downloader.download(output_path, &block)
|
39
39
|
end
|
40
40
|
|
data/lib/datasets/downloader.rb
CHANGED
@@ -6,20 +6,15 @@ end
|
|
6
6
|
require "net/http"
|
7
7
|
require "pathname"
|
8
8
|
|
9
|
+
require_relative "error"
|
10
|
+
|
9
11
|
module Datasets
|
10
12
|
class Downloader
|
11
|
-
class TooManyRedirects <
|
13
|
+
class TooManyRedirects < Error; end
|
12
14
|
|
13
|
-
def initialize(url)
|
14
|
-
|
15
|
-
|
16
|
-
else
|
17
|
-
url = URI.parse(url)
|
18
|
-
end
|
19
|
-
@url = url
|
20
|
-
unless @url.is_a?(URI::HTTP)
|
21
|
-
raise ArgumentError, "download URL must be HTTP or HTTPS: <#{@url}>"
|
22
|
-
end
|
15
|
+
def initialize(url, *fallback_urls)
|
16
|
+
@url = normalize_url(url)
|
17
|
+
@fallback_urls = fallback_urls.collect { |fallback_url| normalize_url(fallback_url) }
|
23
18
|
end
|
24
19
|
|
25
20
|
def download(output_path, &block)
|
@@ -45,7 +40,7 @@ module Datasets
|
|
45
40
|
headers["Range"] = "bytes=#{start}-"
|
46
41
|
end
|
47
42
|
|
48
|
-
start_http(@url, headers) do |response|
|
43
|
+
start_http(@url, @fallback_urls, headers) do |response|
|
49
44
|
if response.is_a?(Net::HTTPPartialContent)
|
50
45
|
mode = "ab"
|
51
46
|
else
|
@@ -85,6 +80,18 @@ module Datasets
|
|
85
80
|
end
|
86
81
|
end
|
87
82
|
|
83
|
+
private def normalize_url(url)
|
84
|
+
if url.is_a?(URI::Generic)
|
85
|
+
url = url.dup
|
86
|
+
else
|
87
|
+
url = URI.parse(url)
|
88
|
+
end
|
89
|
+
unless url.is_a?(URI::HTTP)
|
90
|
+
raise ArgumentError, "download URL must be HTTP or HTTPS: <#{url}>"
|
91
|
+
end
|
92
|
+
url
|
93
|
+
end
|
94
|
+
|
88
95
|
private def synchronize(output_path, partial_output_path)
|
89
96
|
begin
|
90
97
|
Process.getpgid(Process.pid)
|
@@ -104,7 +111,8 @@ module Datasets
|
|
104
111
|
rescue ArgumentError
|
105
112
|
# The process that acquired the lock will be exited before
|
106
113
|
# it stores its process ID.
|
107
|
-
|
114
|
+
elapsed_time = Time.now - lock_path.mtime
|
115
|
+
valid_lock_path = (elapsed_time > 10)
|
108
116
|
else
|
109
117
|
begin
|
110
118
|
Process.getpgid(pid)
|
@@ -133,7 +141,7 @@ module Datasets
|
|
133
141
|
end
|
134
142
|
end
|
135
143
|
|
136
|
-
private def start_http(url, headers, limit = 10, &block)
|
144
|
+
private def start_http(url, fallback_urls, headers, limit = 10, &block)
|
137
145
|
if limit == 0
|
138
146
|
raise TooManyRedirects, "too many redirections: #{url}"
|
139
147
|
end
|
@@ -151,8 +159,18 @@ module Datasets
|
|
151
159
|
when Net::HTTPRedirection
|
152
160
|
url = URI.parse(response[:location])
|
153
161
|
$stderr.puts "Redirect to #{url}"
|
154
|
-
return start_http(url, headers, limit - 1, &block)
|
162
|
+
return start_http(url, fallback_urls, headers, limit - 1, &block)
|
155
163
|
else
|
164
|
+
if response.is_a?(Net::HTTPForbidden)
|
165
|
+
next_url, *rest_fallback_urls = fallback_urls
|
166
|
+
if next_url
|
167
|
+
message = "#{response.code}: #{response.message}: " +
|
168
|
+
"fallback: <#{url}> -> <#{next_url}>"
|
169
|
+
$stderr.puts(message)
|
170
|
+
return start_http(next_url, rest_fallback_urls, headers, &block)
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
156
174
|
message = response.code
|
157
175
|
if response.message and not response.message.empty?
|
158
176
|
message += ": #{response.message}"
|
@@ -167,7 +185,7 @@ module Datasets
|
|
167
185
|
private def yield_chunks(path)
|
168
186
|
path.open("rb") do |output|
|
169
187
|
chunk_size = 1024 * 1024
|
170
|
-
chunk = ""
|
188
|
+
chunk = +""
|
171
189
|
while output.read(chunk_size, chunk)
|
172
190
|
yield(chunk)
|
173
191
|
end
|
@@ -2,9 +2,13 @@ require_relative 'mnist'
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class FashionMNIST < MNIST
|
5
|
-
BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"
|
6
|
-
|
7
5
|
private
|
6
|
+
def base_urls
|
7
|
+
[
|
8
|
+
"http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
|
9
|
+
]
|
10
|
+
end
|
11
|
+
|
8
12
|
def dataset_name
|
9
13
|
"Fashion-MNIST"
|
10
14
|
end
|
@@ -17,7 +17,7 @@ module Datasets
|
|
17
17
|
data_path = cache_dir_path + data_base_name
|
18
18
|
data_url = "#{download_base_url}/data-raw/#{data_base_name}"
|
19
19
|
download(data_path, data_url)
|
20
|
-
CSV.open(data_path, headers: :first_row, converters: :
|
20
|
+
CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
|
21
21
|
record_class = self.class::Record
|
22
22
|
csv.each do |row|
|
23
23
|
record = record_class.new(*row.fields)
|
@@ -37,7 +37,7 @@ module Datasets
|
|
37
37
|
data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
|
38
38
|
download(data_r_path, data_r_url)
|
39
39
|
descriptions = {}
|
40
|
-
comment = ""
|
40
|
+
comment = +""
|
41
41
|
File.open(data_r_path) do |data_r|
|
42
42
|
data_r.each_line do |line|
|
43
43
|
case line.chomp
|
@@ -51,7 +51,7 @@ module Datasets
|
|
51
51
|
when /\A"(.+)"\z/
|
52
52
|
name = Regexp.last_match[1]
|
53
53
|
descriptions[name] = parse_roxygen(comment.rstrip)
|
54
|
-
comment = ""
|
54
|
+
comment = +""
|
55
55
|
end
|
56
56
|
end
|
57
57
|
descriptions[@ggplot2_dataset_name]
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
class HouseOfCouncillor < Dataset
|
5
|
+
Bill = Struct.new(:council_time,
|
6
|
+
:bill_type,
|
7
|
+
:submit_time,
|
8
|
+
:submit_number,
|
9
|
+
:title,
|
10
|
+
:bill_url,
|
11
|
+
:bill_summary_url,
|
12
|
+
:proposed_bill_url,
|
13
|
+
:proposed_on,
|
14
|
+
:proposed_on_from_house_of_representatives,
|
15
|
+
:proposed_on_to_house_of_representatives,
|
16
|
+
:prior_deliberations_type,
|
17
|
+
:continuation_type,
|
18
|
+
:proposers,
|
19
|
+
:submitter,
|
20
|
+
:submitter_type,
|
21
|
+
:progress_of_house_of_councillors_committees_etc_refer_on,
|
22
|
+
:progress_of_house_of_councillors_committees_etc_committee_etc,
|
23
|
+
:progress_of_house_of_councillors_committees_etc_pass_on,
|
24
|
+
:progress_of_house_of_councillors_committees_etc_result,
|
25
|
+
:progress_of_house_of_councillors_plenary_sitting_pass_on,
|
26
|
+
:progress_of_house_of_councillors_plenary_sitting_result,
|
27
|
+
:progress_of_house_of_councillors_plenary_sitting_committees,
|
28
|
+
:progress_of_house_of_councillors_plenary_sitting_vote_type,
|
29
|
+
:progress_of_house_of_councillors_plenary_sitting_vote_method,
|
30
|
+
:progress_of_house_of_councillors_plenary_sitting_result_url,
|
31
|
+
:progress_of_house_of_representatives_committees_etc_refer_on,
|
32
|
+
:progress_of_house_of_representatives_committees_etc_committee_etc,
|
33
|
+
:progress_of_house_of_representatives_committees_etc_pass_on,
|
34
|
+
:progress_of_house_of_representatives_committees_etc_result,
|
35
|
+
:progress_of_house_of_representatives_plenary_sitting_pass_on,
|
36
|
+
:progress_of_house_of_representatives_plenary_sitting_result,
|
37
|
+
:progress_of_house_of_representatives_plenary_sitting_committees,
|
38
|
+
:progress_of_house_of_representatives_plenary_sitting_vote_type,
|
39
|
+
:progress_of_house_of_representatives_plenary_sitting_vote_method,
|
40
|
+
:promulgated_on,
|
41
|
+
:law_number,
|
42
|
+
:entracted_law_url,
|
43
|
+
:notes)
|
44
|
+
|
45
|
+
InHouseGroup = Struct.new(:in_house_group_name_and_abbreviation_on,
|
46
|
+
:in_house_group_name,
|
47
|
+
:in_house_group_abbreviation,
|
48
|
+
:number_of_members_on,
|
49
|
+
:number_of_members,
|
50
|
+
:number_of_women_members,
|
51
|
+
:first_term_expires_on,
|
52
|
+
:first_term_proportional_representation_number_of_members,
|
53
|
+
:first_term_proportional_representation_number_of_women_members,
|
54
|
+
:first_term_election_district_number_of_members,
|
55
|
+
:first_term_election_district_number_of_women_members,
|
56
|
+
:first_term_total_number_of_members,
|
57
|
+
:first_term_total_number_of_women_members,
|
58
|
+
:second_term_expires_on,
|
59
|
+
:second_term_proportional_representation_number_of_members,
|
60
|
+
:second_term_proportional_representation_number_of_women_members,
|
61
|
+
:second_term_election_district_number_of_members,
|
62
|
+
:second_term_election_district_number_of_women_members,
|
63
|
+
:second_term_total_number_of_members,
|
64
|
+
:second_term_total_number_of_women_members)
|
65
|
+
|
66
|
+
Member = Struct.new(:professional_name,
|
67
|
+
:true_name,
|
68
|
+
:profile_url,
|
69
|
+
:professional_name_reading,
|
70
|
+
:in_house_group_abbreviation,
|
71
|
+
:constituency,
|
72
|
+
:expiration_of_term,
|
73
|
+
:photo_url,
|
74
|
+
:elected_years,
|
75
|
+
:elected_number,
|
76
|
+
:responsibilities,
|
77
|
+
:responsibility_on,
|
78
|
+
:career,
|
79
|
+
:career_on)
|
80
|
+
|
81
|
+
Question = Struct.new(:submit_time,
|
82
|
+
:submit_number,
|
83
|
+
:title,
|
84
|
+
:submitter,
|
85
|
+
:number_of_submissions,
|
86
|
+
:question_for_text_html_url,
|
87
|
+
:answer_for_text_html_url,
|
88
|
+
:question_for_text_pdf_url,
|
89
|
+
:answer_for_text_pdf_url,
|
90
|
+
:question_url,
|
91
|
+
:submitted_on,
|
92
|
+
:transfered_on,
|
93
|
+
:received_answer_on,
|
94
|
+
:notes)
|
95
|
+
|
96
|
+
VALID_TYPES = [
|
97
|
+
:bill,
|
98
|
+
:in_house_group,
|
99
|
+
:member,
|
100
|
+
:question
|
101
|
+
]
|
102
|
+
|
103
|
+
def initialize(type: :bill)
|
104
|
+
super()
|
105
|
+
@type = type
|
106
|
+
unless VALID_TYPES.include?(type)
|
107
|
+
message = +":type must be one of ["
|
108
|
+
message << VALID_TYPES.collect(&:inspect).join(", ")
|
109
|
+
message << "]: #{@type.inspect}"
|
110
|
+
raise ArgumentError, message
|
111
|
+
end
|
112
|
+
|
113
|
+
@metadata.id = "house-of-councillor"
|
114
|
+
@metadata.name = "Bill, in-House group, member and question of the House of Councillors of Japan"
|
115
|
+
@metadata.url = "https://smartnews-smri.github.io/house-of-councillors"
|
116
|
+
@metadata.licenses = ["MIT"]
|
117
|
+
@metadata.description = "The House of Councillors of Japan (type: #{@type})"
|
118
|
+
end
|
119
|
+
|
120
|
+
def each
|
121
|
+
return to_enum(__method__) unless block_given?
|
122
|
+
|
123
|
+
open_data do |csv|
|
124
|
+
csv.each do |row|
|
125
|
+
case @type
|
126
|
+
when :bill
|
127
|
+
record = Bill.new(*row.fields)
|
128
|
+
when :in_house_group
|
129
|
+
record = InHouseGroup.new(*row.fields)
|
130
|
+
when :member
|
131
|
+
%w(当選年).each do |ints_column_name|
|
132
|
+
row[ints_column_name] = parse_ints(row[ints_column_name])
|
133
|
+
end
|
134
|
+
record = Member.new(*row.fields)
|
135
|
+
when :question
|
136
|
+
record = Question.new(*row.fields)
|
137
|
+
end
|
138
|
+
yield(record)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
private
|
144
|
+
|
145
|
+
def open_data
|
146
|
+
data_url = +"https://smartnews-smri.github.io/house-of-councillors/data"
|
147
|
+
case @type
|
148
|
+
when :bill
|
149
|
+
data_url << "/gian.csv"
|
150
|
+
when :in_house_group
|
151
|
+
data_url << "/kaiha.csv"
|
152
|
+
when :member
|
153
|
+
data_url << "/giin.csv"
|
154
|
+
when :question
|
155
|
+
data_url << "/syuisyo.csv"
|
156
|
+
end
|
157
|
+
data_path = cache_dir_path + "#{@type}.csv"
|
158
|
+
download(data_path, data_url)
|
159
|
+
|
160
|
+
CSV.open(data_path, col_sep: ",", headers: true, converters: %i(date integer)) do |csv|
|
161
|
+
yield(csv)
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def parse_ints(column_value)
|
166
|
+
column_value.to_s.split("、").collect(&:to_i)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "japanese-date-parser"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class HouseOfRepresentative < Dataset
|
6
|
+
Record = Struct.new(:carry_time,
|
7
|
+
:caption,
|
8
|
+
:type,
|
9
|
+
:submit_time,
|
10
|
+
:submit_number,
|
11
|
+
:title,
|
12
|
+
:discussion_status,
|
13
|
+
:progress,
|
14
|
+
:progress_url,
|
15
|
+
:text,
|
16
|
+
:text_url,
|
17
|
+
:bill_type,
|
18
|
+
:submitter,
|
19
|
+
:submitter_in_house_groups,
|
20
|
+
:house_of_representatives_of_accepted_bill_on_preliminary_consideration,
|
21
|
+
:house_of_representatives_of_preliminary_refer_on,
|
22
|
+
:house_of_representatives_of_preliminary_refer_commission,
|
23
|
+
:house_of_representatives_of_accepted_bill_on,
|
24
|
+
:house_of_representatives_of_refer_on,
|
25
|
+
:house_of_representatives_of_refer_commission,
|
26
|
+
:house_of_representatives_of_finished_consideration_on,
|
27
|
+
:house_of_representatives_of_consideration_result,
|
28
|
+
:house_of_representatives_of_finished_deliberation_on,
|
29
|
+
:house_of_representatives_of_deliberation_result,
|
30
|
+
:house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
|
31
|
+
:house_of_representatives_of_support_in_house_group_during_deliberation,
|
32
|
+
:house_of_representatives_of_opposition_in_house_group_during_deliberation,
|
33
|
+
:house_of_councillors_of_accepted_bill_on_preliminary_consideration,
|
34
|
+
:house_of_councillors_of_preliminary_refer_on,
|
35
|
+
:house_of_councillors_of_preliminary_refer_commission,
|
36
|
+
:house_of_councillors_of_accepted_bill_on,
|
37
|
+
:house_of_councillors_of_refer_on,
|
38
|
+
:house_of_councillors_of_refer_commission,
|
39
|
+
:house_of_councillors_of_finished_consideration_on,
|
40
|
+
:house_of_councillors_of_consideration_result,
|
41
|
+
:house_of_councillors_of_finished_deliberation_on,
|
42
|
+
:house_of_councillors_of_deliberation_result,
|
43
|
+
:promulgated_on,
|
44
|
+
:law_number,
|
45
|
+
:submitters,
|
46
|
+
:supporters_of_submitted_bill)
|
47
|
+
|
48
|
+
def initialize
|
49
|
+
super()
|
50
|
+
|
51
|
+
@metadata.id = "house-of-representative"
|
52
|
+
@metadata.name = "Bill of the House of Representatives of Japan"
|
53
|
+
@metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
|
54
|
+
@metadata.licenses = ["MIT"]
|
55
|
+
@metadata.description = "Bill of the House of Representatives of Japan"
|
56
|
+
end
|
57
|
+
|
58
|
+
def each
|
59
|
+
return to_enum(__method__) unless block_given?
|
60
|
+
|
61
|
+
open_data do |csv|
|
62
|
+
csv.each do |row|
|
63
|
+
record = Record.new(*row.fields)
|
64
|
+
yield(record)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def open_data
|
72
|
+
data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
|
73
|
+
data_path = cache_dir_path + "gian.csv"
|
74
|
+
download(data_path, data_url)
|
75
|
+
|
76
|
+
parser = JapaneseDateParser.new
|
77
|
+
japanese_date_converter = lambda do |field, info|
|
78
|
+
if info.header.end_with?("年月日")
|
79
|
+
parser.parse(field)
|
80
|
+
else
|
81
|
+
field
|
82
|
+
end
|
83
|
+
end
|
84
|
+
array_converter = lambda do |field, info|
|
85
|
+
case info.header
|
86
|
+
when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
|
87
|
+
parse_array(field)
|
88
|
+
else
|
89
|
+
field
|
90
|
+
end
|
91
|
+
end
|
92
|
+
File.open(data_path) do |data_file|
|
93
|
+
options = {
|
94
|
+
col_sep: ",",
|
95
|
+
headers: true,
|
96
|
+
converters: [:integer, japanese_date_converter, array_converter],
|
97
|
+
}
|
98
|
+
# There are two columns within one column. To split into two columns, `#gsub` is necessary.
|
99
|
+
yield(CSV.new(data_file.read.gsub("/", ","), **options))
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def parse_array(column_value)
|
104
|
+
column_value&.split("; ")
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Datasets
|
2
|
+
class JapaneseDateParser
|
3
|
+
class UnsupportedEraInitialRange < Error; end
|
4
|
+
|
5
|
+
ERA_INITIALS = {
|
6
|
+
"平成" => "H",
|
7
|
+
"令和" => "R",
|
8
|
+
}.freeze
|
9
|
+
|
10
|
+
def parse(string)
|
11
|
+
case string
|
12
|
+
when nil
|
13
|
+
nil
|
14
|
+
when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
|
15
|
+
match_data = Regexp.last_match
|
16
|
+
era_initial = ERA_INITIALS[match_data[1]]
|
17
|
+
if era_initial.nil?
|
18
|
+
message = +"era must be one of ["
|
19
|
+
message << ERA_INITIALS.keys.join(", ")
|
20
|
+
message << "]: #{match_data[1]}"
|
21
|
+
raise UnsupportedEraInitialRange, message
|
22
|
+
end
|
23
|
+
|
24
|
+
year = match_data[2]
|
25
|
+
if year == "元"
|
26
|
+
year = "01"
|
27
|
+
else
|
28
|
+
year = year.rjust(2, "0")
|
29
|
+
end
|
30
|
+
month = match_data[3].rjust(2, "0")
|
31
|
+
day = match_data[4].rjust(2, "0")
|
32
|
+
Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
|
33
|
+
else
|
34
|
+
string
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -2,9 +2,13 @@ require_relative 'mnist'
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class KuzushijiMNIST < MNIST
|
5
|
-
BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
|
6
|
-
|
7
5
|
private
|
6
|
+
def base_urls
|
7
|
+
[
|
8
|
+
"http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
|
9
|
+
]
|
10
|
+
end
|
11
|
+
|
8
12
|
def dataset_name
|
9
13
|
"Kuzushiji-MNIST"
|
10
14
|
end
|
data/lib/datasets/lazy.rb
CHANGED
@@ -57,6 +57,8 @@ module Datasets
|
|
57
57
|
LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
|
58
58
|
LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
|
59
59
|
LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
|
60
|
+
LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
|
61
|
+
LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
|
60
62
|
LAZY_LOADER.register(:Iris, "datasets/iris")
|
61
63
|
LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
|
62
64
|
LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
|
data/lib/datasets/mnist.rb
CHANGED
@@ -4,8 +4,6 @@ require_relative "dataset"
|
|
4
4
|
|
5
5
|
module Datasets
|
6
6
|
class MNIST < Dataset
|
7
|
-
BASE_URL = "http://yann.lecun.com/exdb/mnist/"
|
8
|
-
|
9
7
|
class Record < Struct.new(:data, :label)
|
10
8
|
def pixels
|
11
9
|
data.unpack("C*")
|
@@ -27,7 +25,7 @@ module Datasets
|
|
27
25
|
|
28
26
|
@metadata.id = "#{dataset_name.downcase}-#{type}"
|
29
27
|
@metadata.name = "#{dataset_name}: #{type}"
|
30
|
-
@metadata.url =
|
28
|
+
@metadata.url = base_urls.first
|
31
29
|
@metadata.licenses = licenses
|
32
30
|
@type = type
|
33
31
|
|
@@ -44,15 +42,23 @@ module Datasets
|
|
44
42
|
|
45
43
|
image_path = cache_dir_path + target_file(:image)
|
46
44
|
label_path = cache_dir_path + target_file(:label)
|
47
|
-
base_url = self.class::BASE_URL
|
48
45
|
|
49
|
-
download(image_path,
|
50
|
-
|
46
|
+
download(image_path,
|
47
|
+
*base_urls.collect { |base_url| base_url + target_file(:image) })
|
48
|
+
download(label_path,
|
49
|
+
*base_urls.collect { |base_url| base_url + target_file(:label) })
|
51
50
|
|
52
51
|
open_data(image_path, label_path, &block)
|
53
52
|
end
|
54
53
|
|
55
54
|
private
|
55
|
+
def base_urls
|
56
|
+
[
|
57
|
+
"http://yann.lecun.com/exdb/mnist/",
|
58
|
+
"https://ossci-datasets.s3.amazonaws.com/mnist/",
|
59
|
+
]
|
60
|
+
end
|
61
|
+
|
56
62
|
def licenses
|
57
63
|
[]
|
58
64
|
end
|
@@ -28,8 +28,8 @@ module Datasets
|
|
28
28
|
|
29
29
|
def initialize
|
30
30
|
super()
|
31
|
-
@metadata.id = 'nagoya-university-conversation-
|
32
|
-
@metadata.name = 'Nagoya University Conversation
|
31
|
+
@metadata.id = 'nagoya-university-conversation-corpus'
|
32
|
+
@metadata.name = 'Nagoya University Conversation Corpus'
|
33
33
|
@metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
|
34
34
|
@metadata.licenses = ['CC-BY-NC-ND-4.0']
|
35
35
|
@metadata.description = <<~DESCRIPTION
|
@@ -41,7 +41,7 @@ module Datasets
|
|
41
41
|
super()
|
42
42
|
@reading = reading
|
43
43
|
unless VALID_READINGS.include?(@reading)
|
44
|
-
message = ":reading must be one of ["
|
44
|
+
message = +":reading must be one of ["
|
45
45
|
message << VALID_READINGS.collect(&:inspect).join(", ")
|
46
46
|
message << "]: #{@reading.inspect}"
|
47
47
|
raise ArgumentError, message
|
@@ -104,14 +104,14 @@ module Datasets
|
|
104
104
|
|
105
105
|
private
|
106
106
|
def open_data
|
107
|
-
data_url = "https://www.post.japanpost.jp/zipcode/dl"
|
107
|
+
data_url = +"https://www.post.japanpost.jp/zipcode/dl"
|
108
108
|
case @reading
|
109
109
|
when :lowercase
|
110
110
|
data_url << "/kogaki/zip/ken_all.zip"
|
111
111
|
when :uppercase
|
112
112
|
data_url << "/oogaki/zip/ken_all.zip"
|
113
113
|
when :romaji
|
114
|
-
data_url << "/roman/
|
114
|
+
data_url << "/roman/KEN_ALL_ROME.zip"
|
115
115
|
end
|
116
116
|
data_path = cache_dir_path + "#{@reading}-ken-all.zip"
|
117
117
|
download(data_path, data_url)
|