red-datasets 0.1.6 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/Rakefile +10 -0
- data/doc/text/news.md +36 -0
- data/lib/datasets/california-housing.rb +1 -1
- data/lib/datasets/dataset.rb +2 -2
- data/lib/datasets/downloader.rb +34 -16
- data/lib/datasets/fashion-mnist.rb +6 -2
- data/lib/datasets/ggplot2-dataset.rb +3 -3
- data/lib/datasets/house-of-councillor.rb +169 -0
- data/lib/datasets/house-of-representative.rb +107 -0
- data/lib/datasets/japanese-date-parser.rb +38 -0
- data/lib/datasets/kuzushiji-mnist.rb +6 -2
- data/lib/datasets/lazy.rb +2 -0
- data/lib/datasets/libsvm-dataset-list.rb +1 -1
- data/lib/datasets/mnist.rb +12 -6
- data/lib/datasets/nagoya-university-conversation-corpus.rb +6 -6
- data/lib/datasets/postal-code-japan.rb +3 -3
- data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
- data/lib/datasets/wikipedia.rb +2 -2
- data/test/japanese-date-parser-test.rb +27 -0
- data/test/test-adult.rb +36 -86
- data/test/test-aozora-bunko.rb +5 -5
- data/test/test-california-housing.rb +12 -31
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-diamonds.rb +13 -33
- data/test/test-downloader.rb +1 -1
- data/test/test-geolonia.rb +17 -41
- data/test/test-house-of-councillor.rb +223 -0
- data/test/test-house-of-representative.rb +54 -0
- data/test/test-nagoya-university-conversation-corpus.rb +17 -69
- data/test/test-postal-code-japan.rb +7 -0
- data/test/test-quora-duplicate-question-pair.rb +7 -21
- data/test/test-rdataset.rb +24 -22
- data/test/test-sudachi-synonym-dictionary.rb +12 -31
- data/test/test-wikipedia.rb +5 -5
- metadata +12 -6
@@ -41,7 +41,7 @@ module Datasets
|
|
41
41
|
super()
|
42
42
|
@reading = reading
|
43
43
|
unless VALID_READINGS.include?(@reading)
|
44
|
-
message = ":reading must be one of ["
|
44
|
+
message = +":reading must be one of ["
|
45
45
|
message << VALID_READINGS.collect(&:inspect).join(", ")
|
46
46
|
message << "]: #{@reading.inspect}"
|
47
47
|
raise ArgumentError, message
|
@@ -104,14 +104,14 @@ module Datasets
|
|
104
104
|
|
105
105
|
private
|
106
106
|
def open_data
|
107
|
-
data_url = "https://www.post.japanpost.jp/zipcode/dl"
|
107
|
+
data_url = +"https://www.post.japanpost.jp/zipcode/dl"
|
108
108
|
case @reading
|
109
109
|
when :lowercase
|
110
110
|
data_url << "/kogaki/zip/ken_all.zip"
|
111
111
|
when :uppercase
|
112
112
|
data_url << "/oogaki/zip/ken_all.zip"
|
113
113
|
when :romaji
|
114
|
-
data_url << "/roman/
|
114
|
+
data_url << "/roman/KEN_ALL_ROME.zip"
|
115
115
|
end
|
116
116
|
data_path = cache_dir_path + "#{@reading}-ken-all.zip"
|
117
117
|
download(data_path, data_url)
|
@@ -43,7 +43,7 @@ module Datasets
|
|
43
43
|
data_path = cache_dir_path + "quora_duplicate_questions.tsv"
|
44
44
|
data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
|
45
45
|
download(data_path, data_url)
|
46
|
-
CSV.open(data_path, col_sep: "\t", headers: true, converters: :
|
46
|
+
CSV.open(data_path, col_sep: "\t", headers: true, converters: :integer) do |csv|
|
47
47
|
yield(csv)
|
48
48
|
end
|
49
49
|
end
|
data/lib/datasets/version.rb
CHANGED
@@ -126,7 +126,7 @@ articles (related to Kyoto) into English.
|
|
126
126
|
@sentence = nil
|
127
127
|
@text_container_stack = []
|
128
128
|
@element_stack = []
|
129
|
-
@text_stack = [""]
|
129
|
+
@text_stack = [+""]
|
130
130
|
end
|
131
131
|
|
132
132
|
def tag_start(name, attributes)
|
@@ -207,7 +207,7 @@ articles (related to Kyoto) into English.
|
|
207
207
|
private
|
208
208
|
def push_stacks(name, attributes)
|
209
209
|
@element_stack.push({name: name, attributes: attributes})
|
210
|
-
@text_stack.push("")
|
210
|
+
@text_stack.push(+"")
|
211
211
|
end
|
212
212
|
|
213
213
|
def pop_stacks
|
data/lib/datasets/wikipedia.rb
CHANGED
@@ -90,7 +90,7 @@ module Datasets
|
|
90
90
|
@contributor = nil
|
91
91
|
@current_tag = nil
|
92
92
|
@tag_stack = []
|
93
|
-
@text_stack = [""]
|
93
|
+
@text_stack = [+""]
|
94
94
|
@first_page = true
|
95
95
|
end
|
96
96
|
|
@@ -172,7 +172,7 @@ module Datasets
|
|
172
172
|
|
173
173
|
def push_stacks(tag)
|
174
174
|
@tag_stack << tag
|
175
|
-
@text_stack << ""
|
175
|
+
@text_stack << +""
|
176
176
|
end
|
177
177
|
|
178
178
|
def pop_stacks
|
@@ -0,0 +1,27 @@
|
|
1
|
+
class JapaneseDateParserTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@parser = Datasets::JapaneseDateParser.new
|
4
|
+
end
|
5
|
+
|
6
|
+
data("month and day with leading a space in Heisei", ["H10.01.01", "平成10年 1月 1日"])
|
7
|
+
data("month with leading a space in Heisei", ["H10.01.10", "平成10年 1月10日"])
|
8
|
+
data(" day with leading a space in Heisei", ["H10.10.01", "平成10年10月 1日"])
|
9
|
+
data(" without leading a space in Heisei", ["H10.10.10", "平成10年10月10日"])
|
10
|
+
data("year, month and day with leading a space in Reiwa", ["R02.01.01", "令和 2年 1月 1日"])
|
11
|
+
data("year, month with leading a space in Reiwa", ["R02.01.10", "令和 2年 1月10日"])
|
12
|
+
data("year, day with leading a space in Reiwa", ["R02.10.01", "令和 2年10月 1日"])
|
13
|
+
data("year, without leading a space in Reiwa", ["R02.10.10", "令和 2年10月10日"])
|
14
|
+
data("boundary within Heisei", ["H31.04.30", "平成31年 4月30日"])
|
15
|
+
data("boundary within Reiwa", ["R01.05.01", "令和元年 5月 1日"])
|
16
|
+
test("#parse") do
|
17
|
+
expected_jisx0301, japanese_date_string = data
|
18
|
+
assert_equal(expected_jisx0301, @parser.parse(japanese_date_string).jisx0301)
|
19
|
+
end
|
20
|
+
|
21
|
+
test("unsupported era initial range") do
|
22
|
+
expected_message = "era must be one of [平成, 令和]: 昭和"
|
23
|
+
assert_raise(Datasets::JapaneseDateParser::UnsupportedEraInitialRange.new(expected_message)) do
|
24
|
+
@parser.parse("昭和元年 1月 1日")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/test/test-adult.rb
CHANGED
@@ -9,49 +9,24 @@ class AdultTest < Test::Unit::TestCase
|
|
9
9
|
end
|
10
10
|
|
11
11
|
test("#each") do
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
:label => "<=50K"
|
31
|
-
},
|
32
|
-
{
|
33
|
-
:age => 52,
|
34
|
-
:work_class => "Self-emp-inc",
|
35
|
-
:final_weight => 287927,
|
36
|
-
:education => "HS-grad",
|
37
|
-
:n_education_years => 9,
|
38
|
-
:marital_status => "Married-civ-spouse",
|
39
|
-
:occupation => "Exec-managerial",
|
40
|
-
:relationship => "Wife",
|
41
|
-
:race => "White",
|
42
|
-
:sex => "Female",
|
43
|
-
:capital_gain => 15024,
|
44
|
-
:capital_loss => 0,
|
45
|
-
:hours_per_week => 40,
|
46
|
-
:native_country => "United-States",
|
47
|
-
:label => ">50K"
|
48
|
-
}
|
49
|
-
],
|
50
|
-
[
|
51
|
-
records.size,
|
52
|
-
records[0].to_h,
|
53
|
-
records[-1].to_h
|
54
|
-
])
|
12
|
+
assert_equal({
|
13
|
+
:age => 39,
|
14
|
+
:work_class => "State-gov",
|
15
|
+
:final_weight => 77516,
|
16
|
+
:education => "Bachelors",
|
17
|
+
:n_education_years => 13,
|
18
|
+
:marital_status => "Never-married",
|
19
|
+
:occupation => "Adm-clerical",
|
20
|
+
:relationship => "Not-in-family",
|
21
|
+
:race => "White",
|
22
|
+
:sex => "Male",
|
23
|
+
:capital_gain => 2174,
|
24
|
+
:capital_loss => 0,
|
25
|
+
:hours_per_week => 40,
|
26
|
+
:native_country => "United-States",
|
27
|
+
:label => "<=50K"
|
28
|
+
},
|
29
|
+
@dataset.each.next.to_h)
|
55
30
|
end
|
56
31
|
end
|
57
32
|
|
@@ -65,49 +40,24 @@ class AdultTest < Test::Unit::TestCase
|
|
65
40
|
end
|
66
41
|
|
67
42
|
test("#each") do
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
:label => "<=50K."
|
87
|
-
},
|
88
|
-
{
|
89
|
-
:age => 35,
|
90
|
-
:work_class => "Self-emp-inc",
|
91
|
-
:final_weight => 182148,
|
92
|
-
:education => "Bachelors",
|
93
|
-
:n_education_years => 13,
|
94
|
-
:marital_status => "Married-civ-spouse",
|
95
|
-
:occupation => "Exec-managerial",
|
96
|
-
:relationship => "Husband",
|
97
|
-
:race => "White",
|
98
|
-
:sex => "Male",
|
99
|
-
:capital_gain => 0,
|
100
|
-
:capital_loss => 0,
|
101
|
-
:hours_per_week => 60,
|
102
|
-
:native_country => "United-States",
|
103
|
-
:label => ">50K."
|
104
|
-
}
|
105
|
-
],
|
106
|
-
[
|
107
|
-
records.size,
|
108
|
-
records[0].to_h,
|
109
|
-
records[-1].to_h
|
110
|
-
])
|
43
|
+
assert_equal({
|
44
|
+
:age => 25,
|
45
|
+
:work_class => "Private",
|
46
|
+
:final_weight => 226802,
|
47
|
+
:education => "11th",
|
48
|
+
:n_education_years => 7,
|
49
|
+
:marital_status => "Never-married",
|
50
|
+
:occupation => "Machine-op-inspct",
|
51
|
+
:relationship => "Own-child",
|
52
|
+
:race => "Black",
|
53
|
+
:sex => "Male",
|
54
|
+
:capital_gain => 0,
|
55
|
+
:capital_loss => 0,
|
56
|
+
:hours_per_week => 40,
|
57
|
+
:native_country => "United-States",
|
58
|
+
:label => "<=50K.",
|
59
|
+
},
|
60
|
+
@dataset.each.next.to_h)
|
111
61
|
end
|
112
62
|
end
|
113
63
|
|
data/test/test-aozora-bunko.rb
CHANGED
@@ -114,13 +114,13 @@ class AozoraBunkoTest < Test::Unit::TestCase
|
|
114
114
|
book = Datasets::AozoraBunko::Book.new
|
115
115
|
book.cache_path = @cache_path
|
116
116
|
|
117
|
-
book.title_id = '
|
118
|
-
book.person_id = '
|
119
|
-
book.html_file_url = 'http://
|
117
|
+
book.title_id = '061551'
|
118
|
+
book.person_id = '002225'
|
119
|
+
book.html_file_url = 'http://minken.party/2019/03/09/yakunin/'
|
120
120
|
book.html_file_character_encoding = 'UTF-8'
|
121
121
|
|
122
|
-
assert_equal('<title
|
123
|
-
book.html
|
122
|
+
assert_equal('<title>自由民権現代研究会</title>',
|
123
|
+
book.html[/<title>.*?<\/title>/])
|
124
124
|
end
|
125
125
|
end
|
126
126
|
|
@@ -8,37 +8,18 @@ class CaliforniaHousingTest < Test::Unit::TestCase
|
|
8
8
|
end
|
9
9
|
|
10
10
|
test("#each") do
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
longitude: -122.230000
|
24
|
-
},
|
25
|
-
{
|
26
|
-
median_house_value: 89400.000000,
|
27
|
-
median_income: 2.388600,
|
28
|
-
housing_median_age: 16.000000,
|
29
|
-
total_rooms: 2785.000000,
|
30
|
-
total_bedrooms: 616.000000,
|
31
|
-
population: 1387.000000,
|
32
|
-
households: 530.000000,
|
33
|
-
latitude: 39.370000,
|
34
|
-
longitude: -121.240000
|
35
|
-
},
|
36
|
-
],
|
37
|
-
[
|
38
|
-
records.size,
|
39
|
-
records[0].to_h,
|
40
|
-
records[-1].to_h
|
41
|
-
])
|
11
|
+
assert_equal({
|
12
|
+
median_house_value: 452600.000000,
|
13
|
+
median_income: 8.325200,
|
14
|
+
housing_median_age: 41.000000,
|
15
|
+
total_rooms: 880.000000,
|
16
|
+
total_bedrooms: 129.000000,
|
17
|
+
population: 322.000000,
|
18
|
+
households: 126.000000,
|
19
|
+
latitude: 37.880000,
|
20
|
+
longitude: -122.230000,
|
21
|
+
},
|
22
|
+
@dataset.each.next.to_h)
|
42
23
|
end
|
43
24
|
|
44
25
|
sub_test_case("#metadata") do
|
data/test/test-cldr-plurals.rb
CHANGED
data/test/test-diamonds.rb
CHANGED
@@ -8,39 +8,19 @@ class DiamondsTest < Test::Unit::TestCase
|
|
8
8
|
end
|
9
9
|
|
10
10
|
test("#each") do
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
z: 2.43,
|
25
|
-
},
|
26
|
-
{
|
27
|
-
carat: 0.75,
|
28
|
-
clarity: "SI2",
|
29
|
-
color: "D",
|
30
|
-
cut: "Ideal",
|
31
|
-
depth: 62.2,
|
32
|
-
price: 2757,
|
33
|
-
table: 55.0,
|
34
|
-
x: 5.83,
|
35
|
-
y: 5.87,
|
36
|
-
z: 3.64,
|
37
|
-
},
|
38
|
-
],
|
39
|
-
[
|
40
|
-
records.size,
|
41
|
-
records[0].to_h,
|
42
|
-
records[-1].to_h
|
43
|
-
])
|
11
|
+
assert_equal({
|
12
|
+
carat: 0.23,
|
13
|
+
clarity: "SI2",
|
14
|
+
color: "E",
|
15
|
+
cut: "Ideal",
|
16
|
+
depth: 61.5,
|
17
|
+
price: 326,
|
18
|
+
table: 55.0,
|
19
|
+
x: 3.95,
|
20
|
+
y: 3.98,
|
21
|
+
z: 2.43,
|
22
|
+
},
|
23
|
+
@dataset.each.next.to_h)
|
44
24
|
end
|
45
25
|
|
46
26
|
sub_test_case("#metadata") do
|
data/test/test-downloader.rb
CHANGED
@@ -17,7 +17,7 @@ class DownloaderTest < Test::Unit::TestCase
|
|
17
17
|
output_path = @tmp_dir + "file"
|
18
18
|
downloader = Datasets::Downloader.new(first_url)
|
19
19
|
|
20
|
-
downloader.define_singleton_method(:start_http) do |url, headers|
|
20
|
+
downloader.define_singleton_method(:start_http) do |url, fallback_urls, headers|
|
21
21
|
raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
|
22
22
|
end
|
23
23
|
|
data/test/test-geolonia.rb
CHANGED
@@ -4,47 +4,23 @@ class GeoloniaTest < Test::Unit::TestCase
|
|
4
4
|
end
|
5
5
|
|
6
6
|
test('#each') do
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
:longitude => "141.319722"
|
25
|
-
},
|
26
|
-
{
|
27
|
-
:prefecture_code => "47",
|
28
|
-
:prefecture_name => "沖縄県",
|
29
|
-
:prefecture_kana => "オキナワケン",
|
30
|
-
:prefecture_romaji => "OKINAWA KEN",
|
31
|
-
:municipality_code => "47382",
|
32
|
-
:municipality_name => "八重山郡与那国町",
|
33
|
-
:municipality_kana => "ヤエヤマグンヨナグニチョウ",
|
34
|
-
:municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
|
35
|
-
:street_name => "字与那国",
|
36
|
-
:street_kana => nil,
|
37
|
-
:street_romaji => nil,
|
38
|
-
:alias => nil,
|
39
|
-
:latitude => "24.455925",
|
40
|
-
:longitude => "122.987678",
|
41
|
-
},
|
42
|
-
],
|
43
|
-
[
|
44
|
-
records.size,
|
45
|
-
records[0].to_h,
|
46
|
-
records[-1].to_h,
|
47
|
-
])
|
7
|
+
assert_equal({
|
8
|
+
:prefecture_code => "01",
|
9
|
+
:prefecture_name => "北海道",
|
10
|
+
:prefecture_kana => "ホッカイドウ",
|
11
|
+
:prefecture_romaji => "HOKKAIDO",
|
12
|
+
:municipality_code => "01101",
|
13
|
+
:municipality_name => "札幌市中央区",
|
14
|
+
:municipality_kana => "サッポロシチュウオウク",
|
15
|
+
:municipality_romaji => "SAPPORO SHI CHUO KU",
|
16
|
+
:street_name => "旭ケ丘一丁目",
|
17
|
+
:street_kana => "アサヒガオカ 1",
|
18
|
+
:street_romaji => "ASAHIGAOKA 1",
|
19
|
+
:alias => nil,
|
20
|
+
:latitude => "43.04223",
|
21
|
+
:longitude => "141.319722",
|
22
|
+
},
|
23
|
+
@dataset.each.next.to_h)
|
48
24
|
end
|
49
25
|
|
50
26
|
sub_test_case("#metadata") do
|