red-datasets 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
data/test/test-afinn.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
class AFINNTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::AFINN.new
|
4
|
+
end
|
5
|
+
|
6
|
+
test('#each') do
|
7
|
+
records = @dataset.each.to_a
|
8
|
+
assert_equal([
|
9
|
+
2477,
|
10
|
+
{
|
11
|
+
:valence => -2,
|
12
|
+
:word => "abandon"
|
13
|
+
},
|
14
|
+
{
|
15
|
+
:valence => 2,
|
16
|
+
:word => "zealous"
|
17
|
+
},
|
18
|
+
],
|
19
|
+
[
|
20
|
+
records.size,
|
21
|
+
records[0].to_h,
|
22
|
+
records[-1].to_h,
|
23
|
+
])
|
24
|
+
end
|
25
|
+
|
26
|
+
sub_test_case('#metadata') do
|
27
|
+
test('#description') do
|
28
|
+
description = @dataset.metadata.description
|
29
|
+
assert_equal(<<-DESCRIPTION.chomp, description)
|
30
|
+
AFINN is a list of English words rated for valence with an integer
|
31
|
+
between minus five (negative) and plus five (positive). The words have
|
32
|
+
been manually labeled by Finn Årup Nielsen in 2009-2011. The file
|
33
|
+
is tab-separated. There are two versions:
|
34
|
+
|
35
|
+
AFINN-111: Newest version with 2477 words and phrases.
|
36
|
+
|
37
|
+
An evaluation of the word list is available in:
|
38
|
+
|
39
|
+
Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
|
40
|
+
sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
|
41
|
+
|
42
|
+
The list was used in:
|
43
|
+
|
44
|
+
Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
|
45
|
+
Michael Etter, "Good Friends, Bad News - Affect and Virality in
|
46
|
+
Twitter", The 2011 International Workshop on Social Computing,
|
47
|
+
Network, and Services (SocialComNet 2011).
|
48
|
+
|
49
|
+
|
50
|
+
This database of words is copyright protected and distributed under
|
51
|
+
"Open Database License (ODbL) v1.0"
|
52
|
+
http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
|
53
|
+
copyleft license.
|
54
|
+
|
55
|
+
See comments on the word list here:
|
56
|
+
http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
|
57
|
+
DESCRIPTION
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,190 @@
|
|
1
|
+
class AozoraBunkoTest < Test::Unit::TestCase
|
2
|
+
include Helper::PathRestorable
|
3
|
+
|
4
|
+
def setup
|
5
|
+
@dataset = Datasets::AozoraBunko.new
|
6
|
+
@cache_path = @dataset.send(:cache_path)
|
7
|
+
end
|
8
|
+
|
9
|
+
test('#new') do
|
10
|
+
assert_equal({
|
11
|
+
title_id: '059898',
|
12
|
+
title: 'ウェストミンスター寺院',
|
13
|
+
title_reading: 'ウェストミンスターじいん',
|
14
|
+
title_reading_collation: 'うえすとみんすたあしいん',
|
15
|
+
subtitle: '',
|
16
|
+
subtitle_reading: '',
|
17
|
+
original_title: '',
|
18
|
+
first_appearance: '',
|
19
|
+
ndc_code: 'NDC 933',
|
20
|
+
syllabary_spelling_type: '新字新仮名',
|
21
|
+
copyrighted: false,
|
22
|
+
published_date: '2020-04-03',
|
23
|
+
last_updated_date: '2020-03-28',
|
24
|
+
detail_url: 'https://www.aozora.gr.jp/cards/001257/card59898.html',
|
25
|
+
person_id: '001257',
|
26
|
+
person_family_name: 'アーヴィング',
|
27
|
+
person_first_name: 'ワシントン',
|
28
|
+
person_family_name_reading: 'アーヴィング',
|
29
|
+
person_first_name_reading: 'ワシントン',
|
30
|
+
person_family_name_reading_collation: 'ああういんく',
|
31
|
+
person_first_name_reading_collation: 'わしんとん',
|
32
|
+
person_family_name_romaji: 'Irving',
|
33
|
+
person_first_name_romaji: 'Washington',
|
34
|
+
person_type: '著者',
|
35
|
+
person_birthday: '1783-04-03',
|
36
|
+
person_date_of_death: '1859-11-28',
|
37
|
+
person_copyrighted: false,
|
38
|
+
original_book_name1: 'スケッチ・ブック',
|
39
|
+
original_book_publisher_name1: '新潮文庫、新潮社',
|
40
|
+
original_book_first_published_date1: '1957(昭和32)年5月20日',
|
41
|
+
used_version_for_registration1: '2000(平成12)年2月20日33刷改版',
|
42
|
+
used_version_for_proofreading1: '2000(平成12)年2月20日33刷改版',
|
43
|
+
base_of_original_book_name1: '',
|
44
|
+
base_of_original_book_publisher_name1: '',
|
45
|
+
base_of_original_book_first_published_date1: '',
|
46
|
+
original_book_name2: '',
|
47
|
+
original_book_publisher_name2: '',
|
48
|
+
original_book_first_published_date2: '',
|
49
|
+
used_version_for_registration2: '',
|
50
|
+
used_version_for_proofreading2: '',
|
51
|
+
base_of_original_book_name2: '',
|
52
|
+
base_of_original_book_publisher_name2: '',
|
53
|
+
base_of_original_book_first_published_date2: '',
|
54
|
+
registered_person_name: 'えにしだ',
|
55
|
+
proofreader_name: '砂場清隆',
|
56
|
+
text_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip',
|
57
|
+
last_text_file_updated_date: '2020-03-28',
|
58
|
+
text_file_character_encoding: 'ShiftJIS',
|
59
|
+
text_file_character_set: 'JIS X 0208',
|
60
|
+
text_file_updating_count: '0',
|
61
|
+
html_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html',
|
62
|
+
last_html_file_updated_date: '2020-03-28',
|
63
|
+
html_file_character_encoding: 'ShiftJIS',
|
64
|
+
html_file_character_set: 'JIS X 0208',
|
65
|
+
html_file_updating_count: '0'
|
66
|
+
|
67
|
+
},
|
68
|
+
@dataset.first.to_h)
|
69
|
+
end
|
70
|
+
|
71
|
+
sub_test_case(:Book) do
|
72
|
+
sub_test_case('#text') do
|
73
|
+
test('readable') do
|
74
|
+
book = Datasets::AozoraBunko::Book.new
|
75
|
+
book.cache_path = @cache_path
|
76
|
+
book.title_id = '059898'
|
77
|
+
book.person_id = '001257'
|
78
|
+
book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
|
79
|
+
book.text_file_character_encoding = 'ShiftJIS'
|
80
|
+
|
81
|
+
assert_equal([
|
82
|
+
'ウェストミンスター寺',
|
83
|
+
"アの皆さんです。\r\n"
|
84
|
+
],
|
85
|
+
[
|
86
|
+
book.text[0, 10],
|
87
|
+
book.text[-10, 10]
|
88
|
+
])
|
89
|
+
end
|
90
|
+
|
91
|
+
test('not readable') do
|
92
|
+
book = Datasets::AozoraBunko::Book.new
|
93
|
+
book.text_file_url = 'https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE'
|
94
|
+
|
95
|
+
assert_equal(nil, book.text)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
sub_test_case('#html') do
|
100
|
+
sub_test_case('readable') do
|
101
|
+
test('encoding is ShiftJIS') do
|
102
|
+
book = Datasets::AozoraBunko::Book.new
|
103
|
+
book.cache_path = @cache_path
|
104
|
+
book.title_id = '059898'
|
105
|
+
book.person_id = '001257'
|
106
|
+
book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
|
107
|
+
book.html_file_character_encoding = 'ShiftJIS'
|
108
|
+
|
109
|
+
assert_equal("<title>ワシントン・アーヴィング Washington Irving 吉田甲子太郎訳 ウェストミンスター寺院</title>",
|
110
|
+
book.html.split("\n")[8].strip)
|
111
|
+
end
|
112
|
+
|
113
|
+
test('encoding is UTF-8') do
|
114
|
+
book = Datasets::AozoraBunko::Book.new
|
115
|
+
book.cache_path = @cache_path
|
116
|
+
|
117
|
+
book.title_id = '000750'
|
118
|
+
book.person_id = '000146'
|
119
|
+
book.html_file_url = 'http://www.lcv.ne.jp/~ibs52086/fire/'
|
120
|
+
book.html_file_character_encoding = 'UTF-8'
|
121
|
+
|
122
|
+
assert_equal('<title>種田山頭火句集 | 『草木塔抄』他 FIRE ON THE MOUNTAIN</title>',
|
123
|
+
book.html.split("\n")[7])
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
test('not readable') do
|
128
|
+
book = Datasets::AozoraBunko::Book.new
|
129
|
+
book.html_file_url = ''
|
130
|
+
|
131
|
+
assert_equal(nil, book.html)
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
sub_test_case('converting boolean') do
|
136
|
+
test('#person_copyrighted?') do
|
137
|
+
book = @dataset.first
|
138
|
+
assert_equal([
|
139
|
+
false,
|
140
|
+
false,
|
141
|
+
false,
|
142
|
+
],
|
143
|
+
[
|
144
|
+
book.person_copyrighted?,
|
145
|
+
book.person_copyrighted,
|
146
|
+
book.to_h[:person_copyrighted],
|
147
|
+
])
|
148
|
+
end
|
149
|
+
|
150
|
+
test('#copyrighted?') do
|
151
|
+
book = @dataset.first
|
152
|
+
assert_equal([
|
153
|
+
false,
|
154
|
+
false,
|
155
|
+
false,
|
156
|
+
],
|
157
|
+
[
|
158
|
+
book.copyrighted?,
|
159
|
+
book.copyrighted,
|
160
|
+
book.to_h[:copyrighted],
|
161
|
+
])
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
test('#clear_cache! removes all cache files') do
|
166
|
+
book = Datasets::AozoraBunko::Book.new
|
167
|
+
book.cache_path = @cache_path
|
168
|
+
|
169
|
+
book.title_id = '059898'
|
170
|
+
book.person_id = '001257'
|
171
|
+
book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
|
172
|
+
book.text_file_character_encoding = 'ShiftJIS'
|
173
|
+
book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
|
174
|
+
book.html_file_character_encoding = 'ShiftJIS'
|
175
|
+
|
176
|
+
book.text
|
177
|
+
book.html
|
178
|
+
|
179
|
+
restore_path(@cache_path.base_dir) do
|
180
|
+
assert_equal(true, @cache_path.base_dir.exist?)
|
181
|
+
assert_equal(true, book.send(:text_file_output_path).exist?)
|
182
|
+
assert_equal(true, book.send(:html_file_output_path).exist?)
|
183
|
+
@dataset.clear_cache!
|
184
|
+
assert_equal(false, book.send(:html_file_output_path).exist?)
|
185
|
+
assert_equal(false, book.send(:text_file_output_path).exist?)
|
186
|
+
assert_equal(false, @cache_path.base_dir.exist?)
|
187
|
+
end
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
class CaliforniaHousingTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::CaliforniaHousing.new
|
4
|
+
end
|
5
|
+
|
6
|
+
def record(*args)
|
7
|
+
Datasets::CaliforniaHousing::Record.new(*args)
|
8
|
+
end
|
9
|
+
|
10
|
+
test("#each") do
|
11
|
+
records = @dataset.each.to_a
|
12
|
+
assert_equal([
|
13
|
+
20640,
|
14
|
+
{
|
15
|
+
median_house_value: 452600.000000,
|
16
|
+
median_income: 8.325200,
|
17
|
+
housing_median_age: 41.000000,
|
18
|
+
total_rooms: 880.000000,
|
19
|
+
total_bedrooms: 129.000000,
|
20
|
+
population: 322.000000,
|
21
|
+
households: 126.000000,
|
22
|
+
latitude: 37.880000,
|
23
|
+
longitude: -122.230000
|
24
|
+
},
|
25
|
+
{
|
26
|
+
median_house_value: 89400.000000,
|
27
|
+
median_income: 2.388600,
|
28
|
+
housing_median_age: 16.000000,
|
29
|
+
total_rooms: 2785.000000,
|
30
|
+
total_bedrooms: 616.000000,
|
31
|
+
population: 1387.000000,
|
32
|
+
households: 530.000000,
|
33
|
+
latitude: 39.370000,
|
34
|
+
longitude: -121.240000
|
35
|
+
},
|
36
|
+
],
|
37
|
+
[
|
38
|
+
records.size,
|
39
|
+
records[0].to_h,
|
40
|
+
records[-1].to_h
|
41
|
+
])
|
42
|
+
end
|
43
|
+
|
44
|
+
sub_test_case("#metadata") do
|
45
|
+
test("#description") do
|
46
|
+
description = @dataset.metadata.description
|
47
|
+
assert_equal(<<-DESCRIPTION, description)
|
48
|
+
Housing information from the 1990 census used in
|
49
|
+
Pace, R. Kelley and Ronald Barry,
|
50
|
+
"Sparse Spatial Autoregressions",
|
51
|
+
Statistics and Probability Letters, 33 (1997) 291-297.
|
52
|
+
Available from http://lib.stat.cmu.edu/datasets/.
|
53
|
+
DESCRIPTION
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
data/test/test-cldr-plurals.rb
CHANGED
data/test/test-dataset.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
class TestDataset < Test::Unit::TestCase
|
2
2
|
sub_test_case("#clear_cache!") do
|
3
|
+
include Helper::PathRestorable
|
4
|
+
|
3
5
|
def setup
|
4
6
|
@dataset = Datasets::Iris.new
|
5
7
|
@cache_dir_path = @dataset.send(:cache_dir_path)
|
@@ -9,18 +11,24 @@ class TestDataset < Test::Unit::TestCase
|
|
9
11
|
@dataset.first # This ensures the dataset downloaded
|
10
12
|
existence = {before: @cache_dir_path.join("iris.csv").exist?}
|
11
13
|
|
12
|
-
@
|
13
|
-
|
14
|
+
restore_path(@cache_dir_path) do
|
15
|
+
@dataset.clear_cache!
|
16
|
+
existence[:after] = @cache_dir_path.join("iris.csv").exist?
|
14
17
|
|
15
|
-
|
16
|
-
|
18
|
+
assert_equal({before: true, after: false},
|
19
|
+
existence)
|
20
|
+
end
|
17
21
|
end
|
18
22
|
|
19
23
|
test("when the dataset is not downloaded") do
|
20
|
-
|
24
|
+
restore_path(@cache_dir_path) do
|
25
|
+
if @cache_dir_path.exist?
|
26
|
+
FileUtils.rmtree(@cache_dir_path.to_s, secure: true)
|
27
|
+
end
|
21
28
|
|
22
|
-
|
23
|
-
|
29
|
+
assert_nothing_raised do
|
30
|
+
@dataset.clear_cache!
|
31
|
+
end
|
24
32
|
end
|
25
33
|
end
|
26
34
|
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
class DiamondsTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::Diamonds.new
|
4
|
+
end
|
5
|
+
|
6
|
+
def record(*args)
|
7
|
+
Datasets::Diamonds::Record.new(*args)
|
8
|
+
end
|
9
|
+
|
10
|
+
test("#each") do
|
11
|
+
records = @dataset.each.to_a
|
12
|
+
assert_equal([
|
13
|
+
53940,
|
14
|
+
{
|
15
|
+
carat: 0.23,
|
16
|
+
clarity: "SI2",
|
17
|
+
color: "E",
|
18
|
+
cut: "Ideal",
|
19
|
+
depth: 61.5,
|
20
|
+
price: 326,
|
21
|
+
table: 55.0,
|
22
|
+
x: 3.95,
|
23
|
+
y: 3.98,
|
24
|
+
z: 2.43,
|
25
|
+
},
|
26
|
+
{
|
27
|
+
carat: 0.75,
|
28
|
+
clarity: "SI2",
|
29
|
+
color: "D",
|
30
|
+
cut: "Ideal",
|
31
|
+
depth: 62.2,
|
32
|
+
price: 2757,
|
33
|
+
table: 55.0,
|
34
|
+
x: 5.83,
|
35
|
+
y: 5.87,
|
36
|
+
z: 3.64,
|
37
|
+
},
|
38
|
+
],
|
39
|
+
[
|
40
|
+
records.size,
|
41
|
+
records[0].to_h,
|
42
|
+
records[-1].to_h
|
43
|
+
])
|
44
|
+
end
|
45
|
+
|
46
|
+
sub_test_case("#metadata") do
|
47
|
+
test("#description") do
|
48
|
+
description = @dataset.metadata.description
|
49
|
+
assert_equal(<<-DESCRIPTION, description)
|
50
|
+
Prices of over 50,000 round cut diamonds
|
51
|
+
|
52
|
+
A dataset containing the prices and other attributes of almost 54,000
|
53
|
+
diamonds. The variables are as follows:
|
54
|
+
|
55
|
+
A data frame with 53940 rows and 10 variables:
|
56
|
+
|
57
|
+
* price: price in US dollars ($326--$18,823)
|
58
|
+
* carat: weight of the diamond (0.2--5.01)
|
59
|
+
* cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
|
60
|
+
* color: diamond colour, from D (best) to J (worst)
|
61
|
+
* clarity: a measurement of how clear the diamond is (I1 (worst), SI2,
|
62
|
+
SI1, VS2, VS1, VVS2, VVS1, IF (best))
|
63
|
+
* x: length in mm (0--10.74)
|
64
|
+
* y: width in mm (0--58.9)
|
65
|
+
* z: depth in mm (0--31.8)
|
66
|
+
* depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
|
67
|
+
* table: width of top of diamond relative to widest point (43--95)
|
68
|
+
DESCRIPTION
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
class FuelEconomyTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::FuelEconomy.new
|
4
|
+
end
|
5
|
+
|
6
|
+
def record(*args)
|
7
|
+
Datasets::FuelEconomy::Record.new(*args)
|
8
|
+
end
|
9
|
+
|
10
|
+
test("#each") do
|
11
|
+
records = @dataset.each.to_a
|
12
|
+
assert_equal([
|
13
|
+
234,
|
14
|
+
{
|
15
|
+
city_mpg: 18,
|
16
|
+
displacement: 1.8,
|
17
|
+
drive_train: "f",
|
18
|
+
fuel: "p",
|
19
|
+
highway_mpg: 29,
|
20
|
+
manufacturer: "audi",
|
21
|
+
model: "a4",
|
22
|
+
n_cylinders: 4,
|
23
|
+
transmission: "auto(l5)",
|
24
|
+
type: "compact",
|
25
|
+
year: 1999
|
26
|
+
},
|
27
|
+
{
|
28
|
+
city_mpg: 17,
|
29
|
+
displacement: 3.6,
|
30
|
+
drive_train: "f",
|
31
|
+
fuel: "p",
|
32
|
+
highway_mpg: 26,
|
33
|
+
manufacturer: "volkswagen",
|
34
|
+
model: "passat",
|
35
|
+
n_cylinders: 6,
|
36
|
+
transmission: "auto(s6)",
|
37
|
+
type: "midsize",
|
38
|
+
year: 2008
|
39
|
+
},
|
40
|
+
],
|
41
|
+
[
|
42
|
+
records.size,
|
43
|
+
records[0].to_h,
|
44
|
+
records[-1].to_h
|
45
|
+
])
|
46
|
+
end
|
47
|
+
|
48
|
+
sub_test_case("#metadata") do
|
49
|
+
test("#description") do
|
50
|
+
description = @dataset.metadata.description
|
51
|
+
assert_equal(<<-DESCRIPTION, description)
|
52
|
+
Fuel economy data from 1999 to 2008 for 38 popular models of cars
|
53
|
+
|
54
|
+
This dataset contains a subset of the fuel economy data that the EPA makes
|
55
|
+
available on https://fueleconomy.gov/. It contains only models which
|
56
|
+
had a new release every year between 1999 and 2008 - this was used as a
|
57
|
+
proxy for the popularity of the car.
|
58
|
+
|
59
|
+
A data frame with 234 rows and 11 variables:
|
60
|
+
|
61
|
+
* manufacturer: manufacturer name
|
62
|
+
* model: model name
|
63
|
+
* displacement: engine displacement, in litres
|
64
|
+
* year: year of manufacture
|
65
|
+
* n_cylinders: number of cylinders
|
66
|
+
* transmissions: type of transmission
|
67
|
+
* drive_train: the type of drive train, where f = front-wheel drive, r = rear wheel drive, 4 = 4wd
|
68
|
+
* city_mpg: city miles per gallon
|
69
|
+
* highway_mpg: highway miles per gallon
|
70
|
+
* fuel: fuel type
|
71
|
+
* type: "type" of car
|
72
|
+
DESCRIPTION
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
class GeoloniaTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::Geolonia.new
|
4
|
+
end
|
5
|
+
|
6
|
+
test('#each') do
|
7
|
+
records = @dataset.each.to_a
|
8
|
+
assert_equal([
|
9
|
+
277616,
|
10
|
+
{
|
11
|
+
:prefecture_code => "01",
|
12
|
+
:prefecture_name => "北海道",
|
13
|
+
:prefecture_kana => "ホッカイドウ",
|
14
|
+
:prefecture_romaji => "HOKKAIDO",
|
15
|
+
:municipality_code => "01101",
|
16
|
+
:municipality_name => "札幌市中央区",
|
17
|
+
:municipality_kana => "サッポロシチュウオウク",
|
18
|
+
:municipality_romaji => "SAPPORO SHI CHUO KU",
|
19
|
+
:street_name => "旭ケ丘一丁目",
|
20
|
+
:street_kana => "アサヒガオカ 1",
|
21
|
+
:street_romaji => "ASAHIGAOKA 1",
|
22
|
+
:alias => nil,
|
23
|
+
:latitude => "43.04223",
|
24
|
+
:longitude => "141.319722"
|
25
|
+
},
|
26
|
+
{
|
27
|
+
:prefecture_code => "47",
|
28
|
+
:prefecture_name => "沖縄県",
|
29
|
+
:prefecture_kana => "オキナワケン",
|
30
|
+
:prefecture_romaji => "OKINAWA KEN",
|
31
|
+
:municipality_code => "47382",
|
32
|
+
:municipality_name => "八重山郡与那国町",
|
33
|
+
:municipality_kana => "ヤエヤマグンヨナグニチョウ",
|
34
|
+
:municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
|
35
|
+
:street_name => "字与那国",
|
36
|
+
:street_kana => nil,
|
37
|
+
:street_romaji => nil,
|
38
|
+
:alias => nil,
|
39
|
+
:latitude => "24.455925",
|
40
|
+
:longitude => "122.987678",
|
41
|
+
},
|
42
|
+
],
|
43
|
+
[
|
44
|
+
records.size,
|
45
|
+
records[0].to_h,
|
46
|
+
records[-1].to_h,
|
47
|
+
])
|
48
|
+
end
|
49
|
+
|
50
|
+
sub_test_case("#metadata") do
|
51
|
+
test("#description") do
|
52
|
+
description = @dataset.metadata.description
|
53
|
+
assert_equal([
|
54
|
+
"# Geolonia 住所データ",
|
55
|
+
"## 住所データ仕様",
|
56
|
+
"### ファイルフォーマット",
|
57
|
+
"### 列",
|
58
|
+
"### ソート順",
|
59
|
+
],
|
60
|
+
description.scan(/^#.*$/),
|
61
|
+
description)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
class ITACorpusTest < Test::Unit::TestCase
|
2
|
+
|
3
|
+
sub_test_case("type") do
|
4
|
+
test("emotion") do
|
5
|
+
dataset = Datasets::ITACorpus.new(type: :emotion)
|
6
|
+
records = dataset.to_a
|
7
|
+
assert_equal([
|
8
|
+
100,
|
9
|
+
{
|
10
|
+
:id => "EMOTION100_001",
|
11
|
+
:sentence => "えっ嘘でしょ。,エッウソデショ。"
|
12
|
+
},
|
13
|
+
{
|
14
|
+
:id => "EMOTION100_100",
|
15
|
+
:sentence => "ラーテャン。,ラーテャン。",
|
16
|
+
},
|
17
|
+
],
|
18
|
+
[
|
19
|
+
records.size,
|
20
|
+
records[0].to_h,
|
21
|
+
records[-1].to_h,
|
22
|
+
])
|
23
|
+
end
|
24
|
+
|
25
|
+
test("recitation") do
|
26
|
+
dataset = Datasets::ITACorpus.new(type: :recitation)
|
27
|
+
records = dataset.to_a
|
28
|
+
assert_equal([
|
29
|
+
324,
|
30
|
+
{
|
31
|
+
:id => "RECITATION324_001",
|
32
|
+
:sentence => "女の子がキッキッ嬉しそう。,オンナノコガキッキッウレシソー。"
|
33
|
+
},
|
34
|
+
{
|
35
|
+
:id => "RECITATION324_324",
|
36
|
+
:sentence => "チュクンの波長は、パツンと共通している。,チュクンノハチョーワ、パツントキョーツウシテイル。",
|
37
|
+
},
|
38
|
+
],
|
39
|
+
[
|
40
|
+
records.size,
|
41
|
+
records[0].to_h,
|
42
|
+
records[-1].to_h,
|
43
|
+
])
|
44
|
+
end
|
45
|
+
|
46
|
+
test("invalid") do
|
47
|
+
message = "Please set type :emotion or :recitation: :invalid"
|
48
|
+
assert_raise(ArgumentError.new(message)) do
|
49
|
+
Datasets::ITACorpus.new(type: :invalid)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
sub_test_case("#metadata") do
|
56
|
+
test("#description") do
|
57
|
+
dataset = Datasets::ITACorpus.new(type: :emotion)
|
58
|
+
description = dataset.metadata.description
|
59
|
+
assert_equal([
|
60
|
+
"# ITAコーパスの文章リスト公開用リポジトリ",
|
61
|
+
"## ITAコーパスとは",
|
62
|
+
"## ITAコーパスの文献情報"
|
63
|
+
],
|
64
|
+
description.scan(/^#.*$/),
|
65
|
+
description)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|