red-datasets 0.1.4 → 0.1.5
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
data/lib/datasets/zip-extractor.rb
ADDED
@@ -0,0 +1,36 @@
+require 'zip'
+
+module Datasets
+  class ZipExtractor
+    def initialize(path)
+      @path = path
+    end
+
+    def extract_first_file
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+
+          entry.get_input_stream do |input|
+            return yield(input)
+          end
+        end
+      end
+      nil
+    end
+
+    def extract_file(file_path)
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+          next unless entry.name == file_path
+
+          entry.get_input_stream do |input|
+            return yield(input)
+          end
+        end
+      end
+      nil
+    end
+  end
+end
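A quick usage sketch of the new extractor (not part of the diff; the archive path and entry name are made up for illustration, and the class is loaded transitively by the datasets that use it):

require "datasets"

# Hypothetical local archive; ZipExtractor only needs a path to a .zip file.
extractor = Datasets::ZipExtractor.new("downloads/example.zip")

# Yields an input stream for the first file entry and returns the block's
# value; returns nil when the archive contains no file entries.
first_contents = extractor.extract_first_file do |input|
  input.read
end

# Yields the entry whose name matches the given path inside the archive.
csv_contents = extractor.extract_file("data/example.csv") do |input|
  input.read
end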
data/lib/datasets.rb
CHANGED
@@ -1,22 +1,34 @@
 require_relative "datasets/version"
 
 require_relative "datasets/adult"
+require_relative "datasets/afinn"
+require_relative "datasets/aozora-bunko"
+require_relative "datasets/california-housing"
 require_relative "datasets/cifar"
 require_relative "datasets/cldr-plurals"
 require_relative "datasets/communities"
+require_relative "datasets/diamonds"
 require_relative "datasets/e-stat-japan"
 require_relative "datasets/fashion-mnist"
+require_relative "datasets/fuel-economy"
+require_relative "datasets/geolonia"
 require_relative "datasets/hepatitis"
 require_relative "datasets/iris"
+require_relative "datasets/ita-corpus"
+require_relative "datasets/kuzushiji-mnist"
 require_relative "datasets/libsvm"
 require_relative "datasets/libsvm-dataset-list"
+require_relative "datasets/livedoor-news"
 require_relative "datasets/mnist"
 require_relative "datasets/mushroom"
 require_relative "datasets/penguins"
 require_relative "datasets/penn-treebank"
+require_relative "datasets/pmjt-dataset-list"
 require_relative "datasets/postal-code-japan"
-require_relative "datasets/rdatasets"
-require_relative "datasets/seaborn-data"
+require_relative "datasets/quora-duplicate-question-pair"
+require_relative "datasets/rdataset"
+require_relative "datasets/seaborn"
 require_relative "datasets/sudachi-synonym-dictionary"
 require_relative "datasets/wikipedia"
+require_relative "datasets/wikipedia-kyoto-japanese-english"
 require_relative "datasets/wine"
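The newly required datasets follow the existing red-datasets API. A minimal sketch with one of them (the printed values are whatever the first record holds; nothing here is asserted):

require "datasets"

diamonds = Datasets::Diamonds.new

# Datasets are enumerable; each record exposes the columns exercised in
# test-diamonds.rb below (carat, cut, color, clarity, price, ...).
diamonds.each do |record|
  p record.to_h
  break
end

puts diamonds.metadata.description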
data/red-datasets.gemspec
CHANGED
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
   spec.files += Dir.glob("doc/text/*")
   spec.test_files += Dir.glob("test/**/*")
 
-  spec.add_runtime_dependency("csv", ">= 3.
+  spec.add_runtime_dependency("csv", ">= 3.2.4")
   spec.add_runtime_dependency("rexml")
   spec.add_runtime_dependency("rubyzip")
 
data/test/helper.rb
CHANGED
@@ -1,6 +1,7 @@
 require "fileutils"
 require "pathname"
 require "time"
+require "tmpdir"
 
 require "datasets"
 
@@ -18,4 +19,24 @@ module Helper
       FileUtils.rm_rf(@tmp_dir)
     end
   end
+
+  module PathRestorable
+    def restore_path(path)
+      unless path.exist?
+        return yield
+      end
+
+      Dir.mktmpdir do |dir|
+        FileUtils.cp_r(path, dir, preserve: true)
+        begin
+          yield
+        ensure
+          FileUtils.rmtree(path, secure: true) if path.exist?
+          FileUtils.cp_r(Pathname(dir) + path.basename,
+                         path,
+                         preserve: true)
+        end
+      end
+    end
+  end
 end
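restore_path snapshots an existing path into a temporary directory, runs the block, and copies the original contents back afterwards, so destructive cache tests leave the local cache intact. A minimal sketch, assuming the helper is mixed into a test case (the dataset choice is illustrative; test-dataset.rb below shows the real usage):

class CacheSafetyTest < Test::Unit::TestCase
  include Helper::PathRestorable

  test("cache is restored after destructive work") do
    dataset = Datasets::Iris.new
    cache_dir = dataset.send(:cache_dir_path)
    dataset.first  # make sure something is cached

    restore_path(cache_dir) do
      dataset.clear_cache!  # wipes the cache inside the block...
    end
    # ...and restore_path has copied the cached files back by this point.
  end
end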
data/test/test-afinn.rb
ADDED
@@ -0,0 +1,60 @@
+class AFINNTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::AFINN.new
+  end
+
+  test('#each') do
+    records = @dataset.each.to_a
+    assert_equal([
+                   2477,
+                   {
+                     :valence => -2,
+                     :word => "abandon"
+                   },
+                   {
+                     :valence => 2,
+                     :word => "zealous"
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h,
+                 ])
+  end
+
+  sub_test_case('#metadata') do
+    test('#description') do
+      description = @dataset.metadata.description
+      assert_equal(<<-DESCRIPTION.chomp, description)
+AFINN is a list of English words rated for valence with an integer
+between minus five (negative) and plus five (positive). The words have
+been manually labeled by Finn Årup Nielsen in 2009-2011. The file
+is tab-separated. There are two versions:
+
+AFINN-111: Newest version with 2477 words and phrases.
+
+An evaluation of the word list is available in:
+
+Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
+sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
+
+The list was used in:
+
+Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
+Michael Etter, "Good Friends, Bad News - Affect and Virality in
+Twitter", The 2011 International Workshop on Social Computing,
+Network, and Services (SocialComNet 2011).
+
+
+This database of words is copyright protected and distributed under
+"Open Database License (ODbL) v1.0"
+http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
+copyleft license.
+
+See comments on the word list here:
+http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
+      DESCRIPTION
+    end
+  end
+end
data/test/test-aozora-bunko.rb
ADDED
@@ -0,0 +1,190 @@
+class AozoraBunkoTest < Test::Unit::TestCase
+  include Helper::PathRestorable
+
+  def setup
+    @dataset = Datasets::AozoraBunko.new
+    @cache_path = @dataset.send(:cache_path)
+  end
+
+  test('#new') do
+    assert_equal({
+                   title_id: '059898',
+                   title: 'ウェストミンスター寺院',
+                   title_reading: 'ウェストミンスターじいん',
+                   title_reading_collation: 'うえすとみんすたあしいん',
+                   subtitle: '',
+                   subtitle_reading: '',
+                   original_title: '',
+                   first_appearance: '',
+                   ndc_code: 'NDC 933',
+                   syllabary_spelling_type: '新字新仮名',
+                   copyrighted: false,
+                   published_date: '2020-04-03',
+                   last_updated_date: '2020-03-28',
+                   detail_url: 'https://www.aozora.gr.jp/cards/001257/card59898.html',
+                   person_id: '001257',
+                   person_family_name: 'アーヴィング',
+                   person_first_name: 'ワシントン',
+                   person_family_name_reading: 'アーヴィング',
+                   person_first_name_reading: 'ワシントン',
+                   person_family_name_reading_collation: 'ああういんく',
+                   person_first_name_reading_collation: 'わしんとん',
+                   person_family_name_romaji: 'Irving',
+                   person_first_name_romaji: 'Washington',
+                   person_type: '著者',
+                   person_birthday: '1783-04-03',
+                   person_date_of_death: '1859-11-28',
+                   person_copyrighted: false,
+                   original_book_name1: 'スケッチ・ブック',
+                   original_book_publisher_name1: '新潮文庫、新潮社',
+                   original_book_first_published_date1: '1957(昭和32)年5月20日',
+                   used_version_for_registration1: '2000(平成12)年2月20日33刷改版',
+                   used_version_for_proofreading1: '2000(平成12)年2月20日33刷改版',
+                   base_of_original_book_name1: '',
+                   base_of_original_book_publisher_name1: '',
+                   base_of_original_book_first_published_date1: '',
+                   original_book_name2: '',
+                   original_book_publisher_name2: '',
+                   original_book_first_published_date2: '',
+                   used_version_for_registration2: '',
+                   used_version_for_proofreading2: '',
+                   base_of_original_book_name2: '',
+                   base_of_original_book_publisher_name2: '',
+                   base_of_original_book_first_published_date2: '',
+                   registered_person_name: 'えにしだ',
+                   proofreader_name: '砂場清隆',
+                   text_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip',
+                   last_text_file_updated_date: '2020-03-28',
+                   text_file_character_encoding: 'ShiftJIS',
+                   text_file_character_set: 'JIS X 0208',
+                   text_file_updating_count: '0',
+                   html_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html',
+                   last_html_file_updated_date: '2020-03-28',
+                   html_file_character_encoding: 'ShiftJIS',
+                   html_file_character_set: 'JIS X 0208',
+                   html_file_updating_count: '0'
+
+                 },
+                 @dataset.first.to_h)
+  end
+
+  sub_test_case(:Book) do
+    sub_test_case('#text') do
+      test('readable') do
+        book = Datasets::AozoraBunko::Book.new
+        book.cache_path = @cache_path
+        book.title_id = '059898'
+        book.person_id = '001257'
+        book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
+        book.text_file_character_encoding = 'ShiftJIS'
+
+        assert_equal([
+                       'ウェストミンスター寺',
+                       "アの皆さんです。\r\n"
+                     ],
+                     [
+                       book.text[0, 10],
+                       book.text[-10, 10]
+                     ])
+      end
+
+      test('not readable') do
+        book = Datasets::AozoraBunko::Book.new
+        book.text_file_url = 'https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE'
+
+        assert_equal(nil, book.text)
+      end
+    end
+
+    sub_test_case('#html') do
+      sub_test_case('readable') do
+        test('encoding is ShiftJIS') do
+          book = Datasets::AozoraBunko::Book.new
+          book.cache_path = @cache_path
+          book.title_id = '059898'
+          book.person_id = '001257'
+          book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
+          book.html_file_character_encoding = 'ShiftJIS'
+
+          assert_equal("<title>ワシントン・アーヴィング Washington Irving 吉田甲子太郎訳 ウェストミンスター寺院</title>",
+                       book.html.split("\n")[8].strip)
+        end
+
+        test('encoding is UTF-8') do
+          book = Datasets::AozoraBunko::Book.new
+          book.cache_path = @cache_path
+
+          book.title_id = '000750'
+          book.person_id = '000146'
+          book.html_file_url = 'http://www.lcv.ne.jp/~ibs52086/fire/'
+          book.html_file_character_encoding = 'UTF-8'
+
+          assert_equal('<title>種田山頭火句集 | 『草木塔抄』他 FIRE ON THE MOUNTAIN</title>',
+                       book.html.split("\n")[7])
+        end
+      end
+
+      test('not readable') do
+        book = Datasets::AozoraBunko::Book.new
+        book.html_file_url = ''
+
+        assert_equal(nil, book.html)
+      end
+    end
+
+    sub_test_case('converting boolean') do
+      test('#person_copyrighted?') do
+        book = @dataset.first
+        assert_equal([
+                       false,
+                       false,
+                       false,
+                     ],
+                     [
+                       book.person_copyrighted?,
+                       book.person_copyrighted,
+                       book.to_h[:person_copyrighted],
+                     ])
+      end
+
+      test('#copyrighted?') do
+        book = @dataset.first
+        assert_equal([
+                       false,
+                       false,
+                       false,
+                     ],
+                     [
+                       book.copyrighted?,
+                       book.copyrighted,
+                       book.to_h[:copyrighted],
+                     ])
+      end
+    end
+
+    test('#clear_cache! removes all cache files') do
+      book = Datasets::AozoraBunko::Book.new
+      book.cache_path = @cache_path
+
+      book.title_id = '059898'
+      book.person_id = '001257'
+      book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
+      book.text_file_character_encoding = 'ShiftJIS'
+      book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
+      book.html_file_character_encoding = 'ShiftJIS'
+
+      book.text
+      book.html
+
+      restore_path(@cache_path.base_dir) do
+        assert_equal(true, @cache_path.base_dir.exist?)
+        assert_equal(true, book.send(:text_file_output_path).exist?)
+        assert_equal(true, book.send(:html_file_output_path).exist?)
+        @dataset.clear_cache!
+        assert_equal(false, book.send(:html_file_output_path).exist?)
+        assert_equal(false, book.send(:text_file_output_path).exist?)
+        assert_equal(false, @cache_path.base_dir.exist?)
+      end
+    end
+  end
+end
data/test/test-california-housing.rb
ADDED
@@ -0,0 +1,56 @@
+class CaliforniaHousingTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::CaliforniaHousing.new
+  end
+
+  def record(*args)
+    Datasets::CaliforniaHousing::Record.new(*args)
+  end
+
+  test("#each") do
+    records = @dataset.each.to_a
+    assert_equal([
+                   20640,
+                   {
+                     median_house_value: 452600.000000,
+                     median_income: 8.325200,
+                     housing_median_age: 41.000000,
+                     total_rooms: 880.000000,
+                     total_bedrooms: 129.000000,
+                     population: 322.000000,
+                     households: 126.000000,
+                     latitude: 37.880000,
+                     longitude: -122.230000
+                   },
+                   {
+                     median_house_value: 89400.000000,
+                     median_income: 2.388600,
+                     housing_median_age: 16.000000,
+                     total_rooms: 2785.000000,
+                     total_bedrooms: 616.000000,
+                     population: 1387.000000,
+                     households: 530.000000,
+                     latitude: 39.370000,
+                     longitude: -121.240000
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h
+                 ])
+  end
+
+  sub_test_case("#metadata") do
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal(<<-DESCRIPTION, description)
+Housing information from the 1990 census used in
+Pace, R. Kelley and Ronald Barry,
+"Sparse Spatial Autoregressions",
+Statistics and Probability Letters, 33 (1997) 291-297.
+Available from http://lib.stat.cmu.edu/datasets/.
+      DESCRIPTION
+    end
+  end
+end
data/test/test-cldr-plurals.rb
CHANGED
data/test/test-dataset.rb
CHANGED
@@ -1,5 +1,7 @@
 class TestDataset < Test::Unit::TestCase
   sub_test_case("#clear_cache!") do
+    include Helper::PathRestorable
+
     def setup
       @dataset = Datasets::Iris.new
       @cache_dir_path = @dataset.send(:cache_dir_path)
@@ -9,18 +11,24 @@ class TestDataset < Test::Unit::TestCase
       @dataset.first # This ensures the dataset downloaded
       existence = {before: @cache_dir_path.join("iris.csv").exist?}
 
-      @
-
+      restore_path(@cache_dir_path) do
+        @dataset.clear_cache!
+        existence[:after] = @cache_dir_path.join("iris.csv").exist?
 
-
-
+        assert_equal({before: true, after: false},
+                     existence)
+      end
     end
 
     test("when the dataset is not downloaded") do
-
+      restore_path(@cache_dir_path) do
+        if @cache_dir_path.exist?
+          FileUtils.rmtree(@cache_dir_path.to_s, secure: true)
+        end
 
-
-
+        assert_nothing_raised do
+          @dataset.clear_cache!
+        end
       end
     end
   end
data/test/test-diamonds.rb
ADDED
@@ -0,0 +1,71 @@
+class DiamondsTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::Diamonds.new
+  end
+
+  def record(*args)
+    Datasets::Diamonds::Record.new(*args)
+  end
+
+  test("#each") do
+    records = @dataset.each.to_a
+    assert_equal([
+                   53940,
+                   {
+                     carat: 0.23,
+                     clarity: "SI2",
+                     color: "E",
+                     cut: "Ideal",
+                     depth: 61.5,
+                     price: 326,
+                     table: 55.0,
+                     x: 3.95,
+                     y: 3.98,
+                     z: 2.43,
+                   },
+                   {
+                     carat: 0.75,
+                     clarity: "SI2",
+                     color: "D",
+                     cut: "Ideal",
+                     depth: 62.2,
+                     price: 2757,
+                     table: 55.0,
+                     x: 5.83,
+                     y: 5.87,
+                     z: 3.64,
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h
+                 ])
+  end
+
+  sub_test_case("#metadata") do
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal(<<-DESCRIPTION, description)
+Prices of over 50,000 round cut diamonds
+
+A dataset containing the prices and other attributes of almost 54,000
+diamonds. The variables are as follows:
+
+A data frame with 53940 rows and 10 variables:
+
+* price: price in US dollars ($326--$18,823)
+* carat: weight of the diamond (0.2--5.01)
+* cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
+* color: diamond colour, from D (best) to J (worst)
+* clarity: a measurement of how clear the diamond is (I1 (worst), SI2,
+  SI1, VS2, VS1, VVS2, VVS1, IF (best))
+* x: length in mm (0--10.74)
+* y: width in mm (0--58.9)
+* z: depth in mm (0--31.8)
+* depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
+* table: width of top of diamond relative to widest point (43--95)
+      DESCRIPTION
+    end
+  end
+end
data/test/test-fuel-economy.rb
ADDED
@@ -0,0 +1,75 @@
+class FuelEconomyTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::FuelEconomy.new
+  end
+
+  def record(*args)
+    Datasets::FuelEconomy::Record.new(*args)
+  end
+
+  test("#each") do
+    records = @dataset.each.to_a
+    assert_equal([
+                   234,
+                   {
+                     city_mpg: 18,
+                     displacement: 1.8,
+                     drive_train: "f",
+                     fuel: "p",
+                     highway_mpg: 29,
+                     manufacturer: "audi",
+                     model: "a4",
+                     n_cylinders: 4,
+                     transmission: "auto(l5)",
+                     type: "compact",
+                     year: 1999
+                   },
+                   {
+                     city_mpg: 17,
+                     displacement: 3.6,
+                     drive_train: "f",
+                     fuel: "p",
+                     highway_mpg: 26,
+                     manufacturer: "volkswagen",
+                     model: "passat",
+                     n_cylinders: 6,
+                     transmission: "auto(s6)",
+                     type: "midsize",
+                     year: 2008
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h
+                 ])
+  end
+
+  sub_test_case("#metadata") do
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal(<<-DESCRIPTION, description)
+Fuel economy data from 1999 to 2008 for 38 popular models of cars
+
+This dataset contains a subset of the fuel economy data that the EPA makes
+available on https://fueleconomy.gov/. It contains only models which
+had a new release every year between 1999 and 2008 - this was used as a
+proxy for the popularity of the car.
+
+A data frame with 234 rows and 11 variables:
+
+* manufacturer: manufacturer name
+* model: model name
+* displacement: engine displacement, in litres
+* year: year of manufacture
+* n_cylinders: number of cylinders
+* transmissions: type of transmission
+* drive_train: the type of drive train, where f = front-wheel drive, r = rear wheel drive, 4 = 4wd
+* city_mpg: city miles per gallon
+* highway_mpg: highway miles per gallon
+* fuel: fuel type
+* type: "type" of car
+      DESCRIPTION
+    end
+  end
+end
data/test/test-geolonia.rb
ADDED
@@ -0,0 +1,64 @@
+class GeoloniaTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::Geolonia.new
+  end
+
+  test('#each') do
+    records = @dataset.each.to_a
+    assert_equal([
+                   277191,
+                   {
+                     :prefecture_code => "01",
+                     :prefecture_name => "北海道",
+                     :prefecture_kana => "ホッカイドウ",
+                     :prefecture_romaji => "HOKKAIDO",
+                     :municipality_code => "01101",
+                     :municipality_name => "札幌市中央区",
+                     :municipality_kana => "サッポロシチュウオウク",
+                     :municipality_romaji => "SAPPORO SHI CHUO KU",
+                     :street_name => "旭ケ丘一丁目",
+                     :street_kana => "アサヒガオカ 1",
+                     :street_romaji => "ASAHIGAOKA 1",
+                     :alias => nil,
+                     :latitude => "43.04223",
+                     :longitude => "141.319722"
+                   },
+                   {
+                     :prefecture_code => "47",
+                     :prefecture_name => "沖縄県",
+                     :prefecture_kana => "オキナワケン",
+                     :prefecture_romaji => "OKINAWA KEN",
+                     :municipality_code => "47325",
+                     :municipality_name => "中頭郡嘉手納町",
+                     :municipality_kana => "ナカガミグンカデナチョウ",
+                     :municipality_romaji => "NAKAGAMI GUN KADENA CHO",
+                     :street_name => "字兼久",
+                     :street_kana => nil,
+                     :street_romaji => nil,
+                     :alias => "下原",
+                     :latitude => "26.351841",
+                     :longitude => "127.744975",
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h,
+                 ])
+  end
+
+  sub_test_case("#metadata") do
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal([
+                     "# Geolonia 住所データ",
+                     "## 住所データ仕様",
+                     "### ファイルフォーマット",
+                     "### 列",
+                   ],
+                   description.scan(/^#.*$/),
+                   description)
+    end
+  end
+
+end