red-datasets 0.0.8 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/doc/text/news.md +93 -0
- data/lib/datasets.rb +9 -0
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +194 -54
- data/lib/datasets/libsvm.rb +1 -9
- data/lib/datasets/mnist.rb +6 -4
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar-gz-readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- data/test/test-table.rb +123 -18
- metadata +61 -15
@@ -0,0 +1,80 @@
|
|
1
|
+
# Exercises Datasets::Mushroom: record iteration and metadata description.
class MushroomTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::Mushroom.new
  end

  # Shortcut for building a Datasets::Mushroom::Record from positional values.
  def record(*args)
    Datasets::Mushroom::Record.new(*args)
  end

  # Checks the total record count together with the first and last records.
  test("#each") do
    records = @dataset.each.to_a

    expected_first = {
      label: "poisonous",
      cap_shape: "convex",
      cap_surface: "smooth",
      cap_color: "brown",
      bruises: "bruises",
      odor: "pungent",
      gill_attachment: "free",
      gill_spacing: "close",
      gill_size: "narrow",
      gill_color: "black",
      stalk_shape: "enlarging",
      stalk_root: "equal",
      stalk_surface_above_ring: "smooth",
      stalk_surface_below_ring: "smooth",
      stalk_color_above_ring: "white",
      stalk_color_below_ring: "white",
      veil_type: "partial",
      veil_color: "white",
      n_rings: 1,
      ring_type: "pendant",
      spore_print_color: "black",
      population: "scattered",
      habitat: "urban",
    }
    expected_last = {
      label: "edible",
      cap_shape: "convex",
      cap_surface: "smooth",
      cap_color: "brown",
      bruises: "no",
      odor: "none",
      gill_attachment: "attached",
      gill_spacing: "close",
      gill_size: "broad",
      gill_color: "yellow",
      stalk_shape: "enlarging",
      stalk_root: "missing",
      stalk_surface_above_ring: "smooth",
      stalk_surface_below_ring: "smooth",
      stalk_color_above_ring: "orange",
      stalk_color_below_ring: "orange",
      veil_type: "partial",
      veil_color: "orange",
      n_rings: 1,
      ring_type: "pendant",
      spore_print_color: "orange",
      population: "clustered",
      habitat: "leaves",
    }

    assert_equal([8124, expected_first, expected_last],
                 [records.size, records.first.to_h, records.last.to_h])
  end

  sub_test_case("#metadata") do
    # The description is expected to begin with the dataset's title line.
    test("#description") do
      description = @dataset.metadata.description
      assert { description.start_with?("1. Title: Mushroom Database") }
    end
  end
end
|
@@ -0,0 +1,251 @@
|
|
1
|
+
# Exercises the per-species raw penguin datasets and the combined
# Datasets::Penguins dataset.
class PenguinsTest < Test::Unit::TestCase
  sub_test_case("PenguinsRawData::SpeciesBase") do
    # Each species dataset's data path is compared relative to its cache directory.
    test("#data_path") do
      species_classes = [
        Datasets::PenguinsRawData::Adelie,
        Datasets::PenguinsRawData::Gentoo,
        Datasets::PenguinsRawData::Chinstrap,
      ]
      data_paths = species_classes.map do |species_class|
        dataset = species_class.new
        cache_dir = dataset.send(:cache_dir_path)
        dataset.data_path.relative_path_from(cache_dir).to_s
      end
      assert_equal(["penguins/adelie.csv", "penguins/gentoo.csv", "penguins/chinstrap.csv"],
                   data_paths)
    end
  end

  sub_test_case("Adelie") do
    def setup
      @dataset = Datasets::PenguinsRawData::Adelie.new
    end

    # Checks the record count together with the first and last records.
    test("#each") do
      records = @dataset.each.to_a

      expected_first = {
        study_name: "PAL0708",
        sample_number: 1,
        species: "Adelie Penguin (Pygoscelis adeliae)",
        region: "Anvers",
        island: "Torgersen",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N1A1",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2007, 11, 11),
        culmen_length_mm: 39.1,
        culmen_depth_mm: 18.7,
        flipper_length_mm: 181,
        body_mass_g: 3750,
        sex: "MALE",
        delta_15_n_permil: nil,
        delta_13_c_permil: nil,
        comments: "Not enough blood for isotopes.",
      }
      expected_last = {
        study_name: "PAL0910",
        sample_number: 152,
        species: "Adelie Penguin (Pygoscelis adeliae)",
        region: "Anvers",
        island: "Dream",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N85A2",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2009, 11, 17),
        culmen_length_mm: 41.5,
        culmen_depth_mm: 18.5,
        flipper_length_mm: 201,
        body_mass_g: 4000,
        sex: "MALE",
        delta_15_n_permil: 8.89640,
        delta_13_c_permil: -26.06967,
        comments: nil,
      }

      assert_equal([152, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("Gentoo") do
    def setup
      @dataset = Datasets::PenguinsRawData::Gentoo.new
    end

    # Checks the record count together with the first and last records.
    test("#each") do
      records = @dataset.each.to_a

      expected_first = {
        study_name: "PAL0708",
        sample_number: 1,
        species: "Gentoo penguin (Pygoscelis papua)",
        region: "Anvers",
        island: "Biscoe",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N31A1",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2007, 11, 27),
        culmen_length_mm: 46.1,
        culmen_depth_mm: 13.2,
        flipper_length_mm: 211,
        body_mass_g: 4500,
        sex: "FEMALE",
        delta_15_n_permil: 7.993,
        delta_13_c_permil: -25.5139,
        comments: nil,
      }
      expected_last = {
        study_name: "PAL0910",
        sample_number: 124,
        species: "Gentoo penguin (Pygoscelis papua)",
        region: "Anvers",
        island: "Biscoe",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N43A2",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2009, 11, 22),
        culmen_length_mm: 49.9,
        culmen_depth_mm: 16.1,
        flipper_length_mm: 213,
        body_mass_g: 5400,
        sex: "MALE",
        delta_15_n_permil: 8.3639,
        delta_13_c_permil: -26.15531,
        comments: nil,
      }

      assert_equal([124, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("Chinstrap") do
    def setup
      @dataset = Datasets::PenguinsRawData::Chinstrap.new
    end

    # Checks the record count together with the first and last records.
    test("#each") do
      records = @dataset.each.to_a

      expected_first = {
        study_name: "PAL0708",
        sample_number: 1,
        species: "Chinstrap penguin (Pygoscelis antarctica)",
        region: "Anvers",
        island: "Dream",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N61A1",
        clutch_completion: "No",
        date_egg: DateTime.new(2007, 11, 19),
        culmen_length_mm: 46.5,
        culmen_depth_mm: 17.9,
        flipper_length_mm: 192,
        body_mass_g: 3500,
        sex: "FEMALE",
        delta_15_n_permil: 9.03935,
        delta_13_c_permil: -24.30229,
        comments: "Nest never observed with full clutch.",
      }
      expected_last = {
        study_name: "PAL0910",
        sample_number: 68,
        species: "Chinstrap penguin (Pygoscelis antarctica)",
        region: "Anvers",
        island: "Dream",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N100A2",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2009, 11, 21),
        culmen_length_mm: 50.2,
        culmen_depth_mm: 18.7,
        flipper_length_mm: 198,
        body_mass_g: 3775,
        sex: "FEMALE",
        delta_15_n_permil: 9.39305,
        delta_13_c_permil: -24.25255,
        comments: nil,
      }

      assert_equal([68, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("Penguins") do
    def setup
      @dataset = Datasets::Penguins.new
    end

    # Species names are expected to appear in this order of first occurrence.
    test("order of species") do
      species_values = @dataset.map(&:species).uniq
      assert_equal(["Adelie", "Chinstrap", "Gentoo"],
                   species_values)
    end

    # After dropping nil, only the two lowercase sex values should remain.
    test("data cleansing") do
      sex_values = @dataset.map(&:sex).uniq.compact.sort
      assert_equal(["female", "male"],
                   sex_values)
    end

    # Checks the total count and the records found at selected positions.
    test("#each") do
      records = @dataset.each.to_a

      # Record position => expected contents at that position.
      expected_by_index = {
        0 => {
          species: "Adelie",
          island: "Torgersen",
          bill_length_mm: 39.1,
          bill_depth_mm: 18.7,
          flipper_length_mm: 181,
          body_mass_g: 3750,
          sex: "male",
          year: 2007,
        },
        152 => {
          species: "Chinstrap",
          island: "Dream",
          bill_length_mm: 46.5,
          bill_depth_mm: 17.9,
          flipper_length_mm: 192,
          body_mass_g: 3500,
          sex: "female",
          year: 2007,
        },
        220 => {
          species: "Gentoo",
          island: "Biscoe",
          bill_length_mm: 46.1,
          bill_depth_mm: 13.2,
          flipper_length_mm: 211,
          body_mass_g: 4500,
          sex: "female",
          year: 2007,
        },
        -1 => {
          species: "Gentoo",
          island: "Biscoe",
          bill_length_mm: 49.9,
          bill_depth_mm: 16.1,
          flipper_length_mm: 213,
          body_mass_g: 5400,
          sex: "male",
          year: 2009,
        },
      }
      actual_records = expected_by_index.keys.map { |index| records[index].to_h }

      assert_equal([344, *expected_by_index.values],
                   [records.size, *actual_records])
    end
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# Exercises Datasets::RdatasetsList (the dataset index) and
# Datasets::Rdatasets (a single dataset looked up by package and name).
class RdatasetsTest < Test::Unit::TestCase
  sub_test_case("RdatasetsList") do
    def setup
      @dataset = Datasets::RdatasetsList.new
    end

    sub_test_case("#each") do
      # Filters the list by package, then checks count, first and last entries.
      test("with package_name") do
        records = @dataset.filter(package: "datasets").to_a

        expected_first = {
          package: "datasets",
          dataset: "ability.cov",
          title: "Ability and Intelligence Tests",
          rows: 6,
          cols: 8,
          n_binary: 0,
          n_character: 0,
          n_factor: 0,
          n_logical: 0,
          n_numeric: 8,
          csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
          doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html",
        }
        expected_last = {
          package: "datasets",
          dataset: "WWWusage",
          title: "Internet Usage per Minute",
          rows: 100,
          cols: 2,
          n_binary: 0,
          n_character: 0,
          n_factor: 0,
          n_logical: 0,
          n_numeric: 2,
          csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
          doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html",
        }

        assert_equal([84, expected_first, expected_last],
                     [records.size, records.first.to_h, records.last.to_h])
      end

      # Iterates the whole list, then checks count, first and last entries.
      test("without package_name") do
        records = @dataset.each.to_a

        expected_first = {
          package: "AER",
          dataset: "Affairs",
          title: "Fair's Extramarital Affairs Data",
          rows: 601,
          cols: 9,
          n_binary: 2,
          n_character: 0,
          n_factor: 2,
          n_logical: 0,
          n_numeric: 7,
          csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
          doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html",
        }
        expected_last = {
          package: "vcd",
          dataset: "WomenQueue",
          title: "Women in Queues",
          rows: 11,
          cols: 2,
          n_binary: 0,
          n_character: 0,
          n_factor: 1,
          n_logical: 0,
          n_numeric: 1,
          csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
          doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html",
        }

        assert_equal([1714, expected_first, expected_last],
                     [records.size, records.first.to_h, records.last.to_h])
      end
    end
  end

  sub_test_case("Rdatasets") do
    sub_test_case("datasets") do
      sub_test_case("AirPassengers") do
        def setup
          @dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
        end

        # Records are compared directly (no #to_h conversion) against hashes.
        test("#each") do
          records = @dataset.each.to_a
          expected_first = { time: 1949, value: 112 }
          expected_last = { time: 1960.91666666667, value: 432 }

          assert_equal([144, expected_first, expected_last],
                       [records.size, records.first, records.last])
        end

        test("#metadata.id") do
          assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
        end

        test("#metadata.description") do
          description = @dataset.metadata.description
          assert { description.include?("Monthly Airline Passenger Numbers 1949-1960") }
        end
      end

      # An unknown dataset name within a known package must be rejected.
      test("invalid dataset name") do
        assert_raise(ArgumentError) { Datasets::Rdatasets.new("datasets", "invalid datasets name") }
      end
    end

    # An unknown package name must be rejected.
    test("invalid package name") do
      assert_raise(ArgumentError) { Datasets::Rdatasets.new("invalid package name", "AirPassengers") }
    end
  end
end
|