red-datasets 0.0.7 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,74 @@
# Tests for Datasets::Hepatitis: checks the number of records, the contents
# of the first and last records, and the start of the metadata description.
class HepatitisTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::Hepatitis.new
  end

  # Builds a Datasets::Hepatitis::Record from positional arguments.
  # NOTE(review): not referenced by any test visible in this class.
  def record(*args)
    Datasets::Hepatitis::Record.new(*args)
  end

  test("#each") do
    expected_first = {
      label: :live,
      age: 30,
      sex: :female,
      steroid: false,
      antivirals: true,
      fatigue: true,
      malaise: true,
      anorexia: true,
      liver_big: false,
      liver_firm: true,
      spleen_palpable: true,
      spiders: true,
      ascites: true,
      varices: true,
      bilirubin: 1.0,
      alkaline_phosphate: 85,
      sgot: 18,
      albumin: 4.0,
      protime: nil,
      histology: false,
    }
    expected_last = {
      label: :die,
      age: 43,
      sex: :male,
      steroid: true,
      antivirals: true,
      fatigue: false,
      malaise: true,
      anorexia: true,
      liver_big: true,
      liver_firm: true,
      spleen_palpable: false,
      spiders: false,
      ascites: false,
      varices: true,
      bilirubin: 1.2,
      alkaline_phosphate: 100,
      sgot: 19,
      albumin: 3.1,
      protime: 42,
      histology: true,
    }

    records = @dataset.each.to_a
    # Compare size and both boundary records in one assertion so a failure
    # reports all three together.
    assert_equal([155, expected_first, expected_last],
                 [records.size, records.first.to_h, records.last.to_h])
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert { description.start_with?("1. Title: Hepatitis Domain") }
    end
  end
end
@@ -0,0 +1,47 @@
# Tests for Datasets::LIBSVMDatasetList: checks the first listed entry and
# the start of the metadata description.
class LIBSVMDatasetListTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::LIBSVMDatasetList.new
  end

  test("#each") do
    # Adjacent string literals are joined by the parser into one string.
    preprocessing =
      "The original Adult data set has 14 features, " \
      "among which six are continuous and eight are " \
      "categorical. In this data set, continuous features " \
      "are discretized into quantiles, and each quantile is " \
      "represented by a binary feature. Also, a categorical " \
      "feature with m categories is converted to m binary " \
      "features. Details on how each feature is converted " \
      "can be found in the beginning of each file from this " \
      "page. [JP98a]"
    training_file = {
      name: "a1a",
      url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
      note: nil,
    }
    testing_file = {
      name: "a1a.t",
      url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
      note: "testing",
    }
    expected = {
      name: "a1a",
      source: "UCI / Adult",
      preprocessing: preprocessing,
      n_classes: 2,
      n_data: 1605,
      n_features: 123,
      files: [training_file, testing_file],
    }

    assert_equal(expected, @dataset.first.to_h)
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert { description.start_with?("This page contains many classification, ") }
    end
  end
end
@@ -0,0 +1,205 @@
# Tests for Datasets::LIBSVM: each test loads one named dataset and compares
# its first record (as a hash) with a hand-written expectation.
class LIBSVMDatasetTest < Test::Unit::TestCase
  test(":note") do
    dataset = Datasets::LIBSVM.new("a1a", note: "testing")
    set_features = [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83]
    expected = expected_record(-1, 123, 0,
                               set_features.map { |number| [number, 1] })
    assert_equal(expected, dataset.first.to_h)
  end

  test(":default_feature_value") do
    dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
    set_features = [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83]
    expected = expected_record(-1, 123, nil,
                               set_features.map { |number| [number, 1] })
    assert_equal(expected, dataset.first.to_h)
  end

  test("classification") do
    dataset = Datasets::LIBSVM.new("a1a")
    set_features = [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83]
    expected = expected_record(-1, 123, 0,
                               set_features.map { |number| [number, 1] })
    assert_equal(expected, dataset.first.to_h)
  end

  test("regression") do
    dataset = Datasets::LIBSVM.new("abalone")
    values = [1, 0.455, 0.365, 0.095, 0.514, 0.2245, 0.101, 0.15]
    expected = expected_record(15, 8, 0, number_from_one(values))
    assert_equal(expected, dataset.first.to_h)
  end

  test("multi-label") do
    dataset = Datasets::LIBSVM.new("mediamill (exp1)")
    # Values for features 1..120, in order, five per row.
    values = [
      0.380877, 0.494079, 0.540009, 0.422926, 0.158318, # 1-5
      0.326975, 0.390861, 0.527121, 0.254052, 0.223731, # 6-10
      0.040285, 0.141133, 0.112249, 0.263171, 0.147020, # 11-15
      0.472414, 0.592614, 0.653138, 0.499867, 0.196520, # 16-20
      0.403892, 0.482395, 0.619219, 0.320346, 0.281251, # 21-25
      0.054750, 0.180459, 0.139964, 0.319925, 0.181216, # 26-30
      0.364294, 0.407211, 0.368926, 0.427661, 0.211391, # 31-35
      0.364345, 0.370710, 0.409107, 0.289299, 0.243053, # 36-40
      0.063121, 0.193587, 0.158755, 0.316054, 0.197410, # 41-45
      0.656168, 0.678760, 0.650831, 0.674636, 0.492428, # 46-50
      0.623887, 0.610622, 0.678219, 0.574774, 0.523073, # 51-55
      0.206804, 0.496294, 0.429221, 0.586611, 0.471550, # 56-60
      0.284480, 0.432466, 0.498075, 0.408141, 0.102713, # 61-65
      0.303028, 0.309501, 0.444855, 0.191727, 0.174895, # 66-70
      0.034143, 0.153099, 0.068318, 0.217020, 0.099688, # 71-75
      0.409862, 0.561918, 0.612031, 0.514471, 0.146015, # 76-80
      0.398807, 0.383295, 0.548485, 0.282937, 0.252712, # 81-85
      0.051008, 0.223110, 0.098112, 0.299672, 0.144873, # 86-90
      0.308488, 0.358478, 0.352077, 0.394686, 0.157513, # 91-95
      0.339370, 0.321558, 0.341373, 0.247969, 0.206070, # 96-100
      0.061001, 0.216793, 0.112389, 0.273648, 0.152745, # 101-105
      0.598081, 0.621687, 0.607213, 0.644025, 0.394948, # 106-110
      0.593651, 0.551529, 0.574392, 0.511032, 0.463997, # 111-115
      0.202034, 0.492341, 0.317983, 0.547807, 0.393778, # 116-120
    ]
    expected = expected_record([65, 67, 11, 31], 120, 0,
                               number_from_one(values))
    assert_equal(expected, dataset.first.to_h)
  end

  test("string") do
    # TODO
  end

  private

  # Builds the hash a record is expected to convert to.
  #
  # The result holds :label plus integer keys 0...n_features, each set to
  # +default+. Every [number, value] pair in +overrides+ then replaces the
  # entry at key (number - 1), i.e. the numbers are one greater than the keys.
  def expected_record(label, n_features, default, overrides)
    record = {label: label}
    n_features.times { |key| record[key] = default }
    overrides.each { |number, value| record[number - 1] = value }
    record
  end

  # Pairs each value with its position counted from 1: [[1, v1], [2, v2], ...].
  def number_from_one(values)
    values.each_with_index.map { |value, index| [index + 1, value] }
  end
end
@@ -0,0 +1,80 @@
# Tests for Datasets::Mushroom: checks the number of records, the contents
# of the first and last records, and the start of the metadata description.
class MushroomTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::Mushroom.new
  end

  # Builds a Datasets::Mushroom::Record from positional arguments.
  # NOTE(review): not referenced by any test visible in this class.
  def record(*args)
    Datasets::Mushroom::Record.new(*args)
  end

  test("#each") do
    expected_first = {
      label: "poisonous",
      cap_shape: "convex",
      cap_surface: "smooth",
      cap_color: "brown",
      bruises: "bruises",
      odor: "pungent",
      gill_attachment: "free",
      gill_spacing: "close",
      gill_size: "narrow",
      gill_color: "black",
      stalk_shape: "enlarging",
      stalk_root: "equal",
      stalk_surface_above_ring: "smooth",
      stalk_surface_below_ring: "smooth",
      stalk_color_above_ring: "white",
      stalk_color_below_ring: "white",
      veil_type: "partial",
      veil_color: "white",
      n_rings: 1,
      ring_type: "pendant",
      spore_print_color: "black",
      population: "scattered",
      habitat: "urban",
    }
    expected_last = {
      label: "edible",
      cap_shape: "convex",
      cap_surface: "smooth",
      cap_color: "brown",
      bruises: "no",
      odor: "none",
      gill_attachment: "attached",
      gill_spacing: "close",
      gill_size: "broad",
      gill_color: "yellow",
      stalk_shape: "enlarging",
      stalk_root: "missing",
      stalk_surface_above_ring: "smooth",
      stalk_surface_below_ring: "smooth",
      stalk_color_above_ring: "orange",
      stalk_color_below_ring: "orange",
      veil_type: "partial",
      veil_color: "orange",
      n_rings: 1,
      ring_type: "pendant",
      spore_print_color: "orange",
      population: "clustered",
      habitat: "leaves",
    }

    records = @dataset.each.to_a
    # Compare size and both boundary records in one assertion so a failure
    # reports all three together.
    assert_equal([8124, expected_first, expected_last],
                 [records.size, records.first.to_h, records.last.to_h])
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert { description.start_with?("1. Title: Mushroom Database") }
    end
  end
end
@@ -0,0 +1,251 @@
# Tests for the penguin datasets: the three per-species raw datasets under
# Datasets::PenguinsRawData and the combined Datasets::Penguins.
class PenguinsTest < Test::Unit::TestCase
  sub_test_case("PenguinsRawData::SpeciesBase") do
    test("#data_path") do
      species_classes = [
        Datasets::PenguinsRawData::Adelie,
        Datasets::PenguinsRawData::Gentoo,
        Datasets::PenguinsRawData::Chinstrap,
      ]
      # Express each data path relative to that dataset's cache directory.
      data_paths = species_classes.map do |species_class|
        dataset = species_class.new
        cache_dir = dataset.send(:cache_dir_path)
        dataset.data_path.relative_path_from(cache_dir).to_s
      end
      expected = [
        "penguins/adelie.csv",
        "penguins/gentoo.csv",
        "penguins/chinstrap.csv",
      ]
      assert_equal(expected, data_paths)
    end
  end

  sub_test_case("Adelie") do
    def setup
      @dataset = Datasets::PenguinsRawData::Adelie.new
    end

    test("#each") do
      expected_first = {
        study_name: "PAL0708",
        sample_number: 1,
        species: "Adelie Penguin (Pygoscelis adeliae)",
        region: "Anvers",
        island: "Torgersen",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N1A1",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2007, 11, 11),
        culmen_length_mm: 39.1,
        culmen_depth_mm: 18.7,
        flipper_length_mm: 181,
        body_mass_g: 3750,
        sex: "MALE",
        delta_15_n_permil: nil,
        delta_13_c_permil: nil,
        comments: "Not enough blood for isotopes.",
      }
      expected_last = {
        study_name: "PAL0910",
        sample_number: 152,
        species: "Adelie Penguin (Pygoscelis adeliae)",
        region: "Anvers",
        island: "Dream",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N85A2",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2009, 11, 17),
        culmen_length_mm: 41.5,
        culmen_depth_mm: 18.5,
        flipper_length_mm: 201,
        body_mass_g: 4000,
        sex: "MALE",
        delta_15_n_permil: 8.89640,
        delta_13_c_permil: -26.06967,
        comments: nil,
      }

      records = @dataset.each.to_a
      assert_equal([152, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("Gentoo") do
    def setup
      @dataset = Datasets::PenguinsRawData::Gentoo.new
    end

    test("#each") do
      expected_first = {
        study_name: "PAL0708",
        sample_number: 1,
        species: "Gentoo penguin (Pygoscelis papua)",
        region: "Anvers",
        island: "Biscoe",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N31A1",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2007, 11, 27),
        culmen_length_mm: 46.1,
        culmen_depth_mm: 13.2,
        flipper_length_mm: 211,
        body_mass_g: 4500,
        sex: "FEMALE",
        delta_15_n_permil: 7.993,
        delta_13_c_permil: -25.5139,
        comments: nil,
      }
      expected_last = {
        study_name: "PAL0910",
        sample_number: 124,
        species: "Gentoo penguin (Pygoscelis papua)",
        region: "Anvers",
        island: "Biscoe",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N43A2",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2009, 11, 22),
        culmen_length_mm: 49.9,
        culmen_depth_mm: 16.1,
        flipper_length_mm: 213,
        body_mass_g: 5400,
        sex: "MALE",
        delta_15_n_permil: 8.3639,
        delta_13_c_permil: -26.15531,
        comments: nil,
      }

      records = @dataset.each.to_a
      assert_equal([124, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("Chinstrap") do
    def setup
      @dataset = Datasets::PenguinsRawData::Chinstrap.new
    end

    test("#each") do
      expected_first = {
        study_name: "PAL0708",
        sample_number: 1,
        species: "Chinstrap penguin (Pygoscelis antarctica)",
        region: "Anvers",
        island: "Dream",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N61A1",
        clutch_completion: "No",
        date_egg: DateTime.new(2007, 11, 19),
        culmen_length_mm: 46.5,
        culmen_depth_mm: 17.9,
        flipper_length_mm: 192,
        body_mass_g: 3500,
        sex: "FEMALE",
        delta_15_n_permil: 9.03935,
        delta_13_c_permil: -24.30229,
        comments: "Nest never observed with full clutch.",
      }
      expected_last = {
        study_name: "PAL0910",
        sample_number: 68,
        species: "Chinstrap penguin (Pygoscelis antarctica)",
        region: "Anvers",
        island: "Dream",
        stage: "Adult, 1 Egg Stage",
        individual_id: "N100A2",
        clutch_completion: "Yes",
        date_egg: DateTime.new(2009, 11, 21),
        culmen_length_mm: 50.2,
        culmen_depth_mm: 18.7,
        flipper_length_mm: 198,
        body_mass_g: 3775,
        sex: "FEMALE",
        delta_15_n_permil: 9.39305,
        delta_13_c_permil: -24.25255,
        comments: nil,
      }

      records = @dataset.each.to_a
      assert_equal([68, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("Penguins") do
    def setup
      @dataset = Datasets::Penguins.new
    end

    test("order of species") do
      species_values = @dataset.map(&:species).uniq
      assert_equal(["Adelie", "Chinstrap", "Gentoo"], species_values)
    end

    test("data cleansing") do
      # nil entries are dropped before sorting.
      sex_values = @dataset.map(&:sex).uniq.compact.sort
      assert_equal(["female", "male"], sex_values)
    end

    test("#each") do
      expected_at_0 = {
        species: "Adelie",
        island: "Torgersen",
        bill_length_mm: 39.1,
        bill_depth_mm: 18.7,
        flipper_length_mm: 181,
        body_mass_g: 3750,
        sex: "male",
        year: 2007,
      }
      expected_at_152 = {
        species: "Chinstrap",
        island: "Dream",
        bill_length_mm: 46.5,
        bill_depth_mm: 17.9,
        flipper_length_mm: 192,
        body_mass_g: 3500,
        sex: "female",
        year: 2007,
      }
      expected_at_220 = {
        species: "Gentoo",
        island: "Biscoe",
        bill_length_mm: 46.1,
        bill_depth_mm: 13.2,
        flipper_length_mm: 211,
        body_mass_g: 4500,
        sex: "female",
        year: 2007,
      }
      expected_last = {
        species: "Gentoo",
        island: "Biscoe",
        bill_length_mm: 49.9,
        bill_depth_mm: 16.1,
        flipper_length_mm: 213,
        body_mass_g: 5400,
        sex: "male",
        year: 2009,
      }

      records = @dataset.each.to_a
      assert_equal([
                     344,
                     expected_at_0,
                     expected_at_152,
                     expected_at_220,
                     expected_last,
                   ],
                   [
                     records.size,
                     records.first.to_h,
                     records[152].to_h,
                     records[220].to_h,
                     records.last.to_h,
                   ])
    end
  end
end