red-datasets 0.0.7 → 0.1.2

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ class HepatitisTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Hepatitis.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Hepatitis::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 155,
14
+ {
15
+ :label => :live,
16
+ :age => 30,
17
+ :sex => :female,
18
+ :steroid => false,
19
+ :antivirals => true,
20
+ :fatigue => true,
21
+ :malaise => true,
22
+ :anorexia => true,
23
+ :liver_big => false,
24
+ :liver_firm => true,
25
+ :spleen_palpable => true,
26
+ :spiders => true,
27
+ :ascites => true,
28
+ :varices => true,
29
+ :bilirubin => 1.0,
30
+ :alkaline_phosphate => 85,
31
+ :sgot => 18,
32
+ :albumin => 4.0,
33
+ :protime => nil,
34
+ :histology => false,
35
+ },
36
+ {
37
+ :label => :die,
38
+ :age => 43,
39
+ :sex => :male,
40
+ :steroid => true,
41
+ :antivirals => true,
42
+ :fatigue => false,
43
+ :malaise => true,
44
+ :anorexia => true,
45
+ :liver_big => true,
46
+ :liver_firm => true,
47
+ :spleen_palpable => false,
48
+ :spiders => false,
49
+ :ascites => false,
50
+ :varices => true,
51
+ :bilirubin => 1.2,
52
+ :alkaline_phosphate => 100,
53
+ :sgot => 19,
54
+ :albumin => 3.1,
55
+ :protime => 42,
56
+ :histology => true,
57
+ }
58
+ ],
59
+ [
60
+ records.size,
61
+ records[0].to_h,
62
+ records[-1].to_h,
63
+ ])
64
+ end
65
+
66
+ sub_test_case("#metadata") do
67
+ test("#description") do
68
+ description = @dataset.metadata.description
69
+ assert do
70
+ description.start_with?("1. Title: Hepatitis Domain")
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,47 @@
1
+ class LIBSVMDatasetListTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::LIBSVMDatasetList.new
4
+ end
5
+
6
+ test("#each") do
7
+ assert_equal({
8
+ name: "a1a",
9
+ source: "UCI / Adult",
10
+ preprocessing:
11
+ "The original Adult data set has 14 features, " +
12
+ "among which six are continuous and eight are " +
13
+ "categorical. In this data set, continuous features " +
14
+ "are discretized into quantiles, and each quantile is " +
15
+ "represented by a binary feature. Also, a categorical " +
16
+ "feature with m categories is converted to m binary " +
17
+ "features. Details on how each feature is converted " +
18
+ "can be found in the beginning of each file from this " +
19
+ "page. [JP98a]",
20
+ n_classes: 2,
21
+ n_data: 1605,
22
+ n_features: 123,
23
+ files: [
24
+ {
25
+ name: "a1a",
26
+ url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
27
+ note: nil,
28
+ },
29
+ {
30
+ name: "a1a.t",
31
+ url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
32
+ note: "testing",
33
+ }
34
+ ],
35
+ },
36
+ @dataset.first.to_h)
37
+ end
38
+
39
+ sub_test_case("#metadata") do
40
+ test("#description") do
41
+ description = @dataset.metadata.description
42
+ assert do
43
+ description.start_with?("This page contains many classification, ")
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,205 @@
1
+ class LIBSVMDatasetTest < Test::Unit::TestCase
2
+ test(":note") do
3
+ dataset = Datasets::LIBSVM.new("a1a", note: "testing")
4
+ hash = {label: -1}
5
+ n_features = 123
6
+ n_features.times do |i|
7
+ hash[i] = 0
8
+ end
9
+ [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i|
10
+ hash[i - 1] = 1
11
+ end
12
+ assert_equal(hash,
13
+ dataset.first.to_h)
14
+ end
15
+
16
+ test(":default_feature_value") do
17
+ dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
18
+ hash = {label: -1}
19
+ n_features = 123
20
+ n_features.times do |i|
21
+ hash[i] = nil
22
+ end
23
+ [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
24
+ hash[i - 1] = 1
25
+ end
26
+ assert_equal(hash,
27
+ dataset.first.to_h)
28
+ end
29
+
30
+ test("classification") do
31
+ dataset = Datasets::LIBSVM.new("a1a")
32
+ hash = {label: -1}
33
+ n_features = 123
34
+ n_features.times do |i|
35
+ hash[i] = 0
36
+ end
37
+ [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
38
+ hash[i - 1] = 1
39
+ end
40
+ assert_equal(hash,
41
+ dataset.first.to_h)
42
+ end
43
+
44
+ test("regression") do
45
+ dataset = Datasets::LIBSVM.new("abalone")
46
+ hash = {label: 15}
47
+ n_features = 8
48
+ n_features.times do |i|
49
+ hash[i] = 0
50
+ end
51
+ [
52
+ [1, 1],
53
+ [2, 0.455],
54
+ [3, 0.365],
55
+ [4, 0.095],
56
+ [5, 0.514],
57
+ [6, 0.2245],
58
+ [7, 0.101],
59
+ [8, 0.15],
60
+ ].each do |i, value|
61
+ hash[i - 1] = value
62
+ end
63
+ assert_equal(hash,
64
+ dataset.first.to_h)
65
+ end
66
+
67
+ test("multi-label") do
68
+ dataset = Datasets::LIBSVM.new("mediamill (exp1)")
69
+ hash = {label: [65, 67, 11, 31]}
70
+ n_features = 120
71
+ n_features.times do |i|
72
+ hash[i] = 0
73
+ end
74
+ [
75
+ [1, 0.380877],
76
+ [2, 0.494079],
77
+ [3, 0.540009],
78
+ [4, 0.422926],
79
+ [5, 0.158318],
80
+ [6, 0.326975],
81
+ [7, 0.390861],
82
+ [8, 0.527121],
83
+ [9, 0.254052],
84
+ [10, 0.223731],
85
+ [11, 0.040285],
86
+ [12, 0.141133],
87
+ [13, 0.112249],
88
+ [14, 0.263171],
89
+ [15, 0.147020],
90
+ [16, 0.472414],
91
+ [17, 0.592614],
92
+ [18, 0.653138],
93
+ [19, 0.499867],
94
+ [20, 0.196520],
95
+ [21, 0.403892],
96
+ [22, 0.482395],
97
+ [23, 0.619219],
98
+ [24, 0.320346],
99
+ [25, 0.281251],
100
+ [26, 0.054750],
101
+ [27, 0.180459],
102
+ [28, 0.139964],
103
+ [29, 0.319925],
104
+ [30, 0.181216],
105
+ [31, 0.364294],
106
+ [32, 0.407211],
107
+ [33, 0.368926],
108
+ [34, 0.427661],
109
+ [35, 0.211391],
110
+ [36, 0.364345],
111
+ [37, 0.370710],
112
+ [38, 0.409107],
113
+ [39, 0.289299],
114
+ [40, 0.243053],
115
+ [41, 0.063121],
116
+ [42, 0.193587],
117
+ [43, 0.158755],
118
+ [44, 0.316054],
119
+ [45, 0.197410],
120
+ [46, 0.656168],
121
+ [47, 0.678760],
122
+ [48, 0.650831],
123
+ [49, 0.674636],
124
+ [50, 0.492428],
125
+ [51, 0.623887],
126
+ [52, 0.610622],
127
+ [53, 0.678219],
128
+ [54, 0.574774],
129
+ [55, 0.523073],
130
+ [56, 0.206804],
131
+ [57, 0.496294],
132
+ [58, 0.429221],
133
+ [59, 0.586611],
134
+ [60, 0.471550],
135
+ [61, 0.284480],
136
+ [62, 0.432466],
137
+ [63, 0.498075],
138
+ [64, 0.408141],
139
+ [65, 0.102713],
140
+ [66, 0.303028],
141
+ [67, 0.309501],
142
+ [68, 0.444855],
143
+ [69, 0.191727],
144
+ [70, 0.174895],
145
+ [71, 0.034143],
146
+ [72, 0.153099],
147
+ [73, 0.068318],
148
+ [74, 0.217020],
149
+ [75, 0.099688],
150
+ [76, 0.409862],
151
+ [77, 0.561918],
152
+ [78, 0.612031],
153
+ [79, 0.514471],
154
+ [80, 0.146015],
155
+ [81, 0.398807],
156
+ [82, 0.383295],
157
+ [83, 0.548485],
158
+ [84, 0.282937],
159
+ [85, 0.252712],
160
+ [86, 0.051008],
161
+ [87, 0.223110],
162
+ [88, 0.098112],
163
+ [89, 0.299672],
164
+ [90, 0.144873],
165
+ [91, 0.308488],
166
+ [92, 0.358478],
167
+ [93, 0.352077],
168
+ [94, 0.394686],
169
+ [95, 0.157513],
170
+ [96, 0.339370],
171
+ [97, 0.321558],
172
+ [98, 0.341373],
173
+ [99, 0.247969],
174
+ [100, 0.206070],
175
+ [101, 0.061001],
176
+ [102, 0.216793],
177
+ [103, 0.112389],
178
+ [104, 0.273648],
179
+ [105, 0.152745],
180
+ [106, 0.598081],
181
+ [107, 0.621687],
182
+ [108, 0.607213],
183
+ [109, 0.644025],
184
+ [110, 0.394948],
185
+ [111, 0.593651],
186
+ [112, 0.551529],
187
+ [113, 0.574392],
188
+ [114, 0.511032],
189
+ [115, 0.463997],
190
+ [116, 0.202034],
191
+ [117, 0.492341],
192
+ [118, 0.317983],
193
+ [119, 0.547807],
194
+ [120, 0.393778],
195
+ ].each do |i, value|
196
+ hash[i - 1] = value
197
+ end
198
+ assert_equal(hash,
199
+ dataset.first.to_h)
200
+ end
201
+
202
+ test("string") do
203
+ # TODO
204
+ end
205
+ end
@@ -0,0 +1,80 @@
1
+ class MushroomTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Mushroom.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Mushroom::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 8124,
14
+ {
15
+ :label => "poisonous",
16
+ :cap_shape => "convex",
17
+ :cap_surface => "smooth",
18
+ :cap_color => "brown",
19
+ :bruises => "bruises",
20
+ :odor => "pungent",
21
+ :gill_attachment => "free",
22
+ :gill_spacing => "close",
23
+ :gill_size => "narrow",
24
+ :gill_color => "black",
25
+ :stalk_shape => "enlarging",
26
+ :stalk_root => "equal",
27
+ :stalk_surface_above_ring => "smooth",
28
+ :stalk_surface_below_ring => "smooth",
29
+ :stalk_color_above_ring => "white",
30
+ :stalk_color_below_ring => "white",
31
+ :veil_type => "partial",
32
+ :veil_color => "white",
33
+ :n_rings => 1,
34
+ :ring_type => "pendant",
35
+ :spore_print_color => "black",
36
+ :population => "scattered",
37
+ :habitat => "urban"
38
+ },
39
+ {
40
+ :label => "edible",
41
+ :cap_shape => "convex",
42
+ :cap_surface => "smooth",
43
+ :cap_color => "brown",
44
+ :bruises => "no",
45
+ :odor => "none",
46
+ :gill_attachment => "attached",
47
+ :gill_spacing => "close",
48
+ :gill_size => "broad",
49
+ :gill_color => "yellow",
50
+ :stalk_shape => "enlarging",
51
+ :stalk_root => "missing",
52
+ :stalk_surface_above_ring => "smooth",
53
+ :stalk_surface_below_ring => "smooth",
54
+ :stalk_color_above_ring => "orange",
55
+ :stalk_color_below_ring => "orange",
56
+ :veil_type => "partial",
57
+ :veil_color => "orange",
58
+ :n_rings => 1,
59
+ :ring_type => "pendant",
60
+ :spore_print_color => "orange",
61
+ :population => "clustered",
62
+ :habitat => "leaves"
63
+ }
64
+ ],
65
+ [
66
+ records.size,
67
+ records[0].to_h,
68
+ records[-1].to_h
69
+ ])
70
+ end
71
+
72
+ sub_test_case("#metadata") do
73
+ test("#description") do
74
+ description = @dataset.metadata.description
75
+ assert do
76
+ description.start_with?("1. Title: Mushroom Database")
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,251 @@
1
+ class PenguinsTest < Test::Unit::TestCase
2
+ sub_test_case("PenguinsRawData::SpeciesBase") do
3
+ test("#data_path") do
4
+ data_paths = [ Datasets::PenguinsRawData::Adelie,
5
+ Datasets::PenguinsRawData::Gentoo,
6
+ Datasets::PenguinsRawData::Chinstrap ].map {|cls|
7
+ dataset = cls.new
8
+ dataset.data_path.relative_path_from(dataset.send(:cache_dir_path)).to_s
9
+ }
10
+ assert_equal(["penguins/adelie.csv", "penguins/gentoo.csv", "penguins/chinstrap.csv"],
11
+ data_paths)
12
+ end
13
+ end
14
+
15
+ sub_test_case("Adelie") do
16
+ def setup
17
+ @dataset = Datasets::PenguinsRawData::Adelie.new
18
+ end
19
+
20
+ test("#each") do
21
+ records = @dataset.each.to_a
22
+ assert_equal([ 152,
23
+ {
24
+ study_name: "PAL0708",
25
+ sample_number: 1,
26
+ species: "Adelie Penguin (Pygoscelis adeliae)",
27
+ region: "Anvers",
28
+ island: "Torgersen",
29
+ stage: "Adult, 1 Egg Stage",
30
+ individual_id: "N1A1",
31
+ clutch_completion: "Yes",
32
+ date_egg: DateTime.new(2007, 11, 11),
33
+ culmen_length_mm: 39.1,
34
+ culmen_depth_mm: 18.7,
35
+ flipper_length_mm: 181,
36
+ body_mass_g: 3750,
37
+ sex: "MALE",
38
+ delta_15_n_permil: nil,
39
+ delta_13_c_permil: nil,
40
+ comments: "Not enough blood for isotopes."
41
+ },
42
+ {
43
+ study_name: "PAL0910",
44
+ sample_number: 152,
45
+ species: "Adelie Penguin (Pygoscelis adeliae)",
46
+ region: "Anvers",
47
+ island: "Dream",
48
+ stage: "Adult, 1 Egg Stage",
49
+ individual_id: "N85A2",
50
+ clutch_completion: "Yes",
51
+ date_egg: DateTime.new(2009, 11, 17),
52
+ culmen_length_mm: 41.5,
53
+ culmen_depth_mm: 18.5,
54
+ flipper_length_mm: 201,
55
+ body_mass_g: 4000,
56
+ sex: "MALE",
57
+ delta_15_n_permil: 8.89640,
58
+ delta_13_c_permil: -26.06967,
59
+ comments: nil
60
+ }
61
+ ],
62
+ [
63
+ records.size,
64
+ records[0].to_h,
65
+ records[-1].to_h
66
+ ])
67
+ end
68
+ end
69
+
70
+ sub_test_case("Gentoo") do
71
+ def setup
72
+ @dataset = Datasets::PenguinsRawData::Gentoo.new
73
+ end
74
+
75
+ test("#each") do
76
+ records = @dataset.each.to_a
77
+ assert_equal([ 124,
78
+ {
79
+ study_name: "PAL0708",
80
+ sample_number: 1,
81
+ species: "Gentoo penguin (Pygoscelis papua)",
82
+ region: "Anvers",
83
+ island: "Biscoe",
84
+ stage: "Adult, 1 Egg Stage",
85
+ individual_id: "N31A1",
86
+ clutch_completion: "Yes",
87
+ date_egg: DateTime.new(2007, 11, 27),
88
+ culmen_length_mm: 46.1,
89
+ culmen_depth_mm: 13.2,
90
+ flipper_length_mm: 211,
91
+ body_mass_g: 4500,
92
+ sex: "FEMALE",
93
+ delta_15_n_permil: 7.993,
94
+ delta_13_c_permil: -25.5139,
95
+ comments: nil
96
+ },
97
+ {
98
+ study_name: "PAL0910",
99
+ sample_number: 124,
100
+ species: "Gentoo penguin (Pygoscelis papua)",
101
+ region: "Anvers",
102
+ island: "Biscoe",
103
+ stage: "Adult, 1 Egg Stage",
104
+ individual_id: "N43A2",
105
+ clutch_completion: "Yes",
106
+ date_egg: DateTime.new(2009, 11, 22),
107
+ culmen_length_mm: 49.9,
108
+ culmen_depth_mm: 16.1,
109
+ flipper_length_mm: 213,
110
+ body_mass_g: 5400,
111
+ sex: "MALE",
112
+ delta_15_n_permil: 8.3639,
113
+ delta_13_c_permil: -26.15531,
114
+ comments: nil
115
+ }
116
+ ],
117
+ [
118
+ records.size,
119
+ records[0].to_h,
120
+ records[-1].to_h
121
+ ])
122
+ end
123
+ end
124
+
125
+ sub_test_case("Chinstrap") do
126
+ def setup
127
+ @dataset = Datasets::PenguinsRawData::Chinstrap.new
128
+ end
129
+
130
+ test("#each") do
131
+ records = @dataset.each.to_a
132
+ assert_equal([ 68,
133
+ {
134
+ study_name: "PAL0708",
135
+ sample_number: 1,
136
+ species: "Chinstrap penguin (Pygoscelis antarctica)",
137
+ region: "Anvers",
138
+ island: "Dream",
139
+ stage: "Adult, 1 Egg Stage",
140
+ individual_id: "N61A1",
141
+ clutch_completion: "No",
142
+ date_egg: DateTime.new(2007, 11, 19),
143
+ culmen_length_mm: 46.5,
144
+ culmen_depth_mm: 17.9,
145
+ flipper_length_mm: 192,
146
+ body_mass_g: 3500,
147
+ sex: "FEMALE",
148
+ delta_15_n_permil: 9.03935,
149
+ delta_13_c_permil: -24.30229,
150
+ comments: "Nest never observed with full clutch."
151
+ },
152
+ {
153
+ study_name: "PAL0910",
154
+ sample_number: 68,
155
+ species: "Chinstrap penguin (Pygoscelis antarctica)",
156
+ region: "Anvers",
157
+ island: "Dream",
158
+ stage: "Adult, 1 Egg Stage",
159
+ individual_id: "N100A2",
160
+ clutch_completion: "Yes",
161
+ date_egg: DateTime.new(2009, 11, 21),
162
+ culmen_length_mm: 50.2,
163
+ culmen_depth_mm: 18.7,
164
+ flipper_length_mm: 198,
165
+ body_mass_g: 3775,
166
+ sex: "FEMALE",
167
+ delta_15_n_permil: 9.39305,
168
+ delta_13_c_permil: -24.25255,
169
+ comments: nil
170
+ }
171
+ ],
172
+ [
173
+ records.size,
174
+ records[0].to_h,
175
+ records[-1].to_h
176
+ ])
177
+ end
178
+ end
179
+
180
+ sub_test_case("Penguins") do
181
+ def setup
182
+ @dataset = Datasets::Penguins.new
183
+ end
184
+
185
+ test("order of species") do
186
+ species_values = @dataset.map {|r| r.species }.uniq
187
+ assert_equal(["Adelie", "Chinstrap", "Gentoo"],
188
+ species_values)
189
+ end
190
+
191
+ test("data cleansing") do
192
+ sex_values = @dataset.map {|r| r.sex }.uniq.compact.sort
193
+ assert_equal(["female", "male"],
194
+ sex_values)
195
+ end
196
+
197
+ test("#each") do
198
+ records = @dataset.each.to_a
199
+ assert_equal([
200
+ 344,
201
+ {
202
+ species: "Adelie",
203
+ island: "Torgersen",
204
+ bill_length_mm: 39.1,
205
+ bill_depth_mm: 18.7,
206
+ flipper_length_mm: 181,
207
+ body_mass_g: 3750,
208
+ sex: "male",
209
+ year: 2007
210
+ },
211
+ {
212
+ species: "Chinstrap",
213
+ island: "Dream",
214
+ bill_length_mm: 46.5,
215
+ bill_depth_mm: 17.9,
216
+ flipper_length_mm: 192,
217
+ body_mass_g: 3500,
218
+ sex: "female",
219
+ year: 2007
220
+ },
221
+ {
222
+ species: "Gentoo",
223
+ island: "Biscoe",
224
+ bill_length_mm: 46.1,
225
+ bill_depth_mm: 13.2,
226
+ flipper_length_mm: 211,
227
+ body_mass_g: 4500,
228
+ sex: "female",
229
+ year: 2007
230
+ },
231
+ {
232
+ species: "Gentoo",
233
+ island: "Biscoe",
234
+ bill_length_mm: 49.9,
235
+ bill_depth_mm: 16.1,
236
+ flipper_length_mm: 213,
237
+ body_mass_g: 5400,
238
+ sex: "male",
239
+ year: 2009
240
+ }
241
+ ],
242
+ [
243
+ records.size,
244
+ records[0].to_h,
245
+ records[152].to_h,
246
+ records[220].to_h,
247
+ records[-1].to_h,
248
+ ])
249
+ end
250
+ end
251
+ end