red-datasets 0.0.6 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -7
- data/doc/text/news.md +124 -0
- data/lib/datasets.rb +18 -6
- data/lib/datasets/adult.rb +84 -0
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +13 -0
- data/lib/datasets/dictionary.rb +59 -0
- data/lib/datasets/downloader.rb +37 -62
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/fashion-mnist.rb +12 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/iris.rb +1 -1
- data/lib/datasets/libsvm-dataset-list.rb +277 -0
- data/lib/datasets/libsvm.rb +135 -0
- data/lib/datasets/mnist.rb +11 -8
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +125 -0
- data/lib/datasets/penn-treebank.rb +2 -9
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/table.rb +99 -3
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/lib/datasets/wine.rb +64 -0
- data/red-datasets.gemspec +4 -0
- data/test/helper.rb +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-adult.rb +126 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dictionary.rb +43 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-fashion-mnist.rb +137 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-mnist.rb +95 -70
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +239 -0
- data/test/test-penn-treebank.rb +6 -6
- data/test/test-postal-code-japan.rb +69 -0
- data/test/test-table.rb +144 -19
- data/test/test-wine.rb +58 -0
- metadata +89 -8
data/test/test-hepatitis.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Tests for Datasets::Hepatitis.
class HepatitisTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::Hepatitis.new
  end

  # Builds a Datasets::Hepatitis::Record from the given positional values.
  def record(*args)
    Datasets::Hepatitis::Record.new(*args)
  end

  # Checks the number of records and the contents of the first and the
  # last record (compared as hashes).
  test("#each") do
    records = @dataset.each.to_a

    expected_size = 155
    expected_first = {
      label: :live,
      age: 30,
      sex: :female,
      steroid: false,
      antivirals: true,
      fatigue: true,
      malaise: true,
      anorexia: true,
      liver_big: false,
      liver_firm: true,
      spleen_palpable: true,
      spiders: true,
      ascites: true,
      varices: true,
      bilirubin: 1.0,
      alkaline_phosphate: 85,
      sgot: 18,
      albumin: 4.0,
      protime: nil,
      histology: false,
    }
    expected_last = {
      label: :die,
      age: 43,
      sex: :male,
      steroid: true,
      antivirals: true,
      fatigue: false,
      malaise: true,
      anorexia: true,
      liver_big: true,
      liver_firm: true,
      spleen_palpable: false,
      spiders: false,
      ascites: false,
      varices: true,
      bilirubin: 1.2,
      alkaline_phosphate: 100,
      sgot: 19,
      albumin: 3.1,
      protime: 42,
      histology: true,
    }

    assert_equal([expected_size, expected_first, expected_last],
                 [records.size, records.first.to_h, records.last.to_h])
  end

  sub_test_case("#metadata") do
    # Only the beginning of the description text is checked.
    test("#description") do
      description = @dataset.metadata.description
      assert { description.start_with?("1. Title: Hepatitis Domain") }
    end
  end
end
|
data/test/test-libsvm-dataset-list.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Tests for Datasets::LIBSVMDatasetList.
class LIBSVMDatasetListTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::LIBSVMDatasetList.new
  end

  # Checks the first listed entry (compared as a hash), including its
  # downloadable files.
  test("#each") do
    preprocessing =
      "The original Adult data set has 14 features, " \
      "among which six are continuous and eight are " \
      "categorical. In this data set, continuous features " \
      "are discretized into quantiles, and each quantile is " \
      "represented by a binary feature. Also, a categorical " \
      "feature with m categories is converted to m binary " \
      "features. Details on how each feature is converted " \
      "can be found in the beginning of each file from this " \
      "page. [JP98a]"
    training_file = {
      name: "a1a",
      url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
      note: nil,
    }
    testing_file = {
      name: "a1a.t",
      url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
      note: "testing",
    }
    expected = {
      name: "a1a",
      source: "UCI / Adult",
      preprocessing: preprocessing,
      n_classes: 2,
      n_data: 1605,
      n_features: 123,
      files: [training_file, testing_file],
    }

    assert_equal(expected, @dataset.first.to_h)
  end

  sub_test_case("#metadata") do
    # Only the beginning of the description text is checked.
    test("#description") do
      description = @dataset.metadata.description
      assert { description.start_with?("This page contains many classification, ") }
    end
  end
end
|
data/test/test-libsvm.rb
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
# Tests for Datasets::LIBSVM.
#
# Every test builds the expected hash for the first record: a :label entry
# followed by one entry per feature, keyed by zero-based index. The feature
# ids listed in each test are one-based, hence the "- 1" when storing them.
class LIBSVMDatasetTest < Test::Unit::TestCase
  # Passing note: "testing" yields a different first record than the
  # default "a1a" dataset checked in the "classification" test.
  test(":note") do
    dataset = Datasets::LIBSVM.new("a1a", note: "testing")

    expected = {label: -1}
    123.times { |index| expected[index] = 0 }
    set_feature_ids = [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83]
    set_feature_ids.each { |feature_id| expected[feature_id - 1] = 1 }

    assert_equal(expected, dataset.first.to_h)
  end

  # With default_feature_value: nil, features that are not listed are
  # expected to be nil instead of 0.
  test(":default_feature_value") do
    dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)

    expected = {label: -1}
    123.times { |index| expected[index] = nil }
    set_feature_ids = [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83]
    set_feature_ids.each { |feature_id| expected[feature_id - 1] = 1 }

    assert_equal(expected, dataset.first.to_h)
  end

  test("classification") do
    dataset = Datasets::LIBSVM.new("a1a")

    expected = {label: -1}
    123.times { |index| expected[index] = 0 }
    set_feature_ids = [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83]
    set_feature_ids.each { |feature_id| expected[feature_id - 1] = 1 }

    assert_equal(expected, dataset.first.to_h)
  end

  test("regression") do
    dataset = Datasets::LIBSVM.new("abalone")

    expected = {label: 15}
    8.times { |index| expected[index] = 0 }
    feature_values = [
      [1, 1],
      [2, 0.455],
      [3, 0.365],
      [4, 0.095],
      [5, 0.514],
      [6, 0.2245],
      [7, 0.101],
      [8, 0.15],
    ]
    feature_values.each { |feature_id, value| expected[feature_id - 1] = value }

    assert_equal(expected, dataset.first.to_h)
  end

  # The expected label here is an array of values rather than a single one.
  test("multi-label") do
    dataset = Datasets::LIBSVM.new("mediamill (exp1)")

    expected = {label: [65, 67, 11, 31]}
    120.times { |index| expected[index] = 0 }
    feature_values = [
      [1, 0.380877],
      [2, 0.494079],
      [3, 0.540009],
      [4, 0.422926],
      [5, 0.158318],
      [6, 0.326975],
      [7, 0.390861],
      [8, 0.527121],
      [9, 0.254052],
      [10, 0.223731],
      [11, 0.040285],
      [12, 0.141133],
      [13, 0.112249],
      [14, 0.263171],
      [15, 0.147020],
      [16, 0.472414],
      [17, 0.592614],
      [18, 0.653138],
      [19, 0.499867],
      [20, 0.196520],
      [21, 0.403892],
      [22, 0.482395],
      [23, 0.619219],
      [24, 0.320346],
      [25, 0.281251],
      [26, 0.054750],
      [27, 0.180459],
      [28, 0.139964],
      [29, 0.319925],
      [30, 0.181216],
      [31, 0.364294],
      [32, 0.407211],
      [33, 0.368926],
      [34, 0.427661],
      [35, 0.211391],
      [36, 0.364345],
      [37, 0.370710],
      [38, 0.409107],
      [39, 0.289299],
      [40, 0.243053],
      [41, 0.063121],
      [42, 0.193587],
      [43, 0.158755],
      [44, 0.316054],
      [45, 0.197410],
      [46, 0.656168],
      [47, 0.678760],
      [48, 0.650831],
      [49, 0.674636],
      [50, 0.492428],
      [51, 0.623887],
      [52, 0.610622],
      [53, 0.678219],
      [54, 0.574774],
      [55, 0.523073],
      [56, 0.206804],
      [57, 0.496294],
      [58, 0.429221],
      [59, 0.586611],
      [60, 0.471550],
      [61, 0.284480],
      [62, 0.432466],
      [63, 0.498075],
      [64, 0.408141],
      [65, 0.102713],
      [66, 0.303028],
      [67, 0.309501],
      [68, 0.444855],
      [69, 0.191727],
      [70, 0.174895],
      [71, 0.034143],
      [72, 0.153099],
      [73, 0.068318],
      [74, 0.217020],
      [75, 0.099688],
      [76, 0.409862],
      [77, 0.561918],
      [78, 0.612031],
      [79, 0.514471],
      [80, 0.146015],
      [81, 0.398807],
      [82, 0.383295],
      [83, 0.548485],
      [84, 0.282937],
      [85, 0.252712],
      [86, 0.051008],
      [87, 0.223110],
      [88, 0.098112],
      [89, 0.299672],
      [90, 0.144873],
      [91, 0.308488],
      [92, 0.358478],
      [93, 0.352077],
      [94, 0.394686],
      [95, 0.157513],
      [96, 0.339370],
      [97, 0.321558],
      [98, 0.341373],
      [99, 0.247969],
      [100, 0.206070],
      [101, 0.061001],
      [102, 0.216793],
      [103, 0.112389],
      [104, 0.273648],
      [105, 0.152745],
      [106, 0.598081],
      [107, 0.621687],
      [108, 0.607213],
      [109, 0.644025],
      [110, 0.394948],
      [111, 0.593651],
      [112, 0.551529],
      [113, 0.574392],
      [114, 0.511032],
      [115, 0.463997],
      [116, 0.202034],
      [117, 0.492341],
      [118, 0.317983],
      [119, 0.547807],
      [120, 0.393778],
    ]
    feature_values.each { |feature_id, value| expected[feature_id - 1] = value }

    assert_equal(expected, dataset.first.to_h)
  end

  test("string") do
    # TODO
    # (This test has no body yet, so it currently makes no assertions.)
  end
end
|
data/test/test-mnist.rb
CHANGED
@@ -1,100 +1,125 @@
|
|
1
1
|
class MNISTTest < Test::Unit::TestCase
|
2
|
-
include Helper::Sandbox
|
3
|
-
|
4
2
|
sub_test_case("Normal") do
|
5
|
-
def setup_data
|
6
|
-
setup_sandbox
|
7
|
-
|
8
|
-
def @dataset.cache_dir_path
|
9
|
-
@cache_dir_path
|
10
|
-
end
|
11
|
-
|
12
|
-
def @dataset.cache_dir_path=(path)
|
13
|
-
@cache_dir_path = path
|
14
|
-
end
|
15
|
-
@dataset.cache_dir_path = @tmp_dir
|
16
|
-
|
17
|
-
def @dataset.download(output_path, url)
|
18
|
-
image_magic_number = 2051
|
19
|
-
label_magic_number = 2049
|
20
|
-
n_image, image_size_x, image_size_y, label = 10, 28, 28, 1
|
21
|
-
|
22
|
-
Zlib::GzipWriter.open(output_path) do |gz|
|
23
|
-
if output_path.basename.to_s.include?("-images-")
|
24
|
-
image_data = ([image_magic_number, n_image]).pack('N2') +
|
25
|
-
([image_size_x,image_size_y]).pack('N2') +
|
26
|
-
([0] * image_size_x * image_size_y).pack("C*") * n_image
|
27
|
-
gz.puts(image_data)
|
28
|
-
else
|
29
|
-
label_data = ([label_magic_number, n_image]).pack('N2') +
|
30
|
-
([label] * n_image).pack("C*")
|
31
|
-
gz.puts(label_data)
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
def teardown
|
38
|
-
teardown_sandbox
|
39
|
-
end
|
40
|
-
|
41
3
|
sub_test_case("train") do
|
42
4
|
def setup
|
43
5
|
@dataset = Datasets::MNIST.new(type: :train)
|
44
|
-
setup_data()
|
45
6
|
end
|
46
7
|
|
47
8
|
test("#each") do
|
48
|
-
|
49
|
-
{
|
50
|
-
:label => record.label,
|
51
|
-
:pixels => record.pixels
|
52
|
-
}
|
53
|
-
end
|
54
|
-
|
9
|
+
records = @dataset.each.to_a
|
55
10
|
assert_equal([
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
11
|
+
60000,
|
12
|
+
[
|
13
|
+
5,
|
14
|
+
784,
|
15
|
+
[0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
|
16
|
+
[0, 0, 0, 0, 0, 81, 240, 253, 253, 119],
|
17
|
+
],
|
18
|
+
[8,
|
19
|
+
784,
|
20
|
+
[0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
|
21
|
+
[0, 0, 190, 196, 14, 2, 97, 254, 252, 146],
|
22
|
+
],
|
23
|
+
],
|
24
|
+
[
|
25
|
+
records.size,
|
26
|
+
[
|
27
|
+
records[0].label,
|
28
|
+
records[0].pixels.size,
|
29
|
+
records[0].pixels[200, 10],
|
30
|
+
records[0].pixels[400, 10],
|
31
|
+
],
|
32
|
+
[
|
33
|
+
records[-1].label,
|
34
|
+
records[-1].pixels.size,
|
35
|
+
records[-1].pixels[200, 10],
|
36
|
+
records[-1].pixels[400, 10],
|
37
|
+
],
|
38
|
+
])
|
62
39
|
end
|
63
40
|
|
64
41
|
test("#to_table") do
|
65
42
|
table_data = @dataset.to_table
|
66
|
-
assert_equal([
|
67
|
-
|
43
|
+
assert_equal([
|
44
|
+
[0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
|
45
|
+
[0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
|
46
|
+
],
|
47
|
+
[
|
48
|
+
table_data[:pixels][0][200, 10],
|
49
|
+
table_data[:pixels][-1][200, 10],
|
50
|
+
])
|
51
|
+
end
|
52
|
+
|
53
|
+
sub_test_case("#metadata") do
|
54
|
+
test("#id") do
|
55
|
+
assert_equal("mnist-train", @dataset.metadata.id)
|
56
|
+
end
|
57
|
+
|
58
|
+
test("#name") do
|
59
|
+
assert_equal("MNIST: train", @dataset.metadata.name)
|
60
|
+
end
|
68
61
|
end
|
69
62
|
end
|
70
63
|
|
71
64
|
sub_test_case("test") do
|
72
65
|
def setup
|
73
66
|
@dataset = Datasets::MNIST.new(type: :test)
|
74
|
-
setup_data()
|
75
67
|
end
|
76
68
|
|
77
69
|
test("#each") do
|
78
|
-
|
79
|
-
{
|
80
|
-
:label => record.label,
|
81
|
-
:pixels => record.pixels
|
82
|
-
}
|
83
|
-
end
|
84
|
-
|
70
|
+
records = @dataset.each.to_a
|
85
71
|
assert_equal([
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
72
|
+
10000,
|
73
|
+
[
|
74
|
+
7,
|
75
|
+
784,
|
76
|
+
[0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
|
77
|
+
[0, 0, 0, 0, 0, 0, 0, 0, 59, 249],
|
78
|
+
],
|
79
|
+
[
|
80
|
+
6,
|
81
|
+
784,
|
82
|
+
[0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
|
83
|
+
[253, 253, 132, 64, 0, 0, 18, 43, 157, 171],
|
84
|
+
],
|
85
|
+
],
|
86
|
+
[
|
87
|
+
records.size,
|
88
|
+
[
|
89
|
+
records[0].label,
|
90
|
+
records[0].pixels.size,
|
91
|
+
records[0].pixels[200, 10],
|
92
|
+
records[0].pixels[400, 10],
|
93
|
+
],
|
94
|
+
[
|
95
|
+
records[-1].label,
|
96
|
+
records[-1].pixels.size,
|
97
|
+
records[-1].pixels[200, 10],
|
98
|
+
records[-1].pixels[400, 10],
|
99
|
+
],
|
100
|
+
])
|
92
101
|
end
|
93
102
|
|
94
103
|
test("#to_table") do
|
95
104
|
table_data = @dataset.to_table
|
96
|
-
assert_equal([
|
97
|
-
|
105
|
+
assert_equal([
|
106
|
+
[0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
|
107
|
+
[0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
|
108
|
+
],
|
109
|
+
[
|
110
|
+
table_data[:pixels][0][200, 10],
|
111
|
+
table_data[:pixels][-1][200, 10],
|
112
|
+
])
|
113
|
+
end
|
114
|
+
|
115
|
+
sub_test_case("#metadata") do
|
116
|
+
test("#id") do
|
117
|
+
assert_equal("mnist-test", @dataset.metadata.id)
|
118
|
+
end
|
119
|
+
|
120
|
+
test("#name") do
|
121
|
+
assert_equal("MNIST: test", @dataset.metadata.name)
|
122
|
+
end
|
98
123
|
end
|
99
124
|
end
|
100
125
|
end
|