red-datasets 0.0.6 → 0.1.1

Sign up for free protection for your applications and access to all features.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
data/test/test-hepatitis.rb ADDED
@@ -0,0 +1,74 @@
1
+ class HepatitisTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Hepatitis.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Hepatitis::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 155,
14
+ {
15
+ :label => :live,
16
+ :age => 30,
17
+ :sex => :female,
18
+ :steroid => false,
19
+ :antivirals => true,
20
+ :fatigue => true,
21
+ :malaise => true,
22
+ :anorexia => true,
23
+ :liver_big => false,
24
+ :liver_firm => true,
25
+ :spleen_palpable => true,
26
+ :spiders => true,
27
+ :ascites => true,
28
+ :varices => true,
29
+ :bilirubin => 1.0,
30
+ :alkaline_phosphate => 85,
31
+ :sgot => 18,
32
+ :albumin => 4.0,
33
+ :protime => nil,
34
+ :histology => false,
35
+ },
36
+ {
37
+ :label => :die,
38
+ :age => 43,
39
+ :sex => :male,
40
+ :steroid => true,
41
+ :antivirals => true,
42
+ :fatigue => false,
43
+ :malaise => true,
44
+ :anorexia => true,
45
+ :liver_big => true,
46
+ :liver_firm => true,
47
+ :spleen_palpable => false,
48
+ :spiders => false,
49
+ :ascites => false,
50
+ :varices => true,
51
+ :bilirubin => 1.2,
52
+ :alkaline_phosphate => 100,
53
+ :sgot => 19,
54
+ :albumin => 3.1,
55
+ :protime => 42,
56
+ :histology => true,
57
+ }
58
+ ],
59
+ [
60
+ records.size,
61
+ records[0].to_h,
62
+ records[-1].to_h,
63
+ ])
64
+ end
65
+
66
+ sub_test_case("#metadata") do
67
+ test("#description") do
68
+ description = @dataset.metadata.description
69
+ assert do
70
+ description.start_with?("1. Title: Hepatitis Domain")
71
+ end
72
+ end
73
+ end
74
+ end
data/test/test-libsvm-dataset-list.rb ADDED
@@ -0,0 +1,47 @@
1
+ class LIBSVMDatasetListTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::LIBSVMDatasetList.new
4
+ end
5
+
6
+ test("#each") do
7
+ assert_equal({
8
+ name: "a1a",
9
+ source: "UCI / Adult",
10
+ preprocessing:
11
+ "The original Adult data set has 14 features, " +
12
+ "among which six are continuous and eight are " +
13
+ "categorical. In this data set, continuous features " +
14
+ "are discretized into quantiles, and each quantile is " +
15
+ "represented by a binary feature. Also, a categorical " +
16
+ "feature with m categories is converted to m binary " +
17
+ "features. Details on how each feature is converted " +
18
+ "can be found in the beginning of each file from this " +
19
+ "page. [JP98a]",
20
+ n_classes: 2,
21
+ n_data: 1605,
22
+ n_features: 123,
23
+ files: [
24
+ {
25
+ name: "a1a",
26
+ url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
27
+ note: nil,
28
+ },
29
+ {
30
+ name: "a1a.t",
31
+ url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
32
+ note: "testing",
33
+ }
34
+ ],
35
+ },
36
+ @dataset.first.to_h)
37
+ end
38
+
39
+ sub_test_case("#metadata") do
40
+ test("#description") do
41
+ description = @dataset.metadata.description
42
+ assert do
43
+ description.start_with?("This page contains many classification, ")
44
+ end
45
+ end
46
+ end
47
+ end
data/test/test-libsvm.rb ADDED
@@ -0,0 +1,205 @@
1
+ class LIBSVMDatasetTest < Test::Unit::TestCase
2
+ test(":note") do
3
+ dataset = Datasets::LIBSVM.new("a1a", note: "testing")
4
+ hash = {label: -1}
5
+ n_features = 123
6
+ n_features.times do |i|
7
+ hash[i] = 0
8
+ end
9
+ [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i|
10
+ hash[i - 1] = 1
11
+ end
12
+ assert_equal(hash,
13
+ dataset.first.to_h)
14
+ end
15
+
16
+ test(":default_feature_value") do
17
+ dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
18
+ hash = {label: -1}
19
+ n_features = 123
20
+ n_features.times do |i|
21
+ hash[i] = nil
22
+ end
23
+ [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
24
+ hash[i - 1] = 1
25
+ end
26
+ assert_equal(hash,
27
+ dataset.first.to_h)
28
+ end
29
+
30
+ test("classification") do
31
+ dataset = Datasets::LIBSVM.new("a1a")
32
+ hash = {label: -1}
33
+ n_features = 123
34
+ n_features.times do |i|
35
+ hash[i] = 0
36
+ end
37
+ [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
38
+ hash[i - 1] = 1
39
+ end
40
+ assert_equal(hash,
41
+ dataset.first.to_h)
42
+ end
43
+
44
+ test("regression") do
45
+ dataset = Datasets::LIBSVM.new("abalone")
46
+ hash = {label: 15}
47
+ n_features = 8
48
+ n_features.times do |i|
49
+ hash[i] = 0
50
+ end
51
+ [
52
+ [1, 1],
53
+ [2, 0.455],
54
+ [3, 0.365],
55
+ [4, 0.095],
56
+ [5, 0.514],
57
+ [6, 0.2245],
58
+ [7, 0.101],
59
+ [8, 0.15],
60
+ ].each do |i, value|
61
+ hash[i - 1] = value
62
+ end
63
+ assert_equal(hash,
64
+ dataset.first.to_h)
65
+ end
66
+
67
+ test("multi-label") do
68
+ dataset = Datasets::LIBSVM.new("mediamill (exp1)")
69
+ hash = {label: [65, 67, 11, 31]}
70
+ n_features = 120
71
+ n_features.times do |i|
72
+ hash[i] = 0
73
+ end
74
+ [
75
+ [1, 0.380877],
76
+ [2, 0.494079],
77
+ [3, 0.540009],
78
+ [4, 0.422926],
79
+ [5, 0.158318],
80
+ [6, 0.326975],
81
+ [7, 0.390861],
82
+ [8, 0.527121],
83
+ [9, 0.254052],
84
+ [10, 0.223731],
85
+ [11, 0.040285],
86
+ [12, 0.141133],
87
+ [13, 0.112249],
88
+ [14, 0.263171],
89
+ [15, 0.147020],
90
+ [16, 0.472414],
91
+ [17, 0.592614],
92
+ [18, 0.653138],
93
+ [19, 0.499867],
94
+ [20, 0.196520],
95
+ [21, 0.403892],
96
+ [22, 0.482395],
97
+ [23, 0.619219],
98
+ [24, 0.320346],
99
+ [25, 0.281251],
100
+ [26, 0.054750],
101
+ [27, 0.180459],
102
+ [28, 0.139964],
103
+ [29, 0.319925],
104
+ [30, 0.181216],
105
+ [31, 0.364294],
106
+ [32, 0.407211],
107
+ [33, 0.368926],
108
+ [34, 0.427661],
109
+ [35, 0.211391],
110
+ [36, 0.364345],
111
+ [37, 0.370710],
112
+ [38, 0.409107],
113
+ [39, 0.289299],
114
+ [40, 0.243053],
115
+ [41, 0.063121],
116
+ [42, 0.193587],
117
+ [43, 0.158755],
118
+ [44, 0.316054],
119
+ [45, 0.197410],
120
+ [46, 0.656168],
121
+ [47, 0.678760],
122
+ [48, 0.650831],
123
+ [49, 0.674636],
124
+ [50, 0.492428],
125
+ [51, 0.623887],
126
+ [52, 0.610622],
127
+ [53, 0.678219],
128
+ [54, 0.574774],
129
+ [55, 0.523073],
130
+ [56, 0.206804],
131
+ [57, 0.496294],
132
+ [58, 0.429221],
133
+ [59, 0.586611],
134
+ [60, 0.471550],
135
+ [61, 0.284480],
136
+ [62, 0.432466],
137
+ [63, 0.498075],
138
+ [64, 0.408141],
139
+ [65, 0.102713],
140
+ [66, 0.303028],
141
+ [67, 0.309501],
142
+ [68, 0.444855],
143
+ [69, 0.191727],
144
+ [70, 0.174895],
145
+ [71, 0.034143],
146
+ [72, 0.153099],
147
+ [73, 0.068318],
148
+ [74, 0.217020],
149
+ [75, 0.099688],
150
+ [76, 0.409862],
151
+ [77, 0.561918],
152
+ [78, 0.612031],
153
+ [79, 0.514471],
154
+ [80, 0.146015],
155
+ [81, 0.398807],
156
+ [82, 0.383295],
157
+ [83, 0.548485],
158
+ [84, 0.282937],
159
+ [85, 0.252712],
160
+ [86, 0.051008],
161
+ [87, 0.223110],
162
+ [88, 0.098112],
163
+ [89, 0.299672],
164
+ [90, 0.144873],
165
+ [91, 0.308488],
166
+ [92, 0.358478],
167
+ [93, 0.352077],
168
+ [94, 0.394686],
169
+ [95, 0.157513],
170
+ [96, 0.339370],
171
+ [97, 0.321558],
172
+ [98, 0.341373],
173
+ [99, 0.247969],
174
+ [100, 0.206070],
175
+ [101, 0.061001],
176
+ [102, 0.216793],
177
+ [103, 0.112389],
178
+ [104, 0.273648],
179
+ [105, 0.152745],
180
+ [106, 0.598081],
181
+ [107, 0.621687],
182
+ [108, 0.607213],
183
+ [109, 0.644025],
184
+ [110, 0.394948],
185
+ [111, 0.593651],
186
+ [112, 0.551529],
187
+ [113, 0.574392],
188
+ [114, 0.511032],
189
+ [115, 0.463997],
190
+ [116, 0.202034],
191
+ [117, 0.492341],
192
+ [118, 0.317983],
193
+ [119, 0.547807],
194
+ [120, 0.393778],
195
+ ].each do |i, value|
196
+ hash[i - 1] = value
197
+ end
198
+ assert_equal(hash,
199
+ dataset.first.to_h)
200
+ end
201
+
202
+ test("string") do
203
+ # TODO
204
+ end
205
+ end
data/test/test-mnist.rb CHANGED
@@ -1,100 +1,125 @@
1
1
  class MNISTTest < Test::Unit::TestCase
2
- include Helper::Sandbox
3
-
4
2
  sub_test_case("Normal") do
5
- def setup_data
6
- setup_sandbox
7
-
8
- def @dataset.cache_dir_path
9
- @cache_dir_path
10
- end
11
-
12
- def @dataset.cache_dir_path=(path)
13
- @cache_dir_path = path
14
- end
15
- @dataset.cache_dir_path = @tmp_dir
16
-
17
- def @dataset.download(output_path, url)
18
- image_magic_number = 2051
19
- label_magic_number = 2049
20
- n_image, image_size_x, image_size_y, label = 10, 28, 28, 1
21
-
22
- Zlib::GzipWriter.open(output_path) do |gz|
23
- if output_path.basename.to_s.include?("-images-")
24
- image_data = ([image_magic_number, n_image]).pack('N2') +
25
- ([image_size_x,image_size_y]).pack('N2') +
26
- ([0] * image_size_x * image_size_y).pack("C*") * n_image
27
- gz.puts(image_data)
28
- else
29
- label_data = ([label_magic_number, n_image]).pack('N2') +
30
- ([label] * n_image).pack("C*")
31
- gz.puts(label_data)
32
- end
33
- end
34
- end
35
- end
36
-
37
- def teardown
38
- teardown_sandbox
39
- end
40
-
41
3
  sub_test_case("train") do
42
4
  def setup
43
5
  @dataset = Datasets::MNIST.new(type: :train)
44
- setup_data()
45
6
  end
46
7
 
47
8
  test("#each") do
48
- raw_dataset = @dataset.collect do |record|
49
- {
50
- :label => record.label,
51
- :pixels => record.pixels
52
- }
53
- end
54
-
9
+ records = @dataset.each.to_a
55
10
  assert_equal([
56
- {
57
- :label => 1,
58
- :pixels => [0] * 28 * 28
59
- }
60
- ] * 10,
61
- raw_dataset)
11
+ 60000,
12
+ [
13
+ 5,
14
+ 784,
15
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
16
+ [0, 0, 0, 0, 0, 81, 240, 253, 253, 119],
17
+ ],
18
+ [8,
19
+ 784,
20
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
21
+ [0, 0, 190, 196, 14, 2, 97, 254, 252, 146],
22
+ ],
23
+ ],
24
+ [
25
+ records.size,
26
+ [
27
+ records[0].label,
28
+ records[0].pixels.size,
29
+ records[0].pixels[200, 10],
30
+ records[0].pixels[400, 10],
31
+ ],
32
+ [
33
+ records[-1].label,
34
+ records[-1].pixels.size,
35
+ records[-1].pixels[200, 10],
36
+ records[-1].pixels[400, 10],
37
+ ],
38
+ ])
62
39
  end
63
40
 
64
41
  test("#to_table") do
65
42
  table_data = @dataset.to_table
66
- assert_equal([[0] * 28 * 28] * 10,
67
- table_data[:pixels])
43
+ assert_equal([
44
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
45
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
46
+ ],
47
+ [
48
+ table_data[:pixels][0][200, 10],
49
+ table_data[:pixels][-1][200, 10],
50
+ ])
51
+ end
52
+
53
+ sub_test_case("#metadata") do
54
+ test("#id") do
55
+ assert_equal("mnist-train", @dataset.metadata.id)
56
+ end
57
+
58
+ test("#name") do
59
+ assert_equal("MNIST: train", @dataset.metadata.name)
60
+ end
68
61
  end
69
62
  end
70
63
 
71
64
  sub_test_case("test") do
72
65
  def setup
73
66
  @dataset = Datasets::MNIST.new(type: :test)
74
- setup_data()
75
67
  end
76
68
 
77
69
  test("#each") do
78
- raw_dataset = @dataset.collect do |record|
79
- {
80
- :label => record.label,
81
- :pixels => record.pixels
82
- }
83
- end
84
-
70
+ records = @dataset.each.to_a
85
71
  assert_equal([
86
- {
87
- :label => 1,
88
- :pixels => [0] * 28 * 28
89
- }
90
- ] * 10,
91
- raw_dataset)
72
+ 10000,
73
+ [
74
+ 7,
75
+ 784,
76
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
77
+ [0, 0, 0, 0, 0, 0, 0, 0, 59, 249],
78
+ ],
79
+ [
80
+ 6,
81
+ 784,
82
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
83
+ [253, 253, 132, 64, 0, 0, 18, 43, 157, 171],
84
+ ],
85
+ ],
86
+ [
87
+ records.size,
88
+ [
89
+ records[0].label,
90
+ records[0].pixels.size,
91
+ records[0].pixels[200, 10],
92
+ records[0].pixels[400, 10],
93
+ ],
94
+ [
95
+ records[-1].label,
96
+ records[-1].pixels.size,
97
+ records[-1].pixels[200, 10],
98
+ records[-1].pixels[400, 10],
99
+ ],
100
+ ])
92
101
  end
93
102
 
94
103
  test("#to_table") do
95
104
  table_data = @dataset.to_table
96
- assert_equal([[0] * 28 * 28] * 10,
97
- table_data[:pixels])
105
+ assert_equal([
106
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
107
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
108
+ ],
109
+ [
110
+ table_data[:pixels][0][200, 10],
111
+ table_data[:pixels][-1][200, 10],
112
+ ])
113
+ end
114
+
115
+ sub_test_case("#metadata") do
116
+ test("#id") do
117
+ assert_equal("mnist-test", @dataset.metadata.id)
118
+ end
119
+
120
+ test("#name") do
121
+ assert_equal("MNIST: test", @dataset.metadata.name)
122
+ end
98
123
  end
99
124
  end
100
125
  end