red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
data/test/test-hepatitis.rb ADDED
@@ -0,0 +1,74 @@
1
+ class HepatitisTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Hepatitis.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Hepatitis::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 155,
14
+ {
15
+ :label => :live,
16
+ :age => 30,
17
+ :sex => :female,
18
+ :steroid => false,
19
+ :antivirals => true,
20
+ :fatigue => true,
21
+ :malaise => true,
22
+ :anorexia => true,
23
+ :liver_big => false,
24
+ :liver_firm => true,
25
+ :spleen_palpable => true,
26
+ :spiders => true,
27
+ :ascites => true,
28
+ :varices => true,
29
+ :bilirubin => 1.0,
30
+ :alkaline_phosphate => 85,
31
+ :sgot => 18,
32
+ :albumin => 4.0,
33
+ :protime => nil,
34
+ :histology => false,
35
+ },
36
+ {
37
+ :label => :die,
38
+ :age => 43,
39
+ :sex => :male,
40
+ :steroid => true,
41
+ :antivirals => true,
42
+ :fatigue => false,
43
+ :malaise => true,
44
+ :anorexia => true,
45
+ :liver_big => true,
46
+ :liver_firm => true,
47
+ :spleen_palpable => false,
48
+ :spiders => false,
49
+ :ascites => false,
50
+ :varices => true,
51
+ :bilirubin => 1.2,
52
+ :alkaline_phosphate => 100,
53
+ :sgot => 19,
54
+ :albumin => 3.1,
55
+ :protime => 42,
56
+ :histology => true,
57
+ }
58
+ ],
59
+ [
60
+ records.size,
61
+ records[0].to_h,
62
+ records[-1].to_h,
63
+ ])
64
+ end
65
+
66
+ sub_test_case("#metadata") do
67
+ test("#description") do
68
+ description = @dataset.metadata.description
69
+ assert do
70
+ description.start_with?("1. Title: Hepatitis Domain")
71
+ end
72
+ end
73
+ end
74
+ end
data/test/test-libsvm-dataset-list.rb ADDED
@@ -0,0 +1,47 @@
1
+ class LIBSVMDatasetListTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::LIBSVMDatasetList.new
4
+ end
5
+
6
+ test("#each") do
7
+ assert_equal({
8
+ name: "a1a",
9
+ source: "UCI / Adult",
10
+ preprocessing:
11
+ "The original Adult data set has 14 features, " +
12
+ "among which six are continuous and eight are " +
13
+ "categorical. In this data set, continuous features " +
14
+ "are discretized into quantiles, and each quantile is " +
15
+ "represented by a binary feature. Also, a categorical " +
16
+ "feature with m categories is converted to m binary " +
17
+ "features. Details on how each feature is converted " +
18
+ "can be found in the beginning of each file from this " +
19
+ "page. [JP98a]",
20
+ n_classes: 2,
21
+ n_data: 1605,
22
+ n_features: 123,
23
+ files: [
24
+ {
25
+ name: "a1a",
26
+ url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
27
+ note: nil,
28
+ },
29
+ {
30
+ name: "a1a.t",
31
+ url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
32
+ note: "testing",
33
+ }
34
+ ],
35
+ },
36
+ @dataset.first.to_h)
37
+ end
38
+
39
+ sub_test_case("#metadata") do
40
+ test("#description") do
41
+ description = @dataset.metadata.description
42
+ assert do
43
+ description.start_with?("This page contains many classification, ")
44
+ end
45
+ end
46
+ end
47
+ end
data/test/test-libsvm.rb ADDED
@@ -0,0 +1,205 @@
1
+ class LIBSVMDatasetTest < Test::Unit::TestCase
2
+ test(":note") do
3
+ dataset = Datasets::LIBSVM.new("a1a", note: "testing")
4
+ hash = {label: -1}
5
+ n_features = 123
6
+ n_features.times do |i|
7
+ hash[i] = 0
8
+ end
9
+ [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i|
10
+ hash[i - 1] = 1
11
+ end
12
+ assert_equal(hash,
13
+ dataset.first.to_h)
14
+ end
15
+
16
+ test(":default_feature_value") do
17
+ dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
18
+ hash = {label: -1}
19
+ n_features = 123
20
+ n_features.times do |i|
21
+ hash[i] = nil
22
+ end
23
+ [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
24
+ hash[i - 1] = 1
25
+ end
26
+ assert_equal(hash,
27
+ dataset.first.to_h)
28
+ end
29
+
30
+ test("classification") do
31
+ dataset = Datasets::LIBSVM.new("a1a")
32
+ hash = {label: -1}
33
+ n_features = 123
34
+ n_features.times do |i|
35
+ hash[i] = 0
36
+ end
37
+ [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
38
+ hash[i - 1] = 1
39
+ end
40
+ assert_equal(hash,
41
+ dataset.first.to_h)
42
+ end
43
+
44
+ test("regression") do
45
+ dataset = Datasets::LIBSVM.new("abalone")
46
+ hash = {label: 15}
47
+ n_features = 8
48
+ n_features.times do |i|
49
+ hash[i] = 0
50
+ end
51
+ [
52
+ [1, 1],
53
+ [2, 0.455],
54
+ [3, 0.365],
55
+ [4, 0.095],
56
+ [5, 0.514],
57
+ [6, 0.2245],
58
+ [7, 0.101],
59
+ [8, 0.15],
60
+ ].each do |i, value|
61
+ hash[i - 1] = value
62
+ end
63
+ assert_equal(hash,
64
+ dataset.first.to_h)
65
+ end
66
+
67
+ test("multi-label") do
68
+ dataset = Datasets::LIBSVM.new("mediamill (exp1)")
69
+ hash = {label: [65, 67, 11, 31]}
70
+ n_features = 120
71
+ n_features.times do |i|
72
+ hash[i] = 0
73
+ end
74
+ [
75
+ [1, 0.380877],
76
+ [2, 0.494079],
77
+ [3, 0.540009],
78
+ [4, 0.422926],
79
+ [5, 0.158318],
80
+ [6, 0.326975],
81
+ [7, 0.390861],
82
+ [8, 0.527121],
83
+ [9, 0.254052],
84
+ [10, 0.223731],
85
+ [11, 0.040285],
86
+ [12, 0.141133],
87
+ [13, 0.112249],
88
+ [14, 0.263171],
89
+ [15, 0.147020],
90
+ [16, 0.472414],
91
+ [17, 0.592614],
92
+ [18, 0.653138],
93
+ [19, 0.499867],
94
+ [20, 0.196520],
95
+ [21, 0.403892],
96
+ [22, 0.482395],
97
+ [23, 0.619219],
98
+ [24, 0.320346],
99
+ [25, 0.281251],
100
+ [26, 0.054750],
101
+ [27, 0.180459],
102
+ [28, 0.139964],
103
+ [29, 0.319925],
104
+ [30, 0.181216],
105
+ [31, 0.364294],
106
+ [32, 0.407211],
107
+ [33, 0.368926],
108
+ [34, 0.427661],
109
+ [35, 0.211391],
110
+ [36, 0.364345],
111
+ [37, 0.370710],
112
+ [38, 0.409107],
113
+ [39, 0.289299],
114
+ [40, 0.243053],
115
+ [41, 0.063121],
116
+ [42, 0.193587],
117
+ [43, 0.158755],
118
+ [44, 0.316054],
119
+ [45, 0.197410],
120
+ [46, 0.656168],
121
+ [47, 0.678760],
122
+ [48, 0.650831],
123
+ [49, 0.674636],
124
+ [50, 0.492428],
125
+ [51, 0.623887],
126
+ [52, 0.610622],
127
+ [53, 0.678219],
128
+ [54, 0.574774],
129
+ [55, 0.523073],
130
+ [56, 0.206804],
131
+ [57, 0.496294],
132
+ [58, 0.429221],
133
+ [59, 0.586611],
134
+ [60, 0.471550],
135
+ [61, 0.284480],
136
+ [62, 0.432466],
137
+ [63, 0.498075],
138
+ [64, 0.408141],
139
+ [65, 0.102713],
140
+ [66, 0.303028],
141
+ [67, 0.309501],
142
+ [68, 0.444855],
143
+ [69, 0.191727],
144
+ [70, 0.174895],
145
+ [71, 0.034143],
146
+ [72, 0.153099],
147
+ [73, 0.068318],
148
+ [74, 0.217020],
149
+ [75, 0.099688],
150
+ [76, 0.409862],
151
+ [77, 0.561918],
152
+ [78, 0.612031],
153
+ [79, 0.514471],
154
+ [80, 0.146015],
155
+ [81, 0.398807],
156
+ [82, 0.383295],
157
+ [83, 0.548485],
158
+ [84, 0.282937],
159
+ [85, 0.252712],
160
+ [86, 0.051008],
161
+ [87, 0.223110],
162
+ [88, 0.098112],
163
+ [89, 0.299672],
164
+ [90, 0.144873],
165
+ [91, 0.308488],
166
+ [92, 0.358478],
167
+ [93, 0.352077],
168
+ [94, 0.394686],
169
+ [95, 0.157513],
170
+ [96, 0.339370],
171
+ [97, 0.321558],
172
+ [98, 0.341373],
173
+ [99, 0.247969],
174
+ [100, 0.206070],
175
+ [101, 0.061001],
176
+ [102, 0.216793],
177
+ [103, 0.112389],
178
+ [104, 0.273648],
179
+ [105, 0.152745],
180
+ [106, 0.598081],
181
+ [107, 0.621687],
182
+ [108, 0.607213],
183
+ [109, 0.644025],
184
+ [110, 0.394948],
185
+ [111, 0.593651],
186
+ [112, 0.551529],
187
+ [113, 0.574392],
188
+ [114, 0.511032],
189
+ [115, 0.463997],
190
+ [116, 0.202034],
191
+ [117, 0.492341],
192
+ [118, 0.317983],
193
+ [119, 0.547807],
194
+ [120, 0.393778],
195
+ ].each do |i, value|
196
+ hash[i - 1] = value
197
+ end
198
+ assert_equal(hash,
199
+ dataset.first.to_h)
200
+ end
201
+
202
+ test("string") do
203
+ # TODO
204
+ end
205
+ end
data/test/test-mnist.rb CHANGED
@@ -1,100 +1,125 @@
1
1
  class MNISTTest < Test::Unit::TestCase
2
- include Helper::Sandbox
3
-
4
2
  sub_test_case("Normal") do
5
- def setup_data
6
- setup_sandbox
7
-
8
- def @dataset.cache_dir_path
9
- @cache_dir_path
10
- end
11
-
12
- def @dataset.cache_dir_path=(path)
13
- @cache_dir_path = path
14
- end
15
- @dataset.cache_dir_path = @tmp_dir
16
-
17
- def @dataset.download(output_path, url)
18
- image_magic_number = 2051
19
- label_magic_number = 2049
20
- n_image, image_size_x, image_size_y, label = 10, 28, 28, 1
21
-
22
- Zlib::GzipWriter.open(output_path) do |gz|
23
- if output_path.basename.to_s.include?("-images-")
24
- image_data = ([image_magic_number, n_image]).pack('N2') +
25
- ([image_size_x,image_size_y]).pack('N2') +
26
- ([0] * image_size_x * image_size_y).pack("C*") * n_image
27
- gz.puts(image_data)
28
- else
29
- label_data = ([label_magic_number, n_image]).pack('N2') +
30
- ([label] * n_image).pack("C*")
31
- gz.puts(label_data)
32
- end
33
- end
34
- end
35
- end
36
-
37
- def teardown
38
- teardown_sandbox
39
- end
40
-
41
3
  sub_test_case("train") do
42
4
  def setup
43
5
  @dataset = Datasets::MNIST.new(type: :train)
44
- setup_data()
45
6
  end
46
7
 
47
8
  test("#each") do
48
- raw_dataset = @dataset.collect do |record|
49
- {
50
- :label => record.label,
51
- :pixels => record.pixels
52
- }
53
- end
54
-
9
+ records = @dataset.each.to_a
55
10
  assert_equal([
56
- {
57
- :label => 1,
58
- :pixels => [0] * 28 * 28
59
- }
60
- ] * 10,
61
- raw_dataset)
11
+ 60000,
12
+ [
13
+ 5,
14
+ 784,
15
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
16
+ [0, 0, 0, 0, 0, 81, 240, 253, 253, 119],
17
+ ],
18
+ [8,
19
+ 784,
20
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
21
+ [0, 0, 190, 196, 14, 2, 97, 254, 252, 146],
22
+ ],
23
+ ],
24
+ [
25
+ records.size,
26
+ [
27
+ records[0].label,
28
+ records[0].pixels.size,
29
+ records[0].pixels[200, 10],
30
+ records[0].pixels[400, 10],
31
+ ],
32
+ [
33
+ records[-1].label,
34
+ records[-1].pixels.size,
35
+ records[-1].pixels[200, 10],
36
+ records[-1].pixels[400, 10],
37
+ ],
38
+ ])
62
39
  end
63
40
 
64
41
  test("#to_table") do
65
42
  table_data = @dataset.to_table
66
- assert_equal([[0] * 28 * 28] * 10,
67
- table_data[:pixels])
43
+ assert_equal([
44
+ [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
45
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
46
+ ],
47
+ [
48
+ table_data[:pixels][0][200, 10],
49
+ table_data[:pixels][-1][200, 10],
50
+ ])
51
+ end
52
+
53
+ sub_test_case("#metadata") do
54
+ test("#id") do
55
+ assert_equal("mnist-train", @dataset.metadata.id)
56
+ end
57
+
58
+ test("#name") do
59
+ assert_equal("MNIST: train", @dataset.metadata.name)
60
+ end
68
61
  end
69
62
  end
70
63
 
71
64
  sub_test_case("test") do
72
65
  def setup
73
66
  @dataset = Datasets::MNIST.new(type: :test)
74
- setup_data()
75
67
  end
76
68
 
77
69
  test("#each") do
78
- raw_dataset = @dataset.collect do |record|
79
- {
80
- :label => record.label,
81
- :pixels => record.pixels
82
- }
83
- end
84
-
70
+ records = @dataset.each.to_a
85
71
  assert_equal([
86
- {
87
- :label => 1,
88
- :pixels => [0] * 28 * 28
89
- }
90
- ] * 10,
91
- raw_dataset)
72
+ 10000,
73
+ [
74
+ 7,
75
+ 784,
76
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
77
+ [0, 0, 0, 0, 0, 0, 0, 0, 59, 249],
78
+ ],
79
+ [
80
+ 6,
81
+ 784,
82
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
83
+ [253, 253, 132, 64, 0, 0, 18, 43, 157, 171],
84
+ ],
85
+ ],
86
+ [
87
+ records.size,
88
+ [
89
+ records[0].label,
90
+ records[0].pixels.size,
91
+ records[0].pixels[200, 10],
92
+ records[0].pixels[400, 10],
93
+ ],
94
+ [
95
+ records[-1].label,
96
+ records[-1].pixels.size,
97
+ records[-1].pixels[200, 10],
98
+ records[-1].pixels[400, 10],
99
+ ],
100
+ ])
92
101
  end
93
102
 
94
103
  test("#to_table") do
95
104
  table_data = @dataset.to_table
96
- assert_equal([[0] * 28 * 28] * 10,
97
- table_data[:pixels])
105
+ assert_equal([
106
+ [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
107
+ [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
108
+ ],
109
+ [
110
+ table_data[:pixels][0][200, 10],
111
+ table_data[:pixels][-1][200, 10],
112
+ ])
113
+ end
114
+
115
+ sub_test_case("#metadata") do
116
+ test("#id") do
117
+ assert_equal("mnist-test", @dataset.metadata.id)
118
+ end
119
+
120
+ test("#name") do
121
+ assert_equal("MNIST: test", @dataset.metadata.name)
122
+ end
98
123
  end
99
124
  end
100
125
  end