red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
@@ -0,0 +1,43 @@
# Exercises the dictionary obtained by dictionary-encoding the :word column
# of the Penn Treebank test data.
class DictionaryTest < Test::Unit::TestCase
  # Builds the dictionary under test before every test.
  def setup
    table = Datasets::PennTreebank.new(type: :test).to_table
    @dictionary = table.dictionary_encode(:word)
  end

  # Value -> ID lookup.
  test("#id") do
    unknown_id = @dictionary.id("<unk>")
    assert_equal(95, unknown_id)
  end

  # ID -> value lookup (the inverse of the lookup above).
  test("#value") do
    unknown_word = @dictionary.value(95)
    assert_equal("<unk>", unknown_word)
  end

  test("#ids") do
    leading_ids = @dictionary.ids.first(5)
    assert_equal([0, 1, 2, 3, 4], leading_ids)
  end

  test("#values") do
    leading_values = @dictionary.values.first(5)
    assert_equal(%w[no it was n't black], leading_values)
  end

  # Iteration yields [id, value] pairs.
  test("#each") do
    leading_pairs = @dictionary.each.first(5).to_a
    expected_pairs = [
      [0, "no"],
      [1, "it"],
      [2, "was"],
      [3, "n't"],
      [4, "black"],
    ]
    assert_equal(expected_pairs, leading_pairs)
  end

  test("#size") do
    entry_count = @dictionary.size
    assert_equal(6048, entry_count)
  end

  # #length must report the same number as #size.
  test("#length") do
    expected_length = @dictionary.size
    assert_equal(expected_length, @dictionary.length)
  end
end
@@ -0,0 +1,383 @@
# frozen_string_literal: true

require 'fileutils'
require 'json'
require 'pathname'
require 'tmpdir'

# Tests for Datasets::EStatJapan::StatsData: how the app_id is resolved, how
# the request URL is generated, how records are parsed from a JSON fixture,
# and how an error response is handled.
#
# The helper methods below are defined on the enclosing test case so that
# every sub test case can call them.
class EStatJapanTest < Test::Unit::TestCase
  # Remembers the current ESTATJAPAN_APP_ID value, then clears both the
  # environment variable and the library-level app_id so that a test starts
  # with no app_id configured.
  def clear_app_id
    @original_env_app_id = ENV['ESTATJAPAN_APP_ID']
    ENV['ESTATJAPAN_APP_ID'] = nil
    Datasets::EStatJapan.app_id = nil
  end

  # Puts back the environment value saved by #clear_app_id and drops any
  # library-level app_id set by a test, so nothing leaks into later tests.
  def restore_app_id
    ENV['ESTATJAPAN_APP_ID'] = @original_env_app_id
    Datasets::EStatJapan.app_id = nil
  end

  # Serializes +response+ as JSON into +file_name+ inside a fresh temporary
  # directory. Sets @tmp_dir and @test_data_path (a Pathname).
  def write_response_fixture(file_name, response)
    @tmp_dir = Dir.mktmpdir
    @test_data_path = Pathname(File.join(@tmp_dir, file_name))
    File.write(@test_data_path, response.to_json)
  end

  # Removes the temporary directory together with anything still inside it.
  # The fixture file is not removed separately: removing the directory covers
  # it, and the file may already have been deleted by the code under test.
  def remove_response_fixture
    FileUtils.remove_entry_secure(@tmp_dir)
  end

  # Builds a StatsData for 'test-data-id' with +options+ and points its
  # @data_path at +data_path+ (presumably the file StatsData#each reads
  # instead of downloading -- confirm against the library).
  def stats_data_from(data_path, **options)
    stats_data = Datasets::EStatJapan::StatsData.new('test-data-id', **options)
    stats_data.instance_variable_set(:@data_path, data_path)
    stats_data
  end

  # Iterates +stats_data+ and returns every yielded record as an Array.
  def read_records(stats_data)
    records = []
    stats_data.each { |record| records << record }
    records
  end

  # Total number of values over all +records+.
  def count_values(records)
    records.inject(0) { |total, record| total + record.values.length }
  end

  # Asserts the sizes of the metadata exposed by +stats_data+.
  # +active_time_tables+ is the number of time tables not flagged :skip.
  def assert_stats_shape(stats_data,
                         areas:,
                         active_time_tables:,
                         schema:,
                         time_tables: 3,
                         columns: 1)
    assert_equal(areas, stats_data.areas.length)
    assert_equal(time_tables, stats_data.time_tables.length)
    assert_equal(active_time_tables,
                 stats_data.time_tables.reject { |_k, v| v[:skip] }.to_h.length)
    assert_equal(columns, stats_data.columns.length)
    assert_equal(schema, stats_data.schema.length)
  end

  # One VALUE entry of the fixture response.
  def value_record(value, area_code, time_code)
    {
      "$": value,
      "@area": area_code,
      "@cat01": 'data1',
      "@tab": 'table1',
      "@time": time_code,
      "@unit": 'person'
    }
  end

  # CLASS_OBJ part of the fixture response: one table, one category,
  # five areas (levels 2 and 3) and three time codes.
  def class_obj_fixture
    [
      {
        "@name": 'table1',
        "@id": 'tab',
        "CLASS": {
          "@level": '1',
          "@code": '00001',
          "@name": 'table1'
        }
      },
      {
        "@name": 'data1',
        "@id": 'cat01',
        "CLASS": {
          "@level": '1',
          "@code": 'data1',
          "@name": 'data1_name'
        }
      },
      {
        "@name": 'area1',
        "@id": 'area',
        "CLASS": [
          {
            "@level": '2',
            "@code": '01100',
            "@name": 'test1 big-city',
            "@parentCode": '01000'
          },
          {
            "@level": '3',
            "@code": '01101',
            "@name": 'test1 big-city a-ku',
            "@parentCode": '01100'
          },
          {
            "@level": '3',
            "@code": '01102',
            "@name": 'test1 big-city b-ku',
            "@parentCode": '01100'
          },
          {
            "@level": '2',
            "@code": '02555',
            "@name": 'test2 a-city',
            "@parentCode": '02000'
          },
          {
            "@level": '2',
            "@code": '02556',
            "@name": 'test2 b-city',
            "@parentCode": '02000'
          }
        ]
      },
      {
        "@name": 'time',
        "@id": 'time',
        "CLASS": [
          {
            "@level": '1',
            "@code": 'time1',
            "@name": 'time1'
          },
          {
            "@level": '1',
            "@code": 'time2',
            "@name": 'time2'
          },
          {
            "@level": '1',
            "@code": 'time3',
            "@name": 'time3'
          }
        ]
      }
    ]
  end

  sub_test_case('app_id') do
    def setup
      clear_app_id
    end

    def teardown
      restore_app_id
    end

    test('nothing') do
      assert_raise(Datasets::EStatJapan::ArgumentError) do
        Datasets::EStatJapan::StatsData.new('test-data-id')
      end
    end

    test('constructor') do
      stats_data = Datasets::EStatJapan::StatsData.new('test-data-id',
                                                       app_id: 'test_by_constructor')
      assert_equal('test_by_constructor', stats_data.app_id)
    end

    test('env') do
      ENV['ESTATJAPAN_APP_ID'] = 'test_by_env'
      stats_data = Datasets::EStatJapan::StatsData.new('test-data-id')
      assert_equal('test_by_env', stats_data.app_id)
    end

    test('configure') do
      Datasets::EStatJapan.configure do |config|
        config.app_id = 'test_by_configure'
      end
      stats_data = Datasets::EStatJapan::StatsData.new('test-data-id')
      assert_equal('test_by_configure', stats_data.app_id)
    end

    # The configured value is expected to win over the environment variable.
    test('env & configure') do
      ENV['ESTATJAPAN_APP_ID'] = 'test_by_env'
      Datasets::EStatJapan.configure do |config|
        config.app_id = 'test_by_configure'
      end
      stats_data = Datasets::EStatJapan::StatsData.new('test-data-id')
      assert_equal('test_by_configure', stats_data.app_id)
    end

    # The constructor argument is expected to win over both other sources.
    test('env & configure & constructor') do
      ENV['ESTATJAPAN_APP_ID'] = 'test_by_env'
      Datasets::EStatJapan.configure do |config|
        config.app_id = 'test_by_configure'
      end
      stats_data = Datasets::EStatJapan::StatsData.new('test-data-id',
                                                       app_id: 'test_by_constructor')
      assert_equal('test_by_constructor', stats_data.app_id)
    end
  end

  sub_test_case('url generation') do
    def setup
      clear_app_id
    end

    def teardown
      restore_app_id
    end

    test('generates url correctly') do
      Datasets::EStatJapan.app_id = 'abcdef'
      stats_data = Datasets::EStatJapan::StatsData.new('test-data-id')
      # Override the ID and endpoint so the expected URL is fully known.
      stats_data.instance_variable_set(:@id, '000000')
      stats_data.instance_variable_set(:@base_url,
                                       'http://testurl/rest/2.1/app/json/getStatsData')
      # generate_url is not public, hence #send.
      url = stats_data.send(:generate_url)
      assert_equal(
        'http://testurl/rest/2.1/app/json/getStatsData' \
        '?appId=abcdef&lang=J&statsDataId=000000&' \
        'metaGetFlg=Y&cntGetFlg=N&sectionHeaderFlg=1',
        url.to_s
      )
    end
  end

  sub_test_case('parsing records') do
    def setup
      clear_app_id
      class_obj = class_obj_fixture
      # Every area gets a value for time1 and time2 ...
      area_classes = class_obj[2][:CLASS]
      data_inf = area_classes.flat_map do |entry|
        [
          value_record(1000, entry[:"@code"], 'time1'),
          value_record(2000, entry[:"@code"], 'time2')
        ]
      end
      # ... and only area 02556 has a time3 value; this is the record used
      # by the `skip_nil_row: true` test.
      data_inf << value_record(3000, '02556', 'time3')
      @response_data_default = {
        'GET_STATS_DATA' => {
          'RESULT' => {
            'STATUS' => 0,
            'ERROR_MSG' => 'succeeded'
          },
          'STATISTICAL_DATA' => {
            'DATA_INF' => {
              'VALUE' => data_inf
            },
            'CLASS_INF' => {
              'CLASS_OBJ' => class_obj
            }
          }
        }
      }
      # Tests that pass no app_id to the constructor rely on this value.
      ENV['ESTATJAPAN_APP_ID'] = 'test_appid_correct'
      write_response_fixture('200-ok.json', @response_data_default)
    end

    def teardown
      remove_response_fixture
      restore_app_id
    end

    test('parsing records with default option') do
      stats_data = stats_data_from(@test_data_path, app_id: 'valid')
      records = read_records(stats_data)

      assert_equal(4, records.length)
      assert_equal(4 * 2, count_values(records))
      assert_stats_shape(stats_data,
                         areas: 4,
                         active_time_tables: 2,
                         schema: 2)
    end

    test('parsing records with hierarchy_selection') do
      # hierarchy_selection value => expected number of records and areas
      expected_counts = {
        'parent' => 3,
        'child' => 4,
        'both' => 5
      }
      expected_counts.each do |selection, expected_count|
        stats_data = stats_data_from(@test_data_path,
                                     hierarchy_selection: selection)
        records = read_records(stats_data)

        assert_equal(expected_count, records.length)
        assert_stats_shape(stats_data,
                           areas: expected_count,
                           active_time_tables: 2,
                           schema: 2)
      end
    end

    test('parsing records with skip_nil_(column|row)') do
      stats_data = stats_data_from(@test_data_path,
                                   skip_nil_column: false)
      records = read_records(stats_data)

      assert_equal(4, records.length)
      assert_equal(4 * 3, count_values(records))
      assert_stats_shape(stats_data,
                         areas: 4,
                         active_time_tables: 3,
                         schema: 3)

      stats_data = stats_data_from(@test_data_path,
                                   skip_nil_row: true,
                                   skip_nil_column: false)
      records = read_records(stats_data)

      assert_equal(1, records.length)
      assert_equal(1 * 3, count_values(records))
      assert_stats_shape(stats_data,
                         areas: 4,
                         active_time_tables: 3,
                         schema: 3)
    end
  end

  sub_test_case('anomaly responses') do
    def setup
      clear_app_id
      @response_data = {
        'GET_STATS_DATA' => {
          'RESULT' => {
            'STATUS' => 100,
            'ERROR_MSG' => 'error message'
          }
        }
      }
      write_response_fixture('200-error.json', @response_data)
    end

    def teardown
      remove_response_fixture
      restore_app_id
    end

    test('forbidden access with invalid app_id') do
      ENV['ESTATJAPAN_APP_ID'] = 'test_appid_invalid'
      cache_file_path = @test_data_path
      stats_data = stats_data_from(cache_file_path)

      assert_raise(Datasets::EStatJapan::APIError) do
        # The fixture holds only an error status, no data.
        stats_data.each do |record|
          record
        end
      end
      # The cached error response must have been removed.
      assert_false(cache_file_path.exist?)
    end
  end
end
@@ -0,0 +1,137 @@
# Checks Datasets::FashionMNIST for both dataset types (:train and :test)
# and for an unsupported type.
class FashionMNISTTest < Test::Unit::TestCase
  # Condenses a record into
  # [label, number of pixels, 10 pixels from offset 400, 10 pixels from offset 500].
  def summarize(record)
    [
      record.label,
      record.pixels.size,
      record.pixels[400, 10],
      record.pixels[500, 10],
    ]
  end

  sub_test_case("Normal") do
    sub_test_case("train") do
      def setup
        @dataset = Datasets::FashionMNIST.new(type: :train)
      end

      # Compares the record count plus a summary of the first and last record.
      test("#each") do
        records = @dataset.each.to_a
        expected = [
          60000,
          [
            9,
            784,
            [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
            [220, 232, 246, 0, 3, 202, 228, 224, 221, 211],
          ],
          [
            5,
            784,
            [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
            [180, 177, 177, 47, 101, 235, 194, 223, 232, 255],
          ],
        ]
        actual = [
          records.size,
          summarize(records.first),
          summarize(records.last),
        ]
        assert_equal(expected, actual)
      end

      # Compares a 10-pixel slice of the first and last row of the table.
      test("#to_table") do
        table_data = @dataset.to_table
        first_slice = table_data[:pixels][0][400, 10]
        last_slice = table_data[:pixels][-1][400, 10]
        expected = [
          [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
          [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
        ]
        assert_equal(expected, [first_slice, last_slice])
      end

      sub_test_case("#metadata") do
        test("#id") do
          metadata = @dataset.metadata
          assert_equal("fashion-mnist-train", metadata.id)
        end

        test("#name") do
          metadata = @dataset.metadata
          assert_equal("Fashion-MNIST: train", metadata.name)
        end
      end
    end

    sub_test_case("test") do
      def setup
        @dataset = Datasets::FashionMNIST.new(type: :test)
      end

      # Compares the record count plus a summary of the first and last record.
      test("#each") do
        records = @dataset.each.to_a
        expected = [
          10000,
          [
            9,
            784,
            [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
            [172, 161, 189, 62, 0, 68, 94, 90, 111, 114],
          ],
          [
            5,
            784,
            [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
            [63, 74, 72, 0, 1, 0, 0, 0, 4, 85],
          ],
        ]
        actual = [
          records.size,
          summarize(records.first),
          summarize(records.last),
        ]
        assert_equal(expected, actual)
      end

      # Compares a 10-pixel slice of the first and last row of the table.
      test("#to_table") do
        table_data = @dataset.to_table
        first_slice = table_data[:pixels][0][400, 10]
        last_slice = table_data[:pixels][-1][400, 10]
        expected = [
          [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
          [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
        ]
        assert_equal(expected, [first_slice, last_slice])
      end

      sub_test_case("#metadata") do
        test("#id") do
          metadata = @dataset.metadata
          assert_equal("fashion-mnist-test", metadata.id)
        end

        test("#name") do
          metadata = @dataset.metadata
          assert_equal("Fashion-MNIST: test", metadata.name)
        end
      end
    end
  end

  sub_test_case("Abnormal") do
    # A type other than :train or :test must raise ArgumentError with a
    # message that names the rejected value.
    test("invalid type") do
      invalid_type = :invalid
      expected_error =
        ArgumentError.new("Please set type :train or :test: #{invalid_type.inspect}")
      assert_raise(expected_error) do
        Datasets::FashionMNIST.new(type: invalid_type)
      end
    end
  end
end