red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
@@ -0,0 +1,80 @@
1
+ class MushroomTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Mushroom.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Mushroom::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 8124,
14
+ {
15
+ :label => "poisonous",
16
+ :cap_shape => "convex",
17
+ :cap_surface => "smooth",
18
+ :cap_color => "brown",
19
+ :bruises => "bruises",
20
+ :odor => "pungent",
21
+ :gill_attachment => "free",
22
+ :gill_spacing => "close",
23
+ :gill_size => "narrow",
24
+ :gill_color => "black",
25
+ :stalk_shape => "enlarging",
26
+ :stalk_root => "equal",
27
+ :stalk_surface_above_ring => "smooth",
28
+ :stalk_surface_below_ring => "smooth",
29
+ :stalk_color_above_ring => "white",
30
+ :stalk_color_below_ring => "white",
31
+ :veil_type => "partial",
32
+ :veil_color => "white",
33
+ :n_rings => 1,
34
+ :ring_type => "pendant",
35
+ :spore_print_color => "black",
36
+ :population => "scattered",
37
+ :habitat => "urban"
38
+ },
39
+ {
40
+ :label => "edible",
41
+ :cap_shape => "convex",
42
+ :cap_surface => "smooth",
43
+ :cap_color => "brown",
44
+ :bruises => "no",
45
+ :odor => "none",
46
+ :gill_attachment => "attached",
47
+ :gill_spacing => "close",
48
+ :gill_size => "broad",
49
+ :gill_color => "yellow",
50
+ :stalk_shape => "enlarging",
51
+ :stalk_root => "missing",
52
+ :stalk_surface_above_ring => "smooth",
53
+ :stalk_surface_below_ring => "smooth",
54
+ :stalk_color_above_ring => "orange",
55
+ :stalk_color_below_ring => "orange",
56
+ :veil_type => "partial",
57
+ :veil_color => "orange",
58
+ :n_rings => 1,
59
+ :ring_type => "pendant",
60
+ :spore_print_color => "orange",
61
+ :population => "clustered",
62
+ :habitat => "leaves"
63
+ }
64
+ ],
65
+ [
66
+ records.size,
67
+ records[0].to_h,
68
+ records[-1].to_h
69
+ ])
70
+ end
71
+
72
+ sub_test_case("#metadata") do
73
+ test("#description") do
74
+ description = @dataset.metadata.description
75
+ assert do
76
+ description.start_with?("1. Title: Mushroom Database")
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,239 @@
1
+ class PenguinsTest < Test::Unit::TestCase
2
+ sub_test_case("PenguinsRawData::SpeciesBase") do
3
+ test("#data_path") do
4
+ data_paths = [ Datasets::PenguinsRawData::Adelie,
5
+ Datasets::PenguinsRawData::Gentoo,
6
+ Datasets::PenguinsRawData::Chinstrap ].map {|cls|
7
+ dataset = cls.new
8
+ dataset.data_path.relative_path_from(dataset.send(:cache_dir_path)).to_s
9
+ }
10
+ assert_equal(["penguins/adelie.csv", "penguins/gentoo.csv", "penguins/chinstrap.csv"],
11
+ data_paths)
12
+ end
13
+ end
14
+
15
+ sub_test_case("Adelie") do
16
+ def setup
17
+ @dataset = Datasets::PenguinsRawData::Adelie.new
18
+ end
19
+
20
+ test("#each") do
21
+ records = @dataset.each.to_a
22
+ assert_equal([ 152,
23
+ {
24
+ study_name: "PAL0708",
25
+ sample_number: 1,
26
+ species: "Adelie Penguin (Pygoscelis adeliae)",
27
+ region: "Anvers",
28
+ island: "Torgersen",
29
+ stage: "Adult, 1 Egg Stage",
30
+ individual_id: "N1A1",
31
+ clutch_completion: "Yes",
32
+ date_egg: DateTime.new(2007, 11, 11),
33
+ culmen_length_mm: 39.1,
34
+ culmen_depth_mm: 18.7,
35
+ flipper_length_mm: 181,
36
+ body_mass_g: 3750,
37
+ sex: "MALE",
38
+ delta_15_n_permil: nil,
39
+ delta_13_c_permil: nil,
40
+ comments: "Not enough blood for isotopes."
41
+ },
42
+ {
43
+ study_name: "PAL0910",
44
+ sample_number: 152,
45
+ species: "Adelie Penguin (Pygoscelis adeliae)",
46
+ region: "Anvers",
47
+ island: "Dream",
48
+ stage: "Adult, 1 Egg Stage",
49
+ individual_id: "N85A2",
50
+ clutch_completion: "Yes",
51
+ date_egg: DateTime.new(2009, 11, 17),
52
+ culmen_length_mm: 41.5,
53
+ culmen_depth_mm: 18.5,
54
+ flipper_length_mm: 201,
55
+ body_mass_g: 4000,
56
+ sex: "MALE",
57
+ delta_15_n_permil: 8.89640,
58
+ delta_13_c_permil: -26.06967,
59
+ comments: nil
60
+ }
61
+ ],
62
+ [
63
+ records.size,
64
+ records[0].to_h,
65
+ records[-1].to_h
66
+ ])
67
+ end
68
+ end
69
+
70
+ sub_test_case("Gentoo") do
71
+ def setup
72
+ @dataset = Datasets::PenguinsRawData::Gentoo.new
73
+ end
74
+
75
+ test("#each") do
76
+ records = @dataset.each.to_a
77
+ assert_equal([ 124,
78
+ {
79
+ study_name: "PAL0708",
80
+ sample_number: 1,
81
+ species: "Gentoo penguin (Pygoscelis papua)",
82
+ region: "Anvers",
83
+ island: "Biscoe",
84
+ stage: "Adult, 1 Egg Stage",
85
+ individual_id: "N31A1",
86
+ clutch_completion: "Yes",
87
+ date_egg: DateTime.new(2007, 11, 27),
88
+ culmen_length_mm: 46.1,
89
+ culmen_depth_mm: 13.2,
90
+ flipper_length_mm: 211,
91
+ body_mass_g: 4500,
92
+ sex: "FEMALE",
93
+ delta_15_n_permil: 7.993,
94
+ delta_13_c_permil: -25.5139,
95
+ comments: nil
96
+ },
97
+ {
98
+ study_name: "PAL0910",
99
+ sample_number: 124,
100
+ species: "Gentoo penguin (Pygoscelis papua)",
101
+ region: "Anvers",
102
+ island: "Biscoe",
103
+ stage: "Adult, 1 Egg Stage",
104
+ individual_id: "N43A2",
105
+ clutch_completion: "Yes",
106
+ date_egg: DateTime.new(2009, 11, 22),
107
+ culmen_length_mm: 49.9,
108
+ culmen_depth_mm: 16.1,
109
+ flipper_length_mm: 213,
110
+ body_mass_g: 5400,
111
+ sex: "MALE",
112
+ delta_15_n_permil: 8.3639,
113
+ delta_13_c_permil: -26.15531,
114
+ comments: nil
115
+ }
116
+ ],
117
+ [
118
+ records.size,
119
+ records[0].to_h,
120
+ records[-1].to_h
121
+ ])
122
+ end
123
+ end
124
+
125
+ sub_test_case("Chinstrap") do
126
+ def setup
127
+ @dataset = Datasets::PenguinsRawData::Chinstrap.new
128
+ end
129
+
130
+ test("#each") do
131
+ records = @dataset.each.to_a
132
+ assert_equal([ 68,
133
+ {
134
+ study_name: "PAL0708",
135
+ sample_number: 1,
136
+ species: "Chinstrap penguin (Pygoscelis antarctica)",
137
+ region: "Anvers",
138
+ island: "Dream",
139
+ stage: "Adult, 1 Egg Stage",
140
+ individual_id: "N61A1",
141
+ clutch_completion: "No",
142
+ date_egg: DateTime.new(2007, 11, 19),
143
+ culmen_length_mm: 46.5,
144
+ culmen_depth_mm: 17.9,
145
+ flipper_length_mm: 192,
146
+ body_mass_g: 3500,
147
+ sex: "FEMALE",
148
+ delta_15_n_permil: 9.03935,
149
+ delta_13_c_permil: -24.30229,
150
+ comments: "Nest never observed with full clutch."
151
+ },
152
+ {
153
+ study_name: "PAL0910",
154
+ sample_number: 68,
155
+ species: "Chinstrap penguin (Pygoscelis antarctica)",
156
+ region: "Anvers",
157
+ island: "Dream",
158
+ stage: "Adult, 1 Egg Stage",
159
+ individual_id: "N100A2",
160
+ clutch_completion: "Yes",
161
+ date_egg: DateTime.new(2009, 11, 21),
162
+ culmen_length_mm: 50.2,
163
+ culmen_depth_mm: 18.7,
164
+ flipper_length_mm: 198,
165
+ body_mass_g: 3775,
166
+ sex: "FEMALE",
167
+ delta_15_n_permil: 9.39305,
168
+ delta_13_c_permil: -24.25255,
169
+ comments: nil
170
+ }
171
+ ],
172
+ [
173
+ records.size,
174
+ records[0].to_h,
175
+ records[-1].to_h
176
+ ])
177
+ end
178
+ end
179
+
180
+ sub_test_case("Penguins") do
181
+ def setup
182
+ @dataset = Datasets::Penguins.new
183
+ end
184
+
185
+ test("#each") do
186
+ records = @dataset.each.to_a
187
+ assert_equal([
188
+ 344,
189
+ {
190
+ species: "Adelie",
191
+ island: "Torgersen",
192
+ bill_length_mm: 39.1,
193
+ bill_depth_mm: 18.7,
194
+ flipper_length_mm: 181,
195
+ body_mass_g: 3750,
196
+ sex: "male",
197
+ year: 2007
198
+ },
199
+ {
200
+ species: "Gentoo",
201
+ island: "Biscoe",
202
+ bill_length_mm: 46.1,
203
+ bill_depth_mm: 13.2,
204
+ flipper_length_mm: 211,
205
+ body_mass_g: 4500,
206
+ sex: "female",
207
+ year: 2007
208
+ },
209
+ {
210
+ species: "Chinstrap",
211
+ island: "Dream",
212
+ bill_length_mm: 46.5,
213
+ bill_depth_mm: 17.9,
214
+ flipper_length_mm: 192,
215
+ body_mass_g: 3500,
216
+ sex: "female",
217
+ year: 2007
218
+ },
219
+ {
220
+ species: "Chinstrap",
221
+ island: "Dream",
222
+ bill_length_mm: 50.2,
223
+ bill_depth_mm: 18.7,
224
+ flipper_length_mm: 198,
225
+ body_mass_g: 3775,
226
+ sex: "female",
227
+ year: 2009
228
+ }
229
+ ],
230
+ [
231
+ records.size,
232
+ records[0].to_h,
233
+ records[152].to_h,
234
+ records[276].to_h,
235
+ records[-1].to_h,
236
+ ])
237
+ end
238
+ end
239
+ end
@@ -9,8 +9,8 @@ class PennTreebankTest < Test::Unit::TestCase
9
9
  records = dataset.to_a
10
10
  assert_equal([
11
11
  887521,
12
- record("aer", 0),
13
- record("<unk>", 25),
12
+ record("aer"),
13
+ record("<unk>"),
14
14
  ],
15
15
  [
16
16
  records.size,
@@ -24,8 +24,8 @@ class PennTreebankTest < Test::Unit::TestCase
24
24
  records = dataset.to_a
25
25
  assert_equal([
26
26
  78669,
27
- record("no", 0),
28
- record("us", 953),
27
+ record("no"),
28
+ record("us"),
29
29
  ],
30
30
  [
31
31
  records.size,
@@ -39,8 +39,8 @@ class PennTreebankTest < Test::Unit::TestCase
39
39
  records = dataset.to_a
40
40
  assert_equal([
41
41
  70390,
42
- record("consumers", 0),
43
- record("N", 28),
42
+ record("consumers"),
43
+ record("N"),
44
44
  ],
45
45
  [
46
46
  records.size,
@@ -0,0 +1,69 @@
1
+ class PostalCodeJapanTest < Test::Unit::TestCase
2
+ sub_test_case(":reading") do
3
+ test(":lowercase") do
4
+ dataset = Datasets::PostalCodeJapan.new(reading: :lowercase)
5
+ assert_equal({
6
+ organization_code: "01101",
7
+ old_postal_code: "060",
8
+ postal_code: "0600000",
9
+ prefecture_reading: "ホッカイドウ",
10
+ city_reading: "サッポロシチュウオウク",
11
+ address_reading: "イカニケイサイガナイバアイ",
12
+ prefecture: "北海道",
13
+ city: "札幌市中央区",
14
+ address: "以下に掲載がない場合",
15
+ have_multiple_postal_codes: false,
16
+ have_address_number_per_koaza: false,
17
+ have_chome: false,
18
+ postal_code_is_shared: false,
19
+ changed: false,
20
+ change_reason: nil,
21
+ },
22
+ dataset.first.to_h)
23
+ end
24
+
25
+ test(":uppercase") do
26
+ dataset = Datasets::PostalCodeJapan.new(reading: :uppercase)
27
+ assert_equal({
28
+ organization_code: "01101",
29
+ old_postal_code: "060",
30
+ postal_code: "0600000",
31
+ prefecture_reading: "ホツカイドウ",
32
+ city_reading: "サツポロシチユウオウク",
33
+ address_reading: "イカニケイサイガナイバアイ",
34
+ prefecture: "北海道",
35
+ city: "札幌市中央区",
36
+ address: "以下に掲載がない場合",
37
+ have_multiple_postal_codes: false,
38
+ have_address_number_per_koaza: false,
39
+ have_chome: false,
40
+ postal_code_is_shared: false,
41
+ changed: false,
42
+ change_reason: nil,
43
+ },
44
+ dataset.first.to_h)
45
+ end
46
+
47
+ test(":romaji") do
48
+ dataset = Datasets::PostalCodeJapan.new(reading: :romaji)
49
+ assert_equal({
50
+ organization_code: nil,
51
+ old_postal_code: nil,
52
+ postal_code: "0600000",
53
+ prefecture_reading: "HOKKAIDO",
54
+ city_reading: "SAPPORO SHI CHUO KU",
55
+ address_reading: "IKANIKEISAIGANAIBAAI",
56
+ prefecture: "北海道",
57
+ city: "札幌市 中央区",
58
+ address: "以下に掲載がない場合",
59
+ have_multiple_postal_codes: false,
60
+ have_address_number_per_koaza: false,
61
+ have_chome: false,
62
+ postal_code_is_shared: false,
63
+ changed: false,
64
+ change_reason: nil,
65
+ },
66
+ dataset.first.to_h)
67
+ end
68
+ end
69
+ end
data/test/test-table.rb CHANGED
@@ -3,9 +3,149 @@ class TableTest < Test::Unit::TestCase
3
3
  @table = Datasets::Iris.new.to_table
4
4
  end
5
5
 
6
- test("#[]") do
7
- assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
8
- @table[:petal_length].first(5))
6
+ test("#n_columns") do
7
+ assert_equal(5, @table.n_columns)
8
+ end
9
+
10
+ test("#n_rows") do
11
+ assert_equal(150, @table.n_rows)
12
+ end
13
+
14
+ test("#column_names") do
15
+ assert_equal([
16
+ :sepal_length,
17
+ :sepal_width,
18
+ :petal_length,
19
+ :petal_width,
20
+ :label,
21
+ ],
22
+ @table.column_names)
23
+ end
24
+
25
+ test("#each") do
26
+ shorten_hash = {}
27
+ @table.each do |name, values|
28
+ shorten_hash[name] = values.first(5)
29
+ end
30
+ assert_equal({
31
+ :label => ["Iris-setosa"] * 5,
32
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
33
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
34
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
35
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
36
+ },
37
+ shorten_hash)
38
+ end
39
+
40
+ test("#each_column") do
41
+ shorten_hash = {}
42
+ @table.each_column do |name, values|
43
+ shorten_hash[name] = values.first(5)
44
+ end
45
+ assert_equal({
46
+ :label => ["Iris-setosa"] * 5,
47
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
48
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
49
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
50
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
51
+ },
52
+ shorten_hash)
53
+ end
54
+
55
+ test("#each_record") do
56
+ records = []
57
+ @table.each_record do |record|
58
+ records << record
59
+ break if records.size == 3
60
+ end
61
+ assert_equal([
62
+ {
63
+ label: "Iris-setosa",
64
+ petal_length: 1.4,
65
+ petal_width: 0.2,
66
+ sepal_length: 5.1,
67
+ sepal_width: 3.5,
68
+ },
69
+ {
70
+ label: "Iris-setosa",
71
+ petal_length: 1.4,
72
+ petal_width: 0.2,
73
+ sepal_length: 4.9,
74
+ sepal_width: 3.0,
75
+ },
76
+ {
77
+ label: "Iris-setosa",
78
+ petal_length: 1.3,
79
+ petal_width: 0.2,
80
+ sepal_length: 4.7,
81
+ sepal_width: 3.2,
82
+ },
83
+ ],
84
+ records.collect(&:to_h))
85
+ end
86
+
87
+ sub_test_case("#find_record") do
88
+ test("positive") do
89
+ assert_equal({
90
+ label: "Iris-setosa",
91
+ petal_length: 1.4,
92
+ petal_width: 0.2,
93
+ sepal_length: 4.9,
94
+ sepal_width: 3.0,
95
+ },
96
+ @table.find_record(1).to_h)
97
+ end
98
+
99
+ test("positive - over") do
100
+ assert_nil(@table.find_record(151))
101
+ end
102
+
103
+ test("negative") do
104
+ assert_equal({
105
+ label: "Iris-virginica",
106
+ petal_length: 5.1,
107
+ petal_width: 1.8,
108
+ sepal_length: 5.9,
109
+ sepal_width: 3.0,
110
+ },
111
+ @table.find_record(-1).to_h)
112
+ end
113
+
114
+ test("negative - over") do
115
+ assert_nil(@table.find_record(-151))
116
+ end
117
+ end
118
+
119
+ sub_test_case("#[]") do
120
+ test("index") do
121
+ assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
122
+ @table[2].first(5))
123
+ end
124
+
125
+ test("name") do
126
+ assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
127
+ @table[:petal_length].first(5))
128
+ end
129
+ end
130
+
131
+ test("#dictionary_encode") do
132
+ assert_equal([
133
+ [0, "Iris-setosa"],
134
+ [1, "Iris-versicolor"],
135
+ [2, "Iris-virginica"],
136
+ ],
137
+ @table.dictionary_encode(:label).to_a)
138
+ end
139
+
140
+ test("#label_encode") do
141
+ label_encoded_labels = @table.label_encode(:label)
142
+ labels = @table[:label]
143
+ assert_equal([0, 1, 2],
144
+ [
145
+ label_encoded_labels[labels.find_index("Iris-setosa")],
146
+ label_encoded_labels[labels.find_index("Iris-versicolor")],
147
+ label_encoded_labels[labels.find_index("Iris-virginica")],
148
+ ])
9
149
  end
10
150
 
11
151
  sub_test_case("#fetch_values") do
@@ -38,28 +178,13 @@ class TableTest < Test::Unit::TestCase
38
178
  end
39
179
  end
40
180
 
41
- test("#each") do
42
- shorten_hash = {}
43
- @table.each do |name, values|
44
- shorten_hash[name] = values.first(5)
45
- end
46
- assert_equal({
47
- :class => ["Iris-setosa"] * 5,
48
- :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
49
- :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
50
- :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
51
- :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
52
- },
53
- shorten_hash)
54
- end
55
-
56
181
  test("#to_h") do
57
182
  shorten_hash = {}
58
183
  @table.to_h.each do |name, values|
59
184
  shorten_hash[name] = values.first(5)
60
185
  end
61
186
  assert_equal({
62
- :class => ["Iris-setosa"] * 5,
187
+ :label => ["Iris-setosa"] * 5,
63
188
  :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
64
189
  :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
65
190
  :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],