red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
@@ -0,0 +1,135 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
module Datasets
  # Reader for a dataset listed by LIBSVMDatasetList.
  #
  # Rows are stored sparsely in the data file ("label index:value ...") and
  # are expanded into dense feature arrays while iterating.
  class LIBSVM < Dataset
    # One row of a LIBSVM file: a label plus its dense feature values.
    class Record
      attr_reader :label
      attr_reader :features

      # @param label the parsed label of the row
      # @param features [Array] dense feature values of the row
      def initialize(label, features)
        @label = label
        @features = features
      end

      # Returns the feature stored at the given position of the feature array.
      def [](index)
        @features[index]
      end

      # Returns a Hash holding the label under +:label+ and every feature
      # under its position in the feature array.
      def to_h
        hash = {
          label: @label,
        }
        @features.each_with_index do |feature, i|
          hash[i] = feature
        end
        hash
      end

      # Returns the label followed by all features as one flat Array.
      def values
        [@label] + @features
      end
    end

    # @param name [String] dataset name; it must equal the name of a record
    #   yielded by LIBSVMDatasetList
    # @param note [String, nil] selects the file whose note equals this
    #   value; +nil+ selects the first file of the dataset
    # @param default_feature_value value stored for every feature that a row
    #   does not mention
    # @raise [ArgumentError] if the dataset name or the note is not available
    def initialize(name,
                   note: nil,
                   default_feature_value: 0)
      super()
      @libsvm_dataset_metadata = fetch_dataset_info(name)
      @file = choose_file(note)
      @default_feature_value = default_feature_value
      @metadata.id = "libsvm-#{normalize_name(name)}"
      @metadata.name = "LIBSVM dataset: #{name}"
      @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
    end

    # Yields one Record per row of the data file.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input|
        n_features = @libsvm_dataset_metadata.n_features
        csv = CSV.new(input, col_sep: " ")
        csv.each do |row|
          label = parse_label(row.shift)
          features = [@default_feature_value] * n_features
          row.each do |column|
            # CSV reports empty fields as nil; there is nothing to store.
            next if column.nil?
            index, value = column.split(":", 2)
            # The index found in the file is shifted down by one to address
            # the feature array.
            features[Integer(index, 10) - 1] = parse_value(value)
          end
          yield(Record.new(label, features))
        end
      end
    end

    private
    # Returns the LIBSVMDatasetList record whose name equals +name+.
    #
    # @raise [ArgumentError] listing every known name when nothing matches
    def fetch_dataset_info(name)
      available_datasets = []
      LIBSVMDatasetList.new.each do |record|
        return record if record.name == name
        available_datasets << record.name
      end
      message = "unavailable LIBSVM dataset: #{name.inspect}: "
      message << "available datasets: ["
      message << available_datasets.collect(&:inspect).join(", ")
      message << "]"
      raise ArgumentError, message
    end

    # Returns the file of the dataset to read: the first file when +note+ is
    # nil, otherwise the file whose note equals +note+.
    #
    # @raise [ArgumentError] listing every available note when nothing matches
    def choose_file(note)
      files = @libsvm_dataset_metadata.files
      return files.first if note.nil?

      available_notes = []
      # Plain iteration on purpose: with `find`, the truthy result of `<<`
      # would stop the scan at the first non-matching file that has a note.
      files.each do |file|
        return file if file.note == note
        available_notes << file.note if file.note
      end

      name = @libsvm_dataset_metadata.name
      message = "unavailable note: #{name}: #{note.inspect}: "
      message << "available notes: ["
      message << available_notes.collect(&:inspect).join(", ")
      message << "]"
      raise ArgumentError, message
    end

    # Downloads the chosen file into the cache directory unless it is already
    # there, then passes the opened input to the block. Files with a ".bz2"
    # extension go through extract_bz2 (inherited helper, not shown here).
    def open_data(&block)
      data_path = cache_dir_path + @file.name
      unless data_path.exist?
        download(data_path, @file.url)
      end
      if data_path.extname == ".bz2"
        extract_bz2(data_path, &block)
      else
        File.open(data_path, &block)
      end
    end

    # Turns a dataset name into an id fragment: parentheses are dropped,
    # runs of spaces, "_" and ";" become "-", and the result is lower-cased.
    def normalize_name(name)
      name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase
    end

    # Parses the label column. A comma separated list yields an Array of
    # numbers; a single value yields that number itself.
    def parse_label(label)
      labels = label.split(",").collect do |value|
        parse_value(value)
      end
      if labels.size == 1
        labels[0]
      else
        labels
      end
    end

    # Converts a numeric token to Float or Integer.
    #
    # Tokens containing a decimal point or an exponent marker (for example
    # "1e-05") are parsed as Float; everything else must be a base 10
    # integer. Both conversions raise ArgumentError on malformed input.
    def parse_value(value)
      if value.match?(/[.eE]/)
        Float(value)
      else
        Integer(value, 10)
      end
    end
  end
end
@@ -2,10 +2,9 @@ require 'zlib'
2
2
 
3
3
  require_relative "dataset"
4
4
 
5
- class SetTypeError < StandardError; end
6
-
7
5
  module Datasets
8
6
  class MNIST < Dataset
7
+ BASE_URL = "http://yann.lecun.com/exdb/mnist/"
9
8
 
10
9
  class Record < Struct.new(:data, :label)
11
10
  def pixels
@@ -26,9 +25,9 @@ module Datasets
26
25
 
27
26
  super()
28
27
 
29
- @metadata.id = "mnist-#{type}"
30
- @metadata.name = "MNIST: #{type}"
31
- @metadata.url = "http://yann.lecun.com/exdb/mnist/"
28
+ @metadata.id = "#{dataset_name.downcase}-#{type}"
29
+ @metadata.name = "#{dataset_name}: #{type}"
30
+ @metadata.url = self.class::BASE_URL
32
31
  @type = type
33
32
 
34
33
  case type
@@ -44,7 +43,7 @@ module Datasets
44
43
 
45
44
  image_path = cache_dir_path + target_file(:image)
46
45
  label_path = cache_dir_path + target_file(:label)
47
- base_url = "http://yann.lecun.com/exdb/mnist/"
46
+ base_url = self.class::BASE_URL
48
47
 
49
48
  unless image_path.exist?
50
49
  download(image_path, base_url + target_file(:image))
@@ -66,7 +65,7 @@ module Datasets
66
65
  n_bytes = n_uint32s * 4
67
66
  mnist_magic_number = 2051
68
67
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
69
- raise 'This is not MNIST image file' if magic != mnist_magic_number
68
+ raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
70
69
  n_images.times do |i|
71
70
  data = f.read(n_rows * n_cols)
72
71
  label = labels[i]
@@ -100,9 +99,13 @@ module Datasets
100
99
  n_bytes = n_uint32s * 2
101
100
  mnist_magic_number = 2049
102
101
  magic, n_labels = f.read(n_bytes).unpack('N2')
103
- raise 'This is not MNIST label file' if magic != mnist_magic_number
102
+ raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
103
  f.read(n_labels).unpack('C*')
105
104
  end
106
105
  end
106
+
107
+ def dataset_name
108
+ "MNIST"
109
+ end
107
110
  end
108
111
  end
@@ -0,0 +1,256 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
module Datasets
  # The UCI "Mushroom" dataset (agaricus-lepiota).
  #
  # Every column of the raw file is a one letter code; records yielded by
  # #each carry the value looked up in CONVERTERS instead.
  class Mushroom < Dataset
    Record = Struct.new(
      :label,
      :cap_shape,
      :cap_surface,
      :cap_color,
      :bruises,
      :odor,
      :gill_attachment,
      :gill_spacing,
      :gill_size,
      :gill_color,
      :stalk_shape,
      :stalk_root,
      :stalk_surface_above_ring,
      :stalk_surface_below_ring,
      :stalk_color_above_ring,
      :stalk_color_below_ring,
      :veil_type,
      :veil_color,
      :n_rings,
      :ring_type,
      :spore_print_color,
      :population,
      :habitat,
    )

    # Per column lookup tables from the raw code to the converted value.
    # Keys of the outer Hash are the Record member names.
    CONVERTERS = {
      label: {
        "p" => "poisonous",
        "e" => "edible",
      },
      cap_shape: {
        "b" => "bell",
        "c" => "conical",
        "x" => "convex",
        "f" => "flat",
        "k" => "knobbed",
        "s" => "sunken",
      },
      cap_surface: {
        "f" => "fibrous",
        "g" => "grooves",
        "y" => "scaly",
        "s" => "smooth",
      },
      cap_color: {
        "n" => "brown",
        "b" => "buff",
        "c" => "cinnamon",
        "g" => "gray",
        "r" => "green",
        "p" => "pink",
        "u" => "purple",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      bruises: {
        "t" => "bruises",
        "f" => "no",
      },
      odor: {
        "a" => "almond",
        "l" => "anise",
        "c" => "creosote",
        "y" => "fishy",
        "f" => "foul",
        "m" => "musty",
        "n" => "none",
        "p" => "pungent",
        "s" => "spicy",
      },
      gill_attachment: {
        "a" => "attached",
        "d" => "descending",
        "f" => "free",
        "n" => "notched",
      },
      gill_spacing: {
        "c" => "close",
        "w" => "crowded",
        "d" => "distant",
      },
      gill_size: {
        "b" => "broad",
        "n" => "narrow",
      },
      gill_color: {
        "k" => "black",
        "n" => "brown",
        "b" => "buff",
        "h" => "chocolate",
        "g" => "gray",
        "r" => "green",
        "o" => "orange",
        "p" => "pink",
        "u" => "purple",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      stalk_shape: {
        "e" => "enlarging",
        "t" => "tapering",
      },
      stalk_root: {
        "b" => "bulbous",
        "c" => "club",
        "u" => "cup",
        "e" => "equal",
        "z" => "rhizomorphs",
        "r" => "rooted",
        "?" => "missing",
      },
      stalk_surface_above_ring: {
        "f" => "fibrous",
        "y" => "scaly",
        "k" => "silky",
        "s" => "smooth",
      },
      stalk_surface_below_ring: {
        "f" => "fibrous",
        "y" => "scaly",
        "k" => "silky",
        "s" => "smooth",
      },
      stalk_color_above_ring: {
        "n" => "brown",
        "b" => "buff",
        "c" => "cinnamon",
        "g" => "gray",
        "o" => "orange",
        "p" => "pink",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      stalk_color_below_ring: {
        "n" => "brown",
        "b" => "buff",
        "c" => "cinnamon",
        "g" => "gray",
        "o" => "orange",
        "p" => "pink",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      veil_type: {
        "p" => "partial",
        "u" => "universal",
      },
      veil_color: {
        "n" => "brown",
        "o" => "orange",
        "w" => "white",
        "y" => "yellow",
      },
      n_rings: {
        "n" => 0,
        "o" => 1,
        "t" => 2,
      },
      ring_type: {
        "c" => "cobwebby",
        "e" => "evanescent",
        "f" => "flaring",
        "l" => "large",
        "n" => "none",
        "p" => "pendant",
        "s" => "sheathing",
        "z" => "zone",
      },
      spore_print_color: {
        "k" => "black",
        "n" => "brown",
        "b" => "buff",
        "h" => "chocolate",
        "r" => "green",
        "o" => "orange",
        "u" => "purple",
        "w" => "white",
        "y" => "yellow",
      },
      population: {
        "a" => "abundant",
        "c" => "clustered",
        "n" => "numerous",
        "s" => "scattered",
        "v" => "several",
        "y" => "solitary",
      },
      habitat: {
        "g" => "grasses",
        "l" => "leaves",
        "m" => "meadows",
        "p" => "paths",
        "u" => "urban",
        "w" => "waste",
        "d" => "woods",
      }
    }.each_value(&:freeze).freeze

    def initialize
      super()
      @metadata.id = "mushroom"
      @metadata.name = "Mushroom"
      @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
      # The description is a lambda so the names file is only read when the
      # lambda is called.
      @metadata.description = lambda do
        read_names
      end
    end

    # Yields one Record per data row with every column converted through
    # CONVERTERS. A code with no entry in its column's table becomes nil.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          # Rows without a first column carry no record.
          next if row[0].nil?
          record = Record.new(*row)
          record.members.each do |member|
            record[member] = CONVERTERS[member][record[member]]
          end
          yield(record)
        end
      end
    end

    private
    # Downloads the data file into the cache directory unless it is already
    # there, then yields it opened as a CSV reader.
    def open_data
      data_path = cache_dir_path + "agaricus-lepiota.data"
      unless data_path.exist?
        data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
        download(data_path, data_url)
      end
      CSV.open(data_path) do |csv|
        yield(csv)
      end
    end

    # Downloads the names file into the cache directory unless it is already
    # there and returns its content.
    def read_names
      names_path = cache_dir_path + "agaricus-lepiota.names"
      unless names_path.exist?
        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
        download(names_path, names_url)
      end
      names_path.read
    end
  end
end
@@ -0,0 +1,125 @@
1
require "csv"

require_relative "dataset"
2
+
3
module Datasets
  # Per species raw data files that Penguins is assembled from.
  module PenguinsRawData
    # One row of a raw file; members follow the column order of the CSV.
    Record = Struct.new(:study_name,
                        :sample_number,
                        :species,
                        :region,
                        :island,
                        :stage,
                        :individual_id,
                        :clutch_completion,
                        :date_egg,
                        :culmen_length_mm,
                        :culmen_depth_mm,
                        :flipper_length_mm,
                        :body_mass_g,
                        :sex,
                        :delta_15_n_permil,
                        :delta_13_c_permil,
                        :comments)

    # Shared implementation for the species datasets. A subclass only has
    # to define the URL constant; the species name is taken from the
    # subclass name.
    class SpeciesBase < Dataset
      def initialize
        super()
        species = self.class.name.split("::").last.downcase
        @metadata.id = "palmerpenguins-raw-#{species}"
        @metadata.url = self.class::URL
        @metadata.licenses = ["CC0"]
        @data_path = cache_dir_path + "penguins" + (species + ".csv")
      end

      # Location of the cached CSV file of this species.
      attr_reader :data_path

      # Yields one Record per CSV row whose first column is present.
      # Returns an Enumerator when no block is given.
      def each
        return to_enum(__method__) unless block_given?

        open_data do |csv|
          csv.each do |row|
            next if row[0].nil?
            record = Record.new(*row.fields)
            yield record
          end
        end
      end

      # Downloads the file on first use, then yields it opened as a CSV
      # reader. The first row is consumed as headers and all built-in CSV
      # converters are applied to the fields.
      private def open_data
        download unless data_path.exist?
        CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
          yield csv
        end
      end

      # Fetches metadata.url into data_path using the inherited download.
      private def download
        super(data_path, metadata.url)
      end
    end

    # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
    class Adelie < SpeciesBase
      DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
    end

    # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
    class Gentoo < SpeciesBase
      DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
    end

    # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
    class Chinstrap < SpeciesBase
      DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
    end
  end

  # This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
  class Penguins < Dataset
    Record = Struct.new(:species,
                        :island,
                        :bill_length_mm,
                        :bill_depth_mm,
                        :flipper_length_mm,
                        :body_mass_g,
                        :sex,
                        :year)

    def initialize
      super()
      @metadata.id = "palmerpenguins"
      @metadata.name = "palmerpenguins"
      @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
      @metadata.licenses = ["CC0"]
      @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
    end

    # Yields one Record per raw row, going through the Adelie, Gentoo and
    # Chinstrap raw datasets in that order.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      species_classes = [
        PenguinsRawData::Adelie,
        PenguinsRawData::Gentoo,
        PenguinsRawData::Chinstrap
      ]

      species_classes.each do |species_class|
        species_class.new.each do |raw_record|
          yield convert_record(raw_record)
        end
      end
    end

    # Builds a Record from a PenguinsRawData::Record.
    #
    # The species is reduced to its first word, the culmen columns are
    # exposed as bill columns, flipper length and body mass are truncated to
    # integers, and only the year of the egg date is kept. Missing (nil)
    # values stay nil.
    #
    # NOTE(review): sex is only lower-cased, not validated - confirm the
    # raw files contain no placeholder values.
    private def convert_record(raw_record)
      Record.new(raw_record.species.split(' ')[0],
                 raw_record.island,
                 raw_record.culmen_length_mm,
                 raw_record.culmen_depth_mm,
                 raw_record.flipper_length_mm&.to_i,
                 raw_record.body_mass_g&.to_i,
                 raw_record.sex&.downcase,
                 # Relies on the CSV converters having turned the date
                 # column into an object that responds to #year.
                 raw_record.date_egg&.year)
    end
  end
end