red-datasets 0.0.6 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
@@ -0,0 +1,135 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
module Datasets
  # Reader for the datasets listed on the LIBSVM data page.
  #
  # Every yielded record corresponds to one line of a LIBSVM-format file:
  # a label (or several comma-separated labels) followed by sparse
  # "index:value" pairs separated by single spaces.
  class LIBSVM < Dataset
    # One parsed line of a LIBSVM file.
    class Record
      # The parsed label: a single number, or an Array of numbers when the
      # line carried several comma-separated labels.
      attr_reader :label
      # Dense Array of feature values. Positions that did not appear on the
      # line hold the default feature value the dataset was created with.
      attr_reader :features

      def initialize(label, features)
        @label = label
        @features = features
      end

      # Returns the feature stored at the zero-based +index+.
      def [](index)
        @features[index]
      end

      # Returns a Hash holding the label under +:label+ and every feature
      # under its zero-based Integer position.
      def to_h
        hash = {
          label: @label,
        }
        @features.each_with_index do |feature, i|
          hash[i] = feature
        end
        hash
      end

      # Returns an Array made of the label followed by all features.
      def values
        [@label] + @features
      end
    end

    # @param name [String] dataset name; it must equal the name of one of
    #   the records yielded by LIBSVMDatasetList.
    # @param note [String, nil] selects the file whose note equals this
    #   value; when nil the first file of the dataset is used.
    # @param default_feature_value value stored for every feature that a
    #   line does not mention.
    # @raise [ArgumentError] when the name or the note is not available.
    def initialize(name,
                   note: nil,
                   default_feature_value: 0)
      super()
      @libsvm_dataset_metadata = fetch_dataset_info(name)
      @file = choose_file(note)
      @default_feature_value = default_feature_value
      @metadata.id = "libsvm-#{normalize_name(name)}"
      @metadata.name = "LIBSVM dataset: #{name}"
      @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
    end

    # Yields one Record per line of the selected file.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input|
        n_features = @libsvm_dataset_metadata.n_features
        csv = CSV.new(input, col_sep: " ")
        csv.each do |row|
          label = parse_label(row.shift)
          features = [@default_feature_value] * n_features
          row.each do |column|
            # CSV reports empty fields (e.g. from consecutive or trailing
            # separators) as nil; there is nothing to parse in them.
            next if column.nil?
            index, value = column.split(":", 2)
            # The index read from the file is decremented by one to get
            # the position in the features Array.
            features[Integer(index, 10) - 1] = parse_value(value)
          end
          yield(Record.new(label, features))
        end
      end
    end

    private
    # Returns the LIBSVMDatasetList record whose name equals +name+.
    # Raises ArgumentError listing the names seen so far when none matches.
    def fetch_dataset_info(name)
      list = LIBSVMDatasetList.new
      available_datasets = []
      list.each do |record|
        available_datasets << record.name
        if record.name == name
          return record
        end
      end
      message = "unavailable LIBSVM dataset: #{name.inspect}: "
      message << "available datasets: ["
      message << available_datasets.collect(&:inspect).join(", ")
      message << "]"
      raise ArgumentError, message
    end

    # Returns the file of the dataset whose note equals +note+, or the
    # first file when +note+ is nil.
    # Raises ArgumentError listing the available notes when none matches.
    def choose_file(note)
      files = @libsvm_dataset_metadata.files
      return files.first if note.nil?

      available_notes = []
      # Iterate with #each, not #find: the block's last expression is
      # truthy whenever a note is collected, so #find would stop at the
      # first non-matching file that has a note and later files would
      # never be examined.
      files.each do |file|
        return file if file.note == note
        available_notes << file.note if file.note
      end

      name = @libsvm_dataset_metadata.name
      message = "unavailable note: #{name}: #{note.inspect}: "
      message << "available notes: ["
      message << available_notes.collect(&:inspect).join(", ")
      message << "]"
      raise ArgumentError, message
    end

    # Downloads the selected file into the cache directory when it is not
    # there yet, then passes an input object to the block. Files with a
    # ".bz2" extension go through extract_bz2; others are opened directly.
    # NOTE(review): cache_dir_path, download and extract_bz2 are not defined
    # in this class; presumably they are inherited from Dataset.
    def open_data(&block)
      data_path = cache_dir_path + @file.name
      unless data_path.exist?
        download(data_path, @file.url)
      end
      if data_path.extname == ".bz2"
        extract_bz2(data_path, &block)
      else
        File.open(data_path, &block)
      end
    end

    # Builds the identifier fragment used in the metadata id: parentheses
    # are removed, runs of spaces, underscores and semicolons become a
    # single "-", and the result is downcased.
    def normalize_name(name)
      name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase
    end

    # Parses a label field. A single label is returned as a number;
    # several comma-separated labels are returned as an Array of numbers.
    def parse_label(label)
      labels = label.split(",").collect do |value|
        parse_value(value)
      end
      if labels.size == 1
        labels[0]
      else
        labels
      end
    end

    # Converts +value+ to a Float when it contains a ".", otherwise to a
    # base-10 Integer. Invalid text raises ArgumentError from Float/Integer.
    def parse_value(value)
      if value.include?(".")
        Float(value)
      else
        Integer(value, 10)
      end
    end
  end
end
@@ -2,10 +2,9 @@ require 'zlib'
2
2
 
3
3
  require_relative "dataset"
4
4
 
5
- class SetTypeError < StandardError; end
6
-
7
5
  module Datasets
8
6
  class MNIST < Dataset
7
+ BASE_URL = "http://yann.lecun.com/exdb/mnist/"
9
8
 
10
9
  class Record < Struct.new(:data, :label)
11
10
  def pixels
@@ -26,9 +25,9 @@ module Datasets
26
25
 
27
26
  super()
28
27
 
29
- @metadata.id = "mnist-#{type}"
30
- @metadata.name = "MNIST: #{type}"
31
- @metadata.url = "http://yann.lecun.com/exdb/mnist/"
28
+ @metadata.id = "#{dataset_name.downcase}-#{type}"
29
+ @metadata.name = "#{dataset_name}: #{type}"
30
+ @metadata.url = self.class::BASE_URL
32
31
  @type = type
33
32
 
34
33
  case type
@@ -44,7 +43,7 @@ module Datasets
44
43
 
45
44
  image_path = cache_dir_path + target_file(:image)
46
45
  label_path = cache_dir_path + target_file(:label)
47
- base_url = "http://yann.lecun.com/exdb/mnist/"
46
+ base_url = self.class::BASE_URL
48
47
 
49
48
  unless image_path.exist?
50
49
  download(image_path, base_url + target_file(:image))
@@ -66,7 +65,7 @@ module Datasets
66
65
  n_bytes = n_uint32s * 4
67
66
  mnist_magic_number = 2051
68
67
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
69
- raise 'This is not MNIST image file' if magic != mnist_magic_number
68
+ raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
70
69
  n_images.times do |i|
71
70
  data = f.read(n_rows * n_cols)
72
71
  label = labels[i]
@@ -100,9 +99,13 @@ module Datasets
100
99
  n_bytes = n_uint32s * 2
101
100
  mnist_magic_number = 2049
102
101
  magic, n_labels = f.read(n_bytes).unpack('N2')
103
- raise 'This is not MNIST label file' if magic != mnist_magic_number
102
+ raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
103
  f.read(n_labels).unpack('C*')
105
104
  end
106
105
  end
106
+
107
+ def dataset_name
108
+ "MNIST"
109
+ end
107
110
  end
108
111
  end
@@ -0,0 +1,256 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
module Datasets
  # The UCI "Mushroom" dataset (agaricus-lepiota).
  #
  # Each record is one row of the data file with every single-letter code
  # replaced by the value found in CONVERTERS.
  class Mushroom < Dataset
    # One row of the data file; members are in the column order of the file.
    Record = Struct.new(
      :label,
      :cap_shape,
      :cap_surface,
      :cap_color,
      :bruises,
      :odor,
      :gill_attachment,
      :gill_spacing,
      :gill_size,
      :gill_color,
      :stalk_shape,
      :stalk_root,
      :stalk_surface_above_ring,
      :stalk_surface_below_ring,
      :stalk_color_above_ring,
      :stalk_color_below_ring,
      :veil_type,
      :veil_color,
      :n_rings,
      :ring_type,
      :spore_print_color,
      :population,
      :habitat,
    )

    def initialize
      super()
      @metadata.id = "mushroom"
      @metadata.name = "Mushroom"
      @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
      # The description is a lambda so the names file is only read (and
      # downloaded if needed) when the lambda is actually called.
      @metadata.description = lambda do
        read_names
      end
    end

    # Yields one Record per data row, with codes translated through
    # CONVERTERS. Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          # Rows without a first field carry no record.
          next if row[0].nil?
          record = Record.new(*row)
          record.members.each do |member|
            # A code with no entry in the member's table becomes nil.
            record[member] = CONVERTERS[member][record[member]]
          end
          yield(record)
        end
      end
    end

    private
    # Downloads the data file into the cache directory when it is missing
    # and yields a CSV reader opened on it.
    def open_data
      data_path = cache_dir_path + "agaricus-lepiota.data"
      unless data_path.exist?
        # Fetched over HTTPS, like the names file below.
        data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
        download(data_path, data_url)
      end
      CSV.open(data_path) do |csv|
        yield(csv)
      end
    end

    # Downloads the names file into the cache directory when it is missing
    # and returns its content.
    def read_names
      names_path = cache_dir_path + "agaricus-lepiota.names"
      unless names_path.exist?
        names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
        download(names_path, names_url)
      end
      names_path.read
    end

    # Maps each Record member to a table translating the single-letter
    # code used in the data file into its readable value.
    CONVERTERS = {
      label: {
        "p" => "poisonous",
        "e" => "edible",
      },
      cap_shape: {
        "b" => "bell",
        "c" => "conical",
        "x" => "convex",
        "f" => "flat",
        "k" => "knobbed",
        "s" => "sunken",
      },
      cap_surface: {
        "f" => "fibrous",
        "g" => "grooves",
        "y" => "scaly",
        "s" => "smooth",
      },
      cap_color: {
        "n" => "brown",
        "b" => "buff",
        "c" => "cinnamon",
        "g" => "gray",
        "r" => "green",
        "p" => "pink",
        "u" => "purple",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      bruises: {
        "t" => "bruises",
        "f" => "no",
      },
      odor: {
        "a" => "almond",
        "l" => "anise",
        "c" => "creosote",
        "y" => "fishy",
        "f" => "foul",
        "m" => "musty",
        "n" => "none",
        "p" => "pungent",
        "s" => "spicy",
      },
      gill_attachment: {
        "a" => "attached",
        "d" => "descending",
        "f" => "free",
        "n" => "notched",
      },
      gill_spacing: {
        "c" => "close",
        "w" => "crowded",
        "d" => "distant",
      },
      gill_size: {
        "b" => "broad",
        "n" => "narrow",
      },
      gill_color: {
        "k" => "black",
        "n" => "brown",
        "b" => "buff",
        "h" => "chocolate",
        "g" => "gray",
        "r" => "green",
        "o" => "orange",
        "p" => "pink",
        "u" => "purple",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      stalk_shape: {
        "e" => "enlarging",
        "t" => "tapering",
      },
      stalk_root: {
        "b" => "bulbous",
        "c" => "club",
        "u" => "cup",
        "e" => "equal",
        "z" => "rhizomorphs",
        "r" => "rooted",
        "?" => "missing",
      },
      stalk_surface_above_ring: {
        "f" => "fibrous",
        "y" => "scaly",
        "k" => "silky",
        "s" => "smooth",
      },
      stalk_surface_below_ring: {
        "f" => "fibrous",
        "y" => "scaly",
        "k" => "silky",
        "s" => "smooth",
      },
      stalk_color_above_ring: {
        "n" => "brown",
        "b" => "buff",
        "c" => "cinnamon",
        "g" => "gray",
        "o" => "orange",
        "p" => "pink",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      stalk_color_below_ring: {
        "n" => "brown",
        "b" => "buff",
        "c" => "cinnamon",
        "g" => "gray",
        "o" => "orange",
        "p" => "pink",
        "e" => "red",
        "w" => "white",
        "y" => "yellow",
      },
      veil_type: {
        "p" => "partial",
        "u" => "universal",
      },
      veil_color: {
        "n" => "brown",
        "o" => "orange",
        "w" => "white",
        "y" => "yellow",
      },
      # Ring counts are the only values converted to Integers.
      n_rings: {
        "n" => 0,
        "o" => 1,
        "t" => 2,
      },
      ring_type: {
        "c" => "cobwebby",
        "e" => "evanescent",
        "f" => "flaring",
        "l" => "large",
        "n" => "none",
        "p" => "pendant",
        "s" => "sheathing",
        "z" => "zone",
      },
      spore_print_color: {
        "k" => "black",
        "n" => "brown",
        "b" => "buff",
        "h" => "chocolate",
        "r" => "green",
        "o" => "orange",
        "u" => "purple",
        "w" => "white",
        "y" => "yellow",
      },
      population: {
        "a" => "abundant",
        "c" => "clustered",
        "n" => "numerous",
        "s" => "scattered",
        "v" => "several",
        "y" => "solitary",
      },
      habitat: {
        "g" => "grasses",
        "l" => "leaves",
        "m" => "meadows",
        "p" => "paths",
        "u" => "urban",
        "w" => "waste",
        "d" => "woods",
      }
    }
  end
end
@@ -0,0 +1,125 @@
1
require "csv"

require_relative "dataset"
2
+
3
module Datasets
  # Per-species raw penguin data, one dataset class per species.
  module PenguinsRawData
    # One row of a raw species CSV file, in the column order of the file.
    Record = Struct.new(:study_name,
                        :sample_number,
                        :species,
                        :region,
                        :island,
                        :stage,
                        :individual_id,
                        :clutch_completion,
                        :date_egg,
                        :culmen_length_mm,
                        :culmen_depth_mm,
                        :flipper_length_mm,
                        :body_mass_g,
                        :sex,
                        :delta_15_n_permil,
                        :delta_13_c_permil,
                        :comments)

    # Shared behavior of the per-species datasets. Subclasses only supply
    # the URL constant the data is downloaded from.
    class SpeciesBase < Dataset
      def initialize
        super
        # The species is the downcased, unqualified class name
        # (e.g. "adelie" for PenguinsRawData::Adelie).
        species = self.class.name.split("::").last.downcase
        @metadata.id = "palmerpenguins-raw-#{species}"
        @metadata.url = self.class::URL
        @metadata.licenses = ["CC0"]
        @data_path = cache_dir_path + "penguins" + (species + ".csv")
      end

      # Path of the cached CSV file for this species.
      attr_reader :data_path

      # Yields one Record per CSV row that has a first field.
      # Returns an Enumerator when no block is given.
      def each
        return to_enum(__method__) unless block_given?

        open_data do |csv|
          csv.each do |row|
            next if row[0].nil?
            record = Record.new(*row.fields)
            yield record
          end
        end
      end

      # Downloads the file when it is not cached yet and yields a CSV
      # reader that treats the first row as headers and applies all
      # built-in converters.
      private def open_data
        download unless data_path.exist?
        CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
          yield csv
        end
      end

      # Zero-argument wrapper that calls the inherited download with this
      # species' path and URL.
      private def download
        super(data_path, metadata.url)
      end
    end

    # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
    class Adelie < SpeciesBase
      DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
    end

    # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
    class Gentoo < SpeciesBase
      DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
    end

    # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
    class Chinstrap < SpeciesBase
      DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
    end
  end

  # This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
  #
  # Records are built from the three raw species datasets, in the order
  # Adelie, Gentoo, Chinstrap.
  class Penguins < Dataset
    # One simplified penguin observation.
    Record = Struct.new(:species,
                        :island,
                        :bill_length_mm,
                        :bill_depth_mm,
                        :flipper_length_mm,
                        :body_mass_g,
                        :sex,
                        :year)

    def initialize
      super
      @metadata.id = "palmerpenguins"
      @metadata.name = "palmerpenguins"
      @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
      @metadata.licenses = ["CC0"]
      @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
    end

    # Yields one converted Record for every raw record of every species.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      species_classes = [
        PenguinsRawData::Adelie,
        PenguinsRawData::Gentoo,
        PenguinsRawData::Chinstrap
      ]

      species_classes.each do |species_class|
        species_class.new.each do |raw_record|
          yield convert_record(raw_record)
        end
      end
    end

    # Builds a simplified Record from a raw one:
    # - species keeps only the first whitespace-separated word;
    # - culmen length/depth are passed through as bill length/depth;
    # - flipper length and body mass are converted with #to_i when present;
    # - sex is downcased when present;
    # - year is taken from date_egg when present.
    # NOTE(review): assumes the CSV converters turned date_egg into an
    # object responding to #year - confirm against the raw files.
    private def convert_record(raw_record)
      Record.new(raw_record.species.split(' ')[0],
                 raw_record.island,
                 raw_record.culmen_length_mm,
                 raw_record.culmen_depth_mm,
                 raw_record.flipper_length_mm&.to_i,
                 raw_record.body_mass_g&.to_i,
                 raw_record.sex&.downcase,
                 raw_record.date_egg&.year)
    end
  end
end