red-datasets 0.0.7 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class LIBSVM < Dataset
7
+ class Record
8
+ attr_reader :label
9
+ attr_reader :features
10
+ def initialize(label, features)
11
+ @label = label
12
+ @features = features
13
+ end
14
+
15
+ def [](index)
16
+ @features[index]
17
+ end
18
+
19
+ def to_h
20
+ hash = {
21
+ label: @label,
22
+ }
23
+ @features.each_with_index do |feature, i|
24
+ hash[i] = feature
25
+ end
26
+ hash
27
+ end
28
+
29
+ def values
30
+ [@label] + @features
31
+ end
32
+ end
33
+
34
+ def initialize(name,
35
+ note: nil,
36
+ default_feature_value: 0)
37
+ super()
38
+ @libsvm_dataset_metadata = fetch_dataset_info(name)
39
+ @file = choose_file(note)
40
+ @default_feature_value = default_feature_value
41
+ @metadata.id = "libsvm-#{normalize_name(name)}"
42
+ @metadata.name = "LIBSVM dataset: #{name}"
43
+ @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
44
+ end
45
+
46
+ def each
47
+ return to_enum(__method__) unless block_given?
48
+
49
+ open_data do |input|
50
+ n_features = @libsvm_dataset_metadata.n_features
51
+ csv = CSV.new(input, col_sep: " ")
52
+ csv.each do |row|
53
+ label = parse_label(row.shift)
54
+ features = [@default_feature_value] * n_features
55
+ row.each do |column|
56
+ next if column.nil?
57
+ index, value = column.split(":", 2)
58
+ features[Integer(index, 10) - 1] = parse_value(value)
59
+ end
60
+ yield(Record.new(label, features))
61
+ end
62
+ end
63
+ end
64
+
65
+ private
66
+ def fetch_dataset_info(name)
67
+ list = LIBSVMDatasetList.new
68
+ available_datasets = []
69
+ list.each do |record|
70
+ available_datasets << record.name
71
+ if record.name == name
72
+ return record
73
+ end
74
+ end
75
+ message = "unavailable LIBSVM dataset: #{name.inspect}: "
76
+ message << "available datasets: ["
77
+ message << available_datasets.collect(&:inspect).join(", ")
78
+ message << "]"
79
+ raise ArgumentError, message
80
+ end
81
+
82
+ def choose_file(note)
83
+ files = @libsvm_dataset_metadata.files
84
+ return files.first if note.nil?
85
+
86
+ available_notes = []
87
+ @libsvm_dataset_metadata.files.find do |file|
88
+ return file if file.note == note
89
+ available_notes << file.note if file.note
90
+ end
91
+
92
+ name = @libsvm_dataset_metadata.name
93
+ message = "unavailable note: #{name}: #{note.inspect}: "
94
+ message << "available notes: ["
95
+ message << available_notes.collect(&:inspect).join(", ")
96
+ message << "]"
97
+ raise ArgumentError, message
98
+ end
99
+
100
+ def open_data(&block)
101
+ data_path = cache_dir_path + @file.name
102
+ unless data_path.exist?
103
+ download(data_path, @file.url)
104
+ end
105
+ if data_path.extname == ".bz2"
106
+ extract_bz2(data_path, &block)
107
+ else
108
+ File.open(data_path, &block)
109
+ end
110
+ end
111
+
112
+ def normalize_name(name)
113
+ name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase
114
+ end
115
+
116
+ def parse_label(label)
117
+ labels = label.split(",").collect do |value|
118
+ parse_value(value)
119
+ end
120
+ if labels.size == 1
121
+ labels[0]
122
+ else
123
+ labels
124
+ end
125
+ end
126
+
127
+ def parse_value(value)
128
+ if value.include?(".")
129
+ Float(value)
130
+ else
131
+ Integer(value, 10)
132
+ end
133
+ end
134
+ end
135
+ end
@@ -2,8 +2,6 @@ require 'zlib'
2
2
 
3
3
  require_relative "dataset"
4
4
 
5
- class SetTypeError < StandardError; end
6
-
7
5
  module Datasets
8
6
  class MNIST < Dataset
9
7
  BASE_URL = "http://yann.lecun.com/exdb/mnist/"
@@ -0,0 +1,256 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Mushroom < Dataset
7
+ Record = Struct.new(
8
+ :label,
9
+ :cap_shape,
10
+ :cap_surface,
11
+ :cap_color,
12
+ :bruises,
13
+ :odor,
14
+ :gill_attachment,
15
+ :gill_spacing,
16
+ :gill_size,
17
+ :gill_color,
18
+ :stalk_shape,
19
+ :stalk_root,
20
+ :stalk_surface_above_ring,
21
+ :stalk_surface_below_ring,
22
+ :stalk_color_above_ring,
23
+ :stalk_color_below_ring,
24
+ :veil_type,
25
+ :veil_color,
26
+ :n_rings,
27
+ :ring_type,
28
+ :spore_print_color,
29
+ :population,
30
+ :habitat,
31
+ )
32
+
33
+ def initialize
34
+ super()
35
+ @metadata.id = "mushroom"
36
+ @metadata.name = "Mushroom"
37
+ @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
38
+ @metadata.description = lambda do
39
+ read_names
40
+ end
41
+ end
42
+
43
+ def each
44
+ return to_enum(__method__) unless block_given?
45
+
46
+ open_data do |csv|
47
+ csv.each do |row|
48
+ next if row[0].nil?
49
+ record = Record.new(*row)
50
+ record.members.each do |member|
51
+ record[member] = CONVERTERS[member][record[member]]
52
+ end
53
+ yield(record)
54
+ end
55
+ end
56
+ end
57
+
58
+ private
59
+ def open_data
60
+ data_path = cache_dir_path + "agaricus-lepiota.data"
61
+ unless data_path.exist?
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
+ download(data_path, data_url)
64
+ end
65
+ CSV.open(data_path) do |csv|
66
+ yield(csv)
67
+ end
68
+ end
69
+
70
+ def read_names
71
+ names_path = cache_dir_path + "agaricus-lepiota.names"
72
+ unless names_path.exist?
73
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
74
+ download(names_path, names_url)
75
+ end
76
+ names_path.read
77
+ end
78
+
79
+ CONVERTERS = {
80
+ label: {
81
+ "p" => "poisonous",
82
+ "e" => "edible",
83
+ },
84
+ cap_shape: {
85
+ "b" => "bell",
86
+ "c" => "conical",
87
+ "x" => "convex",
88
+ "f" => "flat",
89
+ "k" => "knobbed",
90
+ "s" => "sunken",
91
+ },
92
+ cap_surface: {
93
+ "f" => "fibrous",
94
+ "g" => "grooves",
95
+ "y" => "scaly",
96
+ "s" => "smooth",
97
+ },
98
+ cap_color: {
99
+ "n" => "brown",
100
+ "b" => "buff",
101
+ "c" => "cinnamon",
102
+ "g" => "gray",
103
+ "r" => "green",
104
+ "p" => "pink",
105
+ "u" => "purple",
106
+ "e" => "red",
107
+ "w" => "white",
108
+ "y" => "yellow",
109
+ },
110
+ bruises: {
111
+ "t" => "bruises",
112
+ "f" => "no",
113
+ },
114
+ odor: {
115
+ "a" => "almond",
116
+ "l" => "anise",
117
+ "c" => "creosote",
118
+ "y" => "fishy",
119
+ "f" => "foul",
120
+ "m" => "musty",
121
+ "n" => "none",
122
+ "p" => "pungent",
123
+ "s" => "spicy",
124
+ },
125
+ gill_attachment: {
126
+ "a" => "attached",
127
+ "d" => "descending",
128
+ "f" => "free",
129
+ "n" => "notched",
130
+ },
131
+ gill_spacing: {
132
+ "c" => "close",
133
+ "w" => "crowded",
134
+ "d" => "distant",
135
+ },
136
+ gill_size: {
137
+ "b" => "broad",
138
+ "n" => "narrow",
139
+ },
140
+ gill_color: {
141
+ "k" => "black",
142
+ "n" => "brown",
143
+ "b" => "buff",
144
+ "h" => "chocolate",
145
+ "g" => "gray",
146
+ "r" => "green",
147
+ "o" => "orange",
148
+ "p" => "pink",
149
+ "u" => "purple",
150
+ "e" => "red",
151
+ "w" => "white",
152
+ "y" => "yellow",
153
+ },
154
+ stalk_shape: {
155
+ "e" => "enlarging",
156
+ "t" => "tapering",
157
+ },
158
+ stalk_root: {
159
+ "b" => "bulbous",
160
+ "c" => "club",
161
+ "u" => "cup",
162
+ "e" => "equal",
163
+ "z" => "rhizomorphs",
164
+ "r" => "rooted",
165
+ "?" => "missing",
166
+ },
167
+ stalk_surface_above_ring: {
168
+ "f" => "fibrous",
169
+ "y" => "scaly",
170
+ "k" => "silky",
171
+ "s" => "smooth",
172
+ },
173
+ stalk_surface_below_ring: {
174
+ "f" => "fibrous",
175
+ "y" => "scaly",
176
+ "k" => "silky",
177
+ "s" => "smooth",
178
+ },
179
+ stalk_color_above_ring: {
180
+ "n" => "brown",
181
+ "b" => "buff",
182
+ "c" => "cinnamon",
183
+ "g" => "gray",
184
+ "o" => "orange",
185
+ "p" => "pink",
186
+ "e" => "red",
187
+ "w" => "white",
188
+ "y" => "yellow",
189
+ },
190
+ stalk_color_below_ring: {
191
+ "n" => "brown",
192
+ "b" => "buff",
193
+ "c" => "cinnamon",
194
+ "g" => "gray",
195
+ "o" => "orange",
196
+ "p" => "pink",
197
+ "e" => "red",
198
+ "w" => "white",
199
+ "y" => "yellow",
200
+ },
201
+ veil_type: {
202
+ "p" => "partial",
203
+ "u" => "universal",
204
+ },
205
+ veil_color: {
206
+ "n" => "brown",
207
+ "o" => "orange",
208
+ "w" => "white",
209
+ "y" => "yellow",
210
+ },
211
+ n_rings: {
212
+ "n" => 0,
213
+ "o" => 1,
214
+ "t" => 2,
215
+ },
216
+ ring_type: {
217
+ "c" => "cobwebby",
218
+ "e" => "evanescent",
219
+ "f" => "flaring",
220
+ "l" => "large",
221
+ "n" => "none",
222
+ "p" => "pendant",
223
+ "s" => "sheathing",
224
+ "z" => "zone",
225
+ },
226
+ spore_print_color: {
227
+ "k" => "black",
228
+ "n" => "brown",
229
+ "b" => "buff",
230
+ "h" => "chocolate",
231
+ "r" => "green",
232
+ "o" => "orange",
233
+ "u" => "purple",
234
+ "w" => "white",
235
+ "y" => "yellow",
236
+ },
237
+ population: {
238
+ "a" => "abundant",
239
+ "c" => "clustered",
240
+ "n" => "numerous",
241
+ "s" => "scattered",
242
+ "v" => "several",
243
+ "y" => "solitary",
244
+ },
245
+ habitat: {
246
+ "g" => "grasses",
247
+ "l" => "leaves",
248
+ "m" => "meadows",
249
+ "p" => "paths",
250
+ "u" => "urban",
251
+ "w" => "waste",
252
+ "d" => "woods",
253
+ }
254
+ }
255
+ end
256
+ end
@@ -0,0 +1,146 @@
1
require "csv"

require_relative "dataset"
2
+
3
+ module Datasets
4
+ module PenguinsRawData
5
+ Record = Struct.new(:study_name,
6
+ :sample_number,
7
+ :species,
8
+ :region,
9
+ :island,
10
+ :stage,
11
+ :individual_id,
12
+ :clutch_completion,
13
+ :date_egg,
14
+ :culmen_length_mm,
15
+ :culmen_depth_mm,
16
+ :flipper_length_mm,
17
+ :body_mass_g,
18
+ :sex,
19
+ :delta_15_n_permil,
20
+ :delta_13_c_permil,
21
+ :comments)
22
+ class SpeciesBase < Dataset
23
+ def initialize
24
+ super
25
+ species = self.class.name.split("::").last.downcase
26
+ @metadata.id = "palmerpenguins-raw-#{species}"
27
+ @metadata.url = self.class::URL
28
+ @metadata.licenses = ["CC0"]
29
+ @data_path = cache_dir_path + "penguins" + (species + ".csv")
30
+ end
31
+
32
+ attr_reader :data_path
33
+
34
+ def each
35
+ return to_enum(__method__) unless block_given?
36
+
37
+ open_data do |csv|
38
+ csv.each do |row|
39
+ next if row[0].nil?
40
+ record = Record.new(*row.fields)
41
+ yield record
42
+ end
43
+ end
44
+ end
45
+
46
+ private def open_data
47
+ download unless data_path.exist?
48
+ CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
49
+ yield csv
50
+ end
51
+ end
52
+
53
+ private def download
54
+ super(data_path, metadata.url)
55
+ end
56
+ end
57
+
58
+ # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
59
+ class Adelie < SpeciesBase
60
+ DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
61
+ URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
62
+ end
63
+
64
+ # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
65
+ class Chinstrap < SpeciesBase
66
+ DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
67
+ URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
68
+ end
69
+
70
+ # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
71
+ class Gentoo < SpeciesBase
72
+ DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
73
+ URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
74
+ end
75
+ end
76
+
77
+ # This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
78
+ class Penguins < Dataset
79
+ Record = Struct.new(:species,
80
+ :island,
81
+ :bill_length_mm,
82
+ :bill_depth_mm,
83
+ :flipper_length_mm,
84
+ :body_mass_g,
85
+ :sex,
86
+ :year)
87
+
88
+ def initialize
89
+ super
90
+ @metadata.id = "palmerpenguins"
91
+ @metadata.name = "palmerpenguins"
92
+ @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
93
+ @metadata.licenses = ["CC0"]
94
+ @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
95
+ end
96
+
97
+ def each(&block)
98
+ return to_enum(__method__) unless block_given?
99
+
100
+ species_classes = [
101
+ PenguinsRawData::Adelie,
102
+ PenguinsRawData::Chinstrap,
103
+ PenguinsRawData::Gentoo,
104
+ ]
105
+
106
+ species_classes.each do |species_class|
107
+ species_class.new.each do |raw_record|
108
+ yield convert_record(raw_record)
109
+ end
110
+ end
111
+ end
112
+
113
+ private def convert_record(raw_record)
114
+ Record.new(*cleanse_fields(raw_record))
115
+ end
116
+
117
+ private def cleanse_fields(raw_record)
118
+ species = raw_record.species.split(' ')[0]
119
+ flipper_length_mm = raw_record.flipper_length_mm&.to_i
120
+ body_mass_g = raw_record.body_mass_g&.to_i
121
+ sex = normalize_sex(raw_record.sex)
122
+ year = raw_record.date_egg&.year
123
+
124
+ [
125
+ species,
126
+ raw_record.island,
127
+ raw_record.culmen_length_mm,
128
+ raw_record.culmen_depth_mm,
129
+ flipper_length_mm,
130
+ body_mass_g,
131
+ sex,
132
+ year
133
+ ]
134
+ end
135
+
136
+ private def normalize_sex(val)
137
+ val = val&.downcase
138
+ case val
139
+ when "female", "male", nil
140
+ val
141
+ else
142
+ nil
143
+ end
144
+ end
145
+ end
146
+ end