red-datasets 0.0.7 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,135 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class LIBSVM < Dataset
7
# A single LIBSVM sample: a label plus a list of feature values.
class Record
  attr_reader :label, :features

  # label    - the label of the sample, stored as given.
  # features - the feature values, addressed by 0-based position.
  def initialize(label, features)
    @label = label
    @features = features
  end

  # Returns the feature stored at the given position.
  def [](index)
    features[index]
  end

  # Returns a Hash with :label first, followed by one entry per feature
  # keyed by its 0-based position.
  def to_h
    features.each_with_index.each_with_object({label: label}) do |(feature, i), hash|
      hash[i] = feature
    end
  end

  # Returns a new Array holding the label followed by every feature.
  def values
    [label] + features
  end
end
33
+
34
# Sets up a dataset taken from the LIBSVM dataset list.
#
# name                  - dataset name, resolved through fetch_dataset_info.
# note                  - optional note passed to choose_file to pick one of
#                         the dataset's files.
# default_feature_value - value given to features a row does not list.
def initialize(name, note: nil, default_feature_value: 0)
  super()
  @default_feature_value = default_feature_value
  @libsvm_dataset_metadata = fetch_dataset_info(name)
  @file = choose_file(note)
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
  @metadata.name = "LIBSVM dataset: #{name}"
  @metadata.id = "libsvm-#{normalize_name(name)}"
end
45
+
46
# Yields one Record per data row.
#
# A row has the form "<label> <index>:<value> ..." with 1-based indexes;
# features a row does not mention keep @default_feature_value.
# Blank rows are skipped. Returns an Enumerator when no block is given.
def each
  return to_enum(__method__) unless block_given?

  open_data do |input|
    n_features = @libsvm_dataset_metadata.n_features
    csv = CSV.new(input, col_sep: " ")
    csv.each do |row|
      # CSV yields an empty row for a blank line; without this guard the
      # missing label would crash parse_label.
      next if row.compact.empty?

      label = parse_label(row.shift)
      features = Array.new(n_features, @default_feature_value)
      row.each do |column|
        # Trailing or doubled separators show up as nil columns.
        next if column.nil?
        index, value = column.split(":", 2)
        features[Integer(index, 10) - 1] = parse_value(value)
      end
      yield(Record.new(label, features))
    end
  end
end
64
+
65
+ private
66
# Looks up the dataset list entry whose name equals the given name.
#
# Returns the matching entry. Raises ArgumentError, listing every dataset
# name that was seen, when no entry matches.
def fetch_dataset_info(name)
  seen_names = []
  LIBSVMDatasetList.new.each do |candidate|
    return candidate if candidate.name == name
    seen_names << candidate.name
  end

  listing = seen_names.map(&:inspect).join(", ")
  raise ArgumentError,
        "unavailable LIBSVM dataset: #{name.inspect}: " \
        "available datasets: [#{listing}]"
end
81
+
82
# Selects which of the dataset's files to read.
#
# note - nil to take the first file, otherwise the note a file must carry.
#
# Returns the chosen file entry. Raises ArgumentError, listing the notes
# that are available, when no file carries the requested note.
def choose_file(note)
  files = @libsvm_dataset_metadata.files
  return files.first if note.nil?

  # Plain #each on purpose: with #find, the truthy result of
  # `available_notes << ...` ended the scan at the first file whose note
  # did not match, so later files were never considered.
  available_notes = []
  files.each do |file|
    return file if file.note == note
    available_notes << file.note if file.note
  end

  name = @libsvm_dataset_metadata.name
  message = "unavailable note: #{name}: #{note.inspect}: "
  message << "available notes: ["
  message << available_notes.collect(&:inspect).join(", ")
  message << "]"
  raise ArgumentError, message
end
99
+
100
# Downloads the chosen file into the cache directory when it is not there
# yet, then passes the block on: to extract_bz2 for ".bz2" files, to
# File.open for everything else.
def open_data(&block)
  data_path = cache_dir_path + @file.name
  download(data_path, @file.url) unless data_path.exist?

  return extract_bz2(data_path, &block) if data_path.extname == ".bz2"

  File.open(data_path, &block)
end
111
+
112
# Turns a dataset name into an id fragment: parentheses are dropped, each
# run of spaces, underscores and semicolons becomes a single "-", and the
# result is lowercased.
def normalize_name(name)
  without_parens = name.gsub(/[()]/, "")
  dashed = without_parens.gsub(/[ _;]+/, "-")
  dashed.downcase
end
115
+
116
# Parses the label column. The text is split on "," and every part goes
# through parse_value; exactly one part is returned bare, anything else
# is returned as an Array.
def parse_label(label)
  parsed = label.split(",").map { |value| parse_value(value) }
  parsed.size == 1 ? parsed.first : parsed
end
126
+
127
# Converts one numeric token into an Integer or a Float.
#
# Tokens containing "." or an exponent marker ("e"/"E", as in "1e-05")
# are parsed as Float; everything else must be a base-10 Integer.
# Raises ArgumentError when the token is not a valid number.
def parse_value(value)
  # Exponent notation has no "." but is still a float; sending it to
  # Integer would raise.
  if value =~ /[.eE]/
    Float(value)
  else
    Integer(value, 10)
  end
end
134
+ end
135
+ end
@@ -2,8 +2,6 @@ require 'zlib'
2
2
 
3
3
  require_relative "dataset"
4
4
 
5
- class SetTypeError < StandardError; end
6
-
7
5
  module Datasets
8
6
  class MNIST < Dataset
9
7
  BASE_URL = "http://yann.lecun.com/exdb/mnist/"
@@ -0,0 +1,256 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Mushroom < Dataset
7
# One row of the mushroom data: the label followed by its 22 attributes,
# in column order.
Record = Struct.new(*%i[
  label
  cap_shape
  cap_surface
  cap_color
  bruises
  odor
  gill_attachment
  gill_spacing
  gill_size
  gill_color
  stalk_shape
  stalk_root
  stalk_surface_above_ring
  stalk_surface_below_ring
  stalk_color_above_ring
  stalk_color_below_ring
  veil_type
  veil_color
  n_rings
  ring_type
  spore_print_color
  population
  habitat
])
32
+
33
# Fills in the dataset metadata. The description is supplied as a lambda
# that calls read_names.
def initialize
  super()
  @metadata.id = "mushroom"
  @metadata.name = "Mushroom"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
  @metadata.description = -> { read_names }
end
42
+
43
# Yields one Record per data row, with every raw code replaced by its
# CONVERTERS entry (nil when the code is not listed). Rows whose first
# column is nil are skipped. Returns an Enumerator when no block is given.
def each
  return to_enum(__method__) unless block_given?

  open_data do |csv|
    csv.each do |row|
      next if row[0].nil?

      record = Record.new(*row)
      # Walk a snapshot of the fields so the struct can be updated in place.
      record.to_h.each do |member, code|
        record[member] = CONVERTERS[member][code]
      end
      yield(record)
    end
  end
end
57
+
58
+ private
59
# Downloads the data file into the cache directory when it is not there
# yet, then yields a CSV reader opened on it.
def open_data
  data_path = cache_dir_path + "agaricus-lepiota.data"
  unless data_path.exist?
    download(data_path,
             "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")
  end
  CSV.open(data_path) { |csv| yield(csv) }
end
69
+
70
# Downloads the names file into the cache directory when it is not there
# yet and returns its content.
def read_names
  names_path = cache_dir_path + "agaricus-lepiota.names"
  names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
  download(names_path, names_url) unless names_path.exist?
  names_path.read
end
78
+
79
# Maps each Record member to a table translating the raw code found in
# the data file into its readable value. The outer hash and every
# per-member table are frozen so the shared lookup data cannot be
# modified by accident; the value strings themselves are left unfrozen.
CONVERTERS = {
  label: {
    "p" => "poisonous",
    "e" => "edible",
  },
  cap_shape: {
    "b" => "bell",
    "c" => "conical",
    "x" => "convex",
    "f" => "flat",
    "k" => "knobbed",
    "s" => "sunken",
  },
  cap_surface: {
    "f" => "fibrous",
    "g" => "grooves",
    "y" => "scaly",
    "s" => "smooth",
  },
  cap_color: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "r" => "green",
    "p" => "pink",
    "u" => "purple",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  bruises: {
    "t" => "bruises",
    "f" => "no",
  },
  odor: {
    "a" => "almond",
    "l" => "anise",
    "c" => "creosote",
    "y" => "fishy",
    "f" => "foul",
    "m" => "musty",
    "n" => "none",
    "p" => "pungent",
    "s" => "spicy",
  },
  gill_attachment: {
    "a" => "attached",
    "d" => "descending",
    "f" => "free",
    "n" => "notched",
  },
  gill_spacing: {
    "c" => "close",
    "w" => "crowded",
    "d" => "distant",
  },
  gill_size: {
    "b" => "broad",
    "n" => "narrow",
  },
  gill_color: {
    "k" => "black",
    "n" => "brown",
    "b" => "buff",
    "h" => "chocolate",
    "g" => "gray",
    "r" => "green",
    "o" => "orange",
    "p" => "pink",
    "u" => "purple",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  stalk_shape: {
    "e" => "enlarging",
    "t" => "tapering",
  },
  stalk_root: {
    "b" => "bulbous",
    "c" => "club",
    "u" => "cup",
    "e" => "equal",
    "z" => "rhizomorphs",
    "r" => "rooted",
    "?" => "missing",
  },
  stalk_surface_above_ring: {
    "f" => "fibrous",
    "y" => "scaly",
    "k" => "silky",
    "s" => "smooth",
  },
  stalk_surface_below_ring: {
    "f" => "fibrous",
    "y" => "scaly",
    "k" => "silky",
    "s" => "smooth",
  },
  stalk_color_above_ring: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "o" => "orange",
    "p" => "pink",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  stalk_color_below_ring: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "o" => "orange",
    "p" => "pink",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  veil_type: {
    "p" => "partial",
    "u" => "universal",
  },
  veil_color: {
    "n" => "brown",
    "o" => "orange",
    "w" => "white",
    "y" => "yellow",
  },
  n_rings: {
    "n" => 0,
    "o" => 1,
    "t" => 2,
  },
  ring_type: {
    "c" => "cobwebby",
    "e" => "evanescent",
    "f" => "flaring",
    "l" => "large",
    "n" => "none",
    "p" => "pendant",
    "s" => "sheathing",
    "z" => "zone",
  },
  spore_print_color: {
    "k" => "black",
    "n" => "brown",
    "b" => "buff",
    "h" => "chocolate",
    "r" => "green",
    "o" => "orange",
    "u" => "purple",
    "w" => "white",
    "y" => "yellow",
  },
  population: {
    "a" => "abundant",
    "c" => "clustered",
    "n" => "numerous",
    "s" => "scattered",
    "v" => "several",
    "y" => "solitary",
  },
  habitat: {
    "g" => "grasses",
    "l" => "leaves",
    "m" => "meadows",
    "p" => "paths",
    "u" => "urban",
    "w" => "waste",
    "d" => "woods",
  },
}.each_value(&:freeze).freeze
255
+ end
256
+ end
@@ -0,0 +1,146 @@
1
require "csv"

require_relative "dataset"
2
+
3
+ module Datasets
4
# Raw per-species penguin data, one dataset class per species.
module PenguinsRawData
  # One row of a raw species CSV, in column order.
  Record = Struct.new(:study_name,
                      :sample_number,
                      :species,
                      :region,
                      :island,
                      :stage,
                      :individual_id,
                      :clutch_completion,
                      :date_egg,
                      :culmen_length_mm,
                      :culmen_depth_mm,
                      :flipper_length_mm,
                      :body_mass_g,
                      :sex,
                      :delta_15_n_permil,
                      :delta_13_c_permil,
                      :comments)

  # Behaviour shared by the per-species datasets. A subclass only has to
  # define the URL constant; the species name is taken from the subclass
  # name.
  class SpeciesBase < Dataset
    # Location of the cached CSV for this species.
    attr_reader :data_path

    def initialize
      super
      # Last segment of the class name, e.g. "adelie" for
      # PenguinsRawData::Adelie.
      species = self.class.name.split("::").last.downcase
      @metadata.id = "palmerpenguins-raw-#{species}"
      @metadata.url = self.class::URL
      @metadata.licenses = ["CC0"]
      @data_path = cache_dir_path + "penguins" + (species + ".csv")
    end

    # Yields one Record per CSV row, skipping rows whose first column is
    # nil. Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          next if row[0].nil?
          record = Record.new(*row.fields)
          yield record
        end
      end
    end

    private

    # Downloads the CSV when it is not cached yet, then yields a reader
    # that treats the first row as headers and applies CSV's :all
    # converters to the fields. Relies on the csv library being loaded.
    def open_data
      download unless data_path.exist?
      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
        yield csv
      end
    end

    # Fetches this species' file from metadata.url into data_path.
    def download
      super(data_path, metadata.url)
    end
  end

  # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
  class Adelie < SpeciesBase
    DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
  end

  # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
  class Chinstrap < SpeciesBase
    DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
  end

  # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
  class Gentoo < SpeciesBase
    DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
  end
end
76
+
77
+ # Provides the same data as the palmerpenguins package: https://github.com/allisonhorst/palmerpenguins
78
+ class Penguins < Dataset
79
# One cleaned-up penguin observation, in column order.
Record = Struct.new(*%i[
  species
  island
  bill_length_mm
  bill_depth_mm
  flipper_length_mm
  body_mass_g
  sex
  year
])
87
+
88
# Fills in the metadata describing the combined palmerpenguins dataset.
def initialize
  super
  @metadata.name = "palmerpenguins"
  @metadata.id = "palmerpenguins"
  @metadata.licenses = ["CC0"]
  @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
  @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
end
96
+
97
# Yields every converted record of the Adelie, Chinstrap and Gentoo raw
# datasets, in that order. Returns an Enumerator when no block is given.
def each(&block)
  return to_enum(__method__) unless block_given?

  [
    PenguinsRawData::Adelie,
    PenguinsRawData::Chinstrap,
    PenguinsRawData::Gentoo,
  ].each do |species_dataset|
    species_dataset.new.each { |raw| yield convert_record(raw) }
  end
end
112
+
113
# Builds a Record from the cleansed fields of a raw record.
private def convert_record(raw_record)
  fields = cleanse_fields(raw_record)
  Record.new(*fields)
end
116
+
117
# Returns the values for a Record, in member order, derived from a raw
# record: the first word of the species, the island, culmen length and
# depth as given, flipper length and body mass as integers, the
# normalized sex, and the year of date_egg. nil measurements stay nil.
private def cleanse_fields(raw_record)
  [
    raw_record.species.split(' ').first,
    raw_record.island,
    raw_record.culmen_length_mm,
    raw_record.culmen_depth_mm,
    raw_record.flipper_length_mm&.to_i,
    raw_record.body_mass_g&.to_i,
    normalize_sex(raw_record.sex),
    raw_record.date_egg&.year,
  ]
end
135
+
136
# Lowercases the value and returns it only when it is "female" or "male";
# nil and every other value give nil.
private def normalize_sex(val)
  lowered = val&.downcase
  lowered if %w[female male].include?(lowered)
end
145
+ end
146
+ end