red-datasets 0.0.8 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -103,15 +103,7 @@ module Datasets
103
103
  download(data_path, @file.url)
104
104
  end
105
105
  if data_path.extname == ".bz2"
106
- input, output = IO.pipe
107
- pid = spawn("bzcat", data_path.to_s, {:out => output})
108
- begin
109
- output.close
110
- yield(input)
111
- ensure
112
- input.close
113
- Process.waitpid(pid)
114
- end
106
+ extract_bz2(data_path, &block)
115
107
  else
116
108
  File.open(data_path, &block)
117
109
  end
@@ -2,8 +2,6 @@ require 'zlib'
2
2
 
3
3
  require_relative "dataset"
4
4
 
5
- class SetTypeError < StandardError; end
6
-
7
5
  module Datasets
8
6
  class MNIST < Dataset
9
7
  BASE_URL = "http://yann.lecun.com/exdb/mnist/"
@@ -67,7 +65,9 @@ module Datasets
67
65
  n_bytes = n_uint32s * 4
68
66
  mnist_magic_number = 2051
69
67
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
70
- raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
68
+ if magic != mnist_magic_number
69
+ raise Error, "This is not #{dataset_name} image file"
70
+ end
71
71
  n_images.times do |i|
72
72
  data = f.read(n_rows * n_cols)
73
73
  label = labels[i]
@@ -101,7 +101,9 @@ module Datasets
101
101
  n_bytes = n_uint32s * 2
102
102
  mnist_magic_number = 2049
103
103
  magic, n_labels = f.read(n_bytes).unpack('N2')
104
- raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
+ if magic != mnist_magic_number
105
+ raise Error, "This is not #{dataset_name} label file"
106
+ end
105
107
  f.read(n_labels).unpack('C*')
106
108
  end
107
109
  end
@@ -0,0 +1,256 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Mushroom < Dataset
7
+ Record = Struct.new(
8
+ :label,
9
+ :cap_shape,
10
+ :cap_surface,
11
+ :cap_color,
12
+ :bruises,
13
+ :odor,
14
+ :gill_attachment,
15
+ :gill_spacing,
16
+ :gill_size,
17
+ :gill_color,
18
+ :stalk_shape,
19
+ :stalk_root,
20
+ :stalk_surface_above_ring,
21
+ :stalk_surface_below_ring,
22
+ :stalk_color_above_ring,
23
+ :stalk_color_below_ring,
24
+ :veil_type,
25
+ :veil_color,
26
+ :n_rings,
27
+ :ring_type,
28
+ :spore_print_color,
29
+ :population,
30
+ :habitat,
31
+ )
32
+
33
+ def initialize
34
+ super()
35
+ @metadata.id = "mushroom"
36
+ @metadata.name = "Mushroom"
37
+ @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
38
+ @metadata.description = lambda do
39
+ read_names
40
+ end
41
+ end
42
+
43
+ def each
44
+ return to_enum(__method__) unless block_given?
45
+
46
+ open_data do |csv|
47
+ csv.each do |row|
48
+ next if row[0].nil?
49
+ record = Record.new(*row)
50
+ record.members.each do |member|
51
+ record[member] = CONVERTERS[member][record[member]]
52
+ end
53
+ yield(record)
54
+ end
55
+ end
56
+ end
57
+
58
+ private
59
+ def open_data
60
+ data_path = cache_dir_path + "agaricus-lepiota.data"
61
+ unless data_path.exist?
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
+ download(data_path, data_url)
64
+ end
65
+ CSV.open(data_path) do |csv|
66
+ yield(csv)
67
+ end
68
+ end
69
+
70
+ def read_names
71
+ names_path = cache_dir_path + "agaricus-lepiota.names"
72
+ unless names_path.exist?
73
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
74
+ download(names_path, names_url)
75
+ end
76
+ names_path.read
77
+ end
78
+
79
+ CONVERTERS = {
80
+ label: {
81
+ "p" => "poisonous",
82
+ "e" => "edible",
83
+ },
84
+ cap_shape: {
85
+ "b" => "bell",
86
+ "c" => "conical",
87
+ "x" => "convex",
88
+ "f" => "flat",
89
+ "k" => "knobbed",
90
+ "s" => "sunken",
91
+ },
92
+ cap_surface: {
93
+ "f" => "fibrous",
94
+ "g" => "grooves",
95
+ "y" => "scaly",
96
+ "s" => "smooth",
97
+ },
98
+ cap_color: {
99
+ "n" => "brown",
100
+ "b" => "buff",
101
+ "c" => "cinnamon",
102
+ "g" => "gray",
103
+ "r" => "green",
104
+ "p" => "pink",
105
+ "u" => "purple",
106
+ "e" => "red",
107
+ "w" => "white",
108
+ "y" => "yellow",
109
+ },
110
+ bruises: {
111
+ "t" => "bruises",
112
+ "f" => "no",
113
+ },
114
+ odor: {
115
+ "a" => "almond",
116
+ "l" => "anise",
117
+ "c" => "creosote",
118
+ "y" => "fishy",
119
+ "f" => "foul",
120
+ "m" => "musty",
121
+ "n" => "none",
122
+ "p" => "pungent",
123
+ "s" => "spicy",
124
+ },
125
+ gill_attachment: {
126
+ "a" => "attached",
127
+ "d" => "descending",
128
+ "f" => "free",
129
+ "n" => "notched",
130
+ },
131
+ gill_spacing: {
132
+ "c" => "close",
133
+ "w" => "crowded",
134
+ "d" => "distant",
135
+ },
136
+ gill_size: {
137
+ "b" => "broad",
138
+ "n" => "narrow",
139
+ },
140
+ gill_color: {
141
+ "k" => "black",
142
+ "n" => "brown",
143
+ "b" => "buff",
144
+ "h" => "chocolate",
145
+ "g" => "gray",
146
+ "r" => "green",
147
+ "o" => "orange",
148
+ "p" => "pink",
149
+ "u" => "purple",
150
+ "e" => "red",
151
+ "w" => "white",
152
+ "y" => "yellow",
153
+ },
154
+ stalk_shape: {
155
+ "e" => "enlarging",
156
+ "t" => "tapering",
157
+ },
158
+ stalk_root: {
159
+ "b" => "bulbous",
160
+ "c" => "club",
161
+ "u" => "cup",
162
+ "e" => "equal",
163
+ "z" => "rhizomorphs",
164
+ "r" => "rooted",
165
+ "?" => "missing",
166
+ },
167
+ stalk_surface_above_ring: {
168
+ "f" => "fibrous",
169
+ "y" => "scaly",
170
+ "k" => "silky",
171
+ "s" => "smooth",
172
+ },
173
+ stalk_surface_below_ring: {
174
+ "f" => "fibrous",
175
+ "y" => "scaly",
176
+ "k" => "silky",
177
+ "s" => "smooth",
178
+ },
179
+ stalk_color_above_ring: {
180
+ "n" => "brown",
181
+ "b" => "buff",
182
+ "c" => "cinnamon",
183
+ "g" => "gray",
184
+ "o" => "orange",
185
+ "p" => "pink",
186
+ "e" => "red",
187
+ "w" => "white",
188
+ "y" => "yellow",
189
+ },
190
+ stalk_color_below_ring: {
191
+ "n" => "brown",
192
+ "b" => "buff",
193
+ "c" => "cinnamon",
194
+ "g" => "gray",
195
+ "o" => "orange",
196
+ "p" => "pink",
197
+ "e" => "red",
198
+ "w" => "white",
199
+ "y" => "yellow",
200
+ },
201
+ veil_type: {
202
+ "p" => "partial",
203
+ "u" => "universal",
204
+ },
205
+ veil_color: {
206
+ "n" => "brown",
207
+ "o" => "orange",
208
+ "w" => "white",
209
+ "y" => "yellow",
210
+ },
211
+ n_rings: {
212
+ "n" => 0,
213
+ "o" => 1,
214
+ "t" => 2,
215
+ },
216
+ ring_type: {
217
+ "c" => "cobwebby",
218
+ "e" => "evanescent",
219
+ "f" => "flaring",
220
+ "l" => "large",
221
+ "n" => "none",
222
+ "p" => "pendant",
223
+ "s" => "sheathing",
224
+ "z" => "zone",
225
+ },
226
+ spore_print_color: {
227
+ "k" => "black",
228
+ "n" => "brown",
229
+ "b" => "buff",
230
+ "h" => "chocolate",
231
+ "r" => "green",
232
+ "o" => "orange",
233
+ "u" => "purple",
234
+ "w" => "white",
235
+ "y" => "yellow",
236
+ },
237
+ population: {
238
+ "a" => "abundant",
239
+ "c" => "clustered",
240
+ "n" => "numerous",
241
+ "s" => "scattered",
242
+ "v" => "several",
243
+ "y" => "solitary",
244
+ },
245
+ habitat: {
246
+ "g" => "grasses",
247
+ "l" => "leaves",
248
+ "m" => "meadows",
249
+ "p" => "paths",
250
+ "u" => "urban",
251
+ "w" => "waste",
252
+ "d" => "woods",
253
+ }
254
+ }
255
+ end
256
+ end
@@ -0,0 +1,146 @@
1
require "csv"

require_relative "dataset"
2
+
3
+ module Datasets
4
+ module PenguinsRawData
5
+ Record = Struct.new(:study_name,
6
+ :sample_number,
7
+ :species,
8
+ :region,
9
+ :island,
10
+ :stage,
11
+ :individual_id,
12
+ :clutch_completion,
13
+ :date_egg,
14
+ :culmen_length_mm,
15
+ :culmen_depth_mm,
16
+ :flipper_length_mm,
17
+ :body_mass_g,
18
+ :sex,
19
+ :delta_15_n_permil,
20
+ :delta_13_c_permil,
21
+ :comments)
22
+ class SpeciesBase < Dataset
23
+ def initialize
24
+ super
25
+ species = self.class.name.split("::").last.downcase
26
+ @metadata.id = "palmerpenguins-raw-#{species}"
27
+ @metadata.url = self.class::URL
28
+ @metadata.licenses = ["CC0"]
29
+ @data_path = cache_dir_path + "penguins" + (species + ".csv")
30
+ end
31
+
32
+ attr_reader :data_path
33
+
34
+ def each
35
+ return to_enum(__method__) unless block_given?
36
+
37
+ open_data do |csv|
38
+ csv.each do |row|
39
+ next if row[0].nil?
40
+ record = Record.new(*row.fields)
41
+ yield record
42
+ end
43
+ end
44
+ end
45
+
46
+ private def open_data
47
+ download unless data_path.exist?
48
+ CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
49
+ yield csv
50
+ end
51
+ end
52
+
53
+ private def download
54
+ super(data_path, metadata.url)
55
+ end
56
+ end
57
+
58
+ # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
59
+ class Adelie < SpeciesBase
60
+ DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
61
+ URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
62
+ end
63
+
64
+ # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
65
+ class Chinstrap < SpeciesBase
66
+ DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
67
+ URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
68
+ end
69
+
70
+ # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
71
+ class Gentoo < SpeciesBase
72
+ DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
73
+ URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
74
+ end
75
+ end
76
+
77
+ # This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
78
+ class Penguins < Dataset
79
+ Record = Struct.new(:species,
80
+ :island,
81
+ :bill_length_mm,
82
+ :bill_depth_mm,
83
+ :flipper_length_mm,
84
+ :body_mass_g,
85
+ :sex,
86
+ :year)
87
+
88
+ def initialize
89
+ super
90
+ @metadata.id = "palmerpenguins"
91
+ @metadata.name = "palmerpenguins"
92
+ @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
93
+ @metadata.licenses = ["CC0"]
94
+ @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
95
+ end
96
+
97
+ def each(&block)
98
+ return to_enum(__method__) unless block_given?
99
+
100
+ species_classes = [
101
+ PenguinsRawData::Adelie,
102
+ PenguinsRawData::Chinstrap,
103
+ PenguinsRawData::Gentoo,
104
+ ]
105
+
106
+ species_classes.each do |species_class|
107
+ species_class.new.each do |raw_record|
108
+ yield convert_record(raw_record)
109
+ end
110
+ end
111
+ end
112
+
113
+ private def convert_record(raw_record)
114
+ Record.new(*cleanse_fields(raw_record))
115
+ end
116
+
117
+ private def cleanse_fields(raw_record)
118
+ species = raw_record.species.split(' ')[0]
119
+ flipper_length_mm = raw_record.flipper_length_mm&.to_i
120
+ body_mass_g = raw_record.body_mass_g&.to_i
121
+ sex = normalize_sex(raw_record.sex)
122
+ year = raw_record.date_egg&.year
123
+
124
+ [
125
+ species,
126
+ raw_record.island,
127
+ raw_record.culmen_length_mm,
128
+ raw_record.culmen_depth_mm,
129
+ flipper_length_mm,
130
+ body_mass_g,
131
+ sex,
132
+ year
133
+ ]
134
+ end
135
+
136
+ private def normalize_sex(val)
137
+ val = val&.downcase
138
+ case val
139
+ when "female", "male", nil
140
+ val
141
+ else
142
+ nil
143
+ end
144
+ end
145
+ end
146
+ end