red-datasets 0.0.8 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/doc/text/news.md +93 -0
- data/lib/datasets.rb +9 -0
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +194 -54
- data/lib/datasets/libsvm.rb +1 -9
- data/lib/datasets/mnist.rb +6 -4
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar-gz-readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- data/test/test-table.rb +123 -18
- metadata +61 -15
data/lib/datasets/libsvm.rb
CHANGED
@@ -103,15 +103,7 @@ module Datasets
|
|
103
103
|
download(data_path, @file.url)
|
104
104
|
end
|
105
105
|
if data_path.extname == ".bz2"
|
106
|
-
|
107
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
108
|
-
begin
|
109
|
-
output.close
|
110
|
-
yield(input)
|
111
|
-
ensure
|
112
|
-
input.close
|
113
|
-
Process.waitpid(pid)
|
114
|
-
end
|
106
|
+
extract_bz2(data_path, &block)
|
115
107
|
else
|
116
108
|
File.open(data_path, &block)
|
117
109
|
end
|
data/lib/datasets/mnist.rb
CHANGED
@@ -2,8 +2,6 @@ require 'zlib'
|
|
2
2
|
|
3
3
|
require_relative "dataset"
|
4
4
|
|
5
|
-
class SetTypeError < StandardError; end
|
6
|
-
|
7
5
|
module Datasets
|
8
6
|
class MNIST < Dataset
|
9
7
|
BASE_URL = "http://yann.lecun.com/exdb/mnist/"
|
@@ -67,7 +65,9 @@ module Datasets
|
|
67
65
|
n_bytes = n_uint32s * 4
|
68
66
|
mnist_magic_number = 2051
|
69
67
|
magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
|
70
|
-
|
68
|
+
if magic != mnist_magic_number
|
69
|
+
raise Error, "This is not #{dataset_name} image file"
|
70
|
+
end
|
71
71
|
n_images.times do |i|
|
72
72
|
data = f.read(n_rows * n_cols)
|
73
73
|
label = labels[i]
|
@@ -101,7 +101,9 @@ module Datasets
|
|
101
101
|
n_bytes = n_uint32s * 2
|
102
102
|
mnist_magic_number = 2049
|
103
103
|
magic, n_labels = f.read(n_bytes).unpack('N2')
|
104
|
-
|
104
|
+
if magic != mnist_magic_number
|
105
|
+
raise Error, "This is not #{dataset_name} label file"
|
106
|
+
end
|
105
107
|
f.read(n_labels).unpack('C*')
|
106
108
|
end
|
107
109
|
end
|
@@ -0,0 +1,256 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
# UCI "Mushroom" dataset (agaricus-lepiota).
#
# The raw data file stores every attribute as a single-letter code.
# #each expands those codes through CONVERTERS, so yielded records carry
# readable values (for example "p" becomes "poisonous" for :label).
class Mushroom < Dataset
  # One row of agaricus-lepiota.data, members in file column order.
  Record = Struct.new(
    :label,
    :cap_shape,
    :cap_surface,
    :cap_color,
    :bruises,
    :odor,
    :gill_attachment,
    :gill_spacing,
    :gill_size,
    :gill_color,
    :stalk_shape,
    :stalk_root,
    :stalk_surface_above_ring,
    :stalk_surface_below_ring,
    :stalk_color_above_ring,
    :stalk_color_below_ring,
    :veil_type,
    :veil_color,
    :n_rings,
    :ring_type,
    :spore_print_color,
    :population,
    :habitat,
  )

  # Fills in the dataset metadata. The description is assigned as a
  # lambda that calls #read_names, so the names file is not touched here.
  def initialize
    super()
    @metadata.id = "mushroom"
    @metadata.name = "Mushroom"
    @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
    @metadata.description = lambda do
      read_names
    end
  end

  # Yields one Record per data row with every code replaced by its
  # CONVERTERS value. A code that is missing from the table becomes nil.
  #
  # Returns an Enumerator when no block is given.
  def each
    return to_enum(__method__) unless block_given?

    open_data do |csv|
      csv.each do |row|
        # Skip rows whose first field is empty (e.g. a blank line).
        next if row[0].nil?
        record = Record.new(*row)
        record.members.each do |member|
          record[member] = CONVERTERS[member][record[member]]
        end
        yield(record)
      end
    end
  end

  private
  # Downloads the data file into the cache directory if it is not there
  # yet, then yields a CSV reader opened on it.
  def open_data
    data_path = cache_dir_path + "agaricus-lepiota.data"
    unless data_path.exist?
      # Use HTTPS like the metadata and names URLs on the same host.
      data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
      download(data_path, data_url)
    end
    CSV.open(data_path) do |csv|
      yield(csv)
    end
  end

  # Downloads the names file into the cache directory if needed and
  # returns its content.
  def read_names
    names_path = cache_dir_path + "agaricus-lepiota.names"
    unless names_path.exist?
      names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
      download(names_path, names_url)
    end
    names_path.read
  end

  # Per-column lookup tables: raw single-letter code => readable value.
  # Keys match the Record members. Frozen (outer hash and every table)
  # because they are shared, read-only lookup data.
  CONVERTERS = {
    label: {
      "p" => "poisonous",
      "e" => "edible",
    },
    cap_shape: {
      "b" => "bell",
      "c" => "conical",
      "x" => "convex",
      "f" => "flat",
      "k" => "knobbed",
      "s" => "sunken",
    },
    cap_surface: {
      "f" => "fibrous",
      "g" => "grooves",
      "y" => "scaly",
      "s" => "smooth",
    },
    cap_color: {
      "n" => "brown",
      "b" => "buff",
      "c" => "cinnamon",
      "g" => "gray",
      "r" => "green",
      "p" => "pink",
      "u" => "purple",
      "e" => "red",
      "w" => "white",
      "y" => "yellow",
    },
    bruises: {
      "t" => "bruises",
      "f" => "no",
    },
    odor: {
      "a" => "almond",
      "l" => "anise",
      "c" => "creosote",
      "y" => "fishy",
      "f" => "foul",
      "m" => "musty",
      "n" => "none",
      "p" => "pungent",
      "s" => "spicy",
    },
    gill_attachment: {
      "a" => "attached",
      "d" => "descending",
      "f" => "free",
      "n" => "notched",
    },
    gill_spacing: {
      "c" => "close",
      "w" => "crowded",
      "d" => "distant",
    },
    gill_size: {
      "b" => "broad",
      "n" => "narrow",
    },
    gill_color: {
      "k" => "black",
      "n" => "brown",
      "b" => "buff",
      "h" => "chocolate",
      "g" => "gray",
      "r" => "green",
      "o" => "orange",
      "p" => "pink",
      "u" => "purple",
      "e" => "red",
      "w" => "white",
      "y" => "yellow",
    },
    stalk_shape: {
      "e" => "enlarging",
      "t" => "tapering",
    },
    stalk_root: {
      "b" => "bulbous",
      "c" => "club",
      "u" => "cup",
      "e" => "equal",
      "z" => "rhizomorphs",
      "r" => "rooted",
      "?" => "missing",
    },
    stalk_surface_above_ring: {
      "f" => "fibrous",
      "y" => "scaly",
      "k" => "silky",
      "s" => "smooth",
    },
    stalk_surface_below_ring: {
      "f" => "fibrous",
      "y" => "scaly",
      "k" => "silky",
      "s" => "smooth",
    },
    stalk_color_above_ring: {
      "n" => "brown",
      "b" => "buff",
      "c" => "cinnamon",
      "g" => "gray",
      "o" => "orange",
      "p" => "pink",
      "e" => "red",
      "w" => "white",
      "y" => "yellow",
    },
    stalk_color_below_ring: {
      "n" => "brown",
      "b" => "buff",
      "c" => "cinnamon",
      "g" => "gray",
      "o" => "orange",
      "p" => "pink",
      "e" => "red",
      "w" => "white",
      "y" => "yellow",
    },
    veil_type: {
      "p" => "partial",
      "u" => "universal",
    },
    veil_color: {
      "n" => "brown",
      "o" => "orange",
      "w" => "white",
      "y" => "yellow",
    },
    n_rings: {
      "n" => 0,
      "o" => 1,
      "t" => 2,
    },
    ring_type: {
      "c" => "cobwebby",
      "e" => "evanescent",
      "f" => "flaring",
      "l" => "large",
      "n" => "none",
      "p" => "pendant",
      "s" => "sheathing",
      "z" => "zone",
    },
    spore_print_color: {
      "k" => "black",
      "n" => "brown",
      "b" => "buff",
      "h" => "chocolate",
      "r" => "green",
      "o" => "orange",
      "u" => "purple",
      "w" => "white",
      "y" => "yellow",
    },
    population: {
      "a" => "abundant",
      "c" => "clustered",
      "n" => "numerous",
      "s" => "scattered",
      "v" => "several",
      "y" => "solitary",
    },
    habitat: {
      "g" => "grasses",
      "l" => "leaves",
      "m" => "meadows",
      "p" => "paths",
      "u" => "urban",
      "w" => "waste",
      "d" => "woods",
    }
  }.each_value(&:freeze).freeze
end
|
256
|
+
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
# Raw per-species penguin tables. Each species class below reads one CSV
# file and yields its rows as Record structs.
module PenguinsRawData
  # One CSV row; members follow the order in which fields are passed in
  # from the row.
  Record = Struct.new(
    :study_name,
    :sample_number,
    :species,
    :region,
    :island,
    :stage,
    :individual_id,
    :clutch_completion,
    :date_egg,
    :culmen_length_mm,
    :culmen_depth_mm,
    :flipper_length_mm,
    :body_mass_g,
    :sex,
    :delta_15_n_permil,
    :delta_13_c_permil,
    :comments
  )

  # Shared implementation for the species datasets. A subclass only has
  # to define a URL constant; the species key is derived from its class
  # name.
  class SpeciesBase < Dataset
    # Location of the cached CSV file for this species.
    attr_reader :data_path

    def initialize
      super
      # Last segment of the class name, lowercased (e.g. "adelie").
      species_key = self.class.name.split("::").last.downcase
      @metadata.id = "palmerpenguins-raw-#{species_key}"
      @metadata.url = self.class::URL
      @metadata.licenses = ["CC0"]
      @data_path = cache_dir_path + "penguins" + "#{species_key}.csv"
    end

    # Yields a Record for every row whose first field is not nil.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      open_data do |csv|
        csv.each do |row|
          next if row[0].nil?

          yield Record.new(*row.fields)
        end
      end
    end

    private

    # Fetches the CSV on first use, then yields a reader that treats the
    # first row as headers and applies all built-in CSV converters.
    # NOTE(review): CSV is used here but this file only requires
    # "dataset" - presumably csv is loaded from there; confirm.
    def open_data
      download unless data_path.exist?
      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
        yield csv
      end
    end

    # Argument-less wrapper around the inherited download that targets
    # this species' cache path and metadata URL.
    def download
      super(data_path, metadata.url)
    end
  end

  # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
  class Adelie < SpeciesBase
    DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
  end

  # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
  class Chinstrap < SpeciesBase
    DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
  end

  # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
  class Gentoo < SpeciesBase
    DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
  end
end
|
76
|
+
|
77
|
+
# This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
|
78
|
+
# Combined penguins table built from the three PenguinsRawData species
# datasets, reduced to eight columns per record.
class Penguins < Dataset
  Record = Struct.new(
    :species,
    :island,
    :bill_length_mm,
    :bill_depth_mm,
    :flipper_length_mm,
    :body_mass_g,
    :sex,
    :year
  )

  def initialize
    super
    @metadata.id = "palmerpenguins"
    @metadata.name = "palmerpenguins"
    @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
    @metadata.licenses = ["CC0"]
    @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
  end

  # Yields one converted Record per raw record, walking the Adelie,
  # Chinstrap and Gentoo raw datasets in that order. Each raw dataset is
  # instantiated only when its turn comes.
  #
  # Returns an Enumerator when called without a block.
  def each(&block)
    return to_enum(__method__) unless block_given?

    [
      PenguinsRawData::Adelie,
      PenguinsRawData::Chinstrap,
      PenguinsRawData::Gentoo,
    ].each do |raw_class|
      raw_class.new.each do |raw|
        yield convert_record(raw)
      end
    end
  end

  private

  # Builds a Record from the cleansed fields of a raw record.
  def convert_record(raw_record)
    Record.new(*cleanse_fields(raw_record))
  end

  # Returns the eight Record fields, in member order, for a raw record:
  # - species: first whitespace-separated word of the raw species text
  # - culmen length/depth fill the bill length/depth slots unchanged
  # - flipper length and body mass are converted with to_i when present
  # - sex is normalized by #normalize_sex
  # - year is taken from date_egg when present
  def cleanse_fields(raw_record)
    [
      raw_record.species.split(" ").first,
      raw_record.island,
      raw_record.culmen_length_mm,
      raw_record.culmen_depth_mm,
      raw_record.flipper_length_mm&.to_i,
      raw_record.body_mass_g&.to_i,
      normalize_sex(raw_record.sex),
      raw_record.date_egg&.year,
    ]
  end

  # Lowercases the value and returns it only when it is "female" or
  # "male"; any other value, including nil, gives nil.
  def normalize_sex(val)
    lowered = val&.downcase
    %w[female male].include?(lowered) ? lowered : nil
  end
end
|
146
|
+
end
|