red-datasets 0.0.7 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/doc/text/news.md +102 -0
- data/lib/datasets.rb +19 -9
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +277 -0
- data/lib/datasets/libsvm.rb +135 -0
- data/lib/datasets/mnist.rb +0 -2
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar_gz_readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +4 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-postal-code-japan.rb +69 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-table.rb +123 -18
- metadata +88 -11
@@ -0,0 +1,135 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class LIBSVM < Dataset
|
7
|
+
# One LIBSVM sample: a label together with its list of feature values.
class Record
  # label: the value given as +label+ at construction.
  # features: the object given as +features+ at construction.
  attr_reader :label, :features

  def initialize(label, features)
    @label = label
    @features = features
  end

  # Returns the feature stored at +index+ in the feature list.
  def [](index)
    @features[index]
  end

  # Returns a Hash with the label under :label followed by every feature
  # keyed by its zero-based position.
  def to_h
    @features.each_with_index.each_with_object({label: @label}) do |(feature, position), row|
      row[position] = feature
    end
  end

  # Returns a new Array: the label first, then all features in order.
  def values
    [@label].concat(@features)
  end
end
|
33
|
+
|
34
|
+
# Prepares a LIBSVM dataset identified by +name+.
#
# name: dataset name; resolved through fetch_dataset_info.
# note: optional note handed to choose_file to select one of the
#   dataset's files (nil selects the first file).
# default_feature_value: value stored for every feature a row does not
#   list explicitly.
#
# Raises ArgumentError (from fetch_dataset_info / choose_file) when the
# name or the note is not available.
def initialize(name, note: nil, default_feature_value: 0)
  super()
  # Resolve the dataset first so an unknown name fails before anything else.
  @libsvm_dataset_metadata = fetch_dataset_info(name)
  @file = choose_file(note)
  @default_feature_value = default_feature_value

  metadata = @metadata
  metadata.id = "libsvm-#{normalize_name(name)}"
  metadata.name = "LIBSVM dataset: #{name}"
  metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
end
|
45
|
+
|
46
|
+
# Yields one Record per row of the selected data file.
# Returns an Enumerator when no block is given.
#
# Each row is split on single spaces: the first column is the label and
# every remaining non-nil column is an "index:value" pair whose index is
# one-based. Features not listed in a row keep @default_feature_value.
def each
  return to_enum(__method__) unless block_given?

  open_data do |input|
    feature_count = @libsvm_dataset_metadata.n_features
    CSV.new(input, col_sep: " ").each do |columns|
      label = parse_label(columns.shift)
      features = [@default_feature_value] * feature_count
      # nil columns carry no pair, so drop them before parsing.
      columns.compact.each do |pair|
        position, raw_value = pair.split(":", 2)
        features[Integer(position, 10) - 1] = parse_value(raw_value)
      end
      yield(Record.new(label, features))
    end
  end
end
|
64
|
+
|
65
|
+
private
|
66
|
+
# Returns the LIBSVMDatasetList record whose name equals +name+.
#
# Raises ArgumentError listing every dataset name seen when there is no
# match.
def fetch_dataset_info(name)
  seen_names = []
  LIBSVMDatasetList.new.each do |record|
    return record if record.name == name
    seen_names << record.name
  end

  listing = seen_names.collect(&:inspect).join(", ")
  raise ArgumentError,
        "unavailable LIBSVM dataset: #{name.inspect}: " \
        "available datasets: [#{listing}]"
end
|
81
|
+
|
82
|
+
# Picks one of the dataset's files.
#
# note: nil selects the first file; otherwise the first file whose note
#   equals +note+ is returned.
#
# Raises ArgumentError listing the notes seen (nil notes are left out)
# when no file carries the requested note.
def choose_file(note)
  candidates = @libsvm_dataset_metadata.files
  return candidates.first if note.nil?

  known_notes = []
  candidates.each do |candidate|
    return candidate if candidate.note == note
    known_notes << candidate.note if candidate.note
  end

  dataset_name = @libsvm_dataset_metadata.name
  listing = known_notes.collect(&:inspect).join(", ")
  raise ArgumentError,
        "unavailable note: #{dataset_name}: #{note.inspect}: " \
        "available notes: [#{listing}]"
end
|
99
|
+
|
100
|
+
# Opens the selected data file and passes the opened input to +block+.
#
# The file is downloaded into the cache directory when it is not there
# yet. Paths ending in ".bz2" go through extract_bz2; anything else is
# opened directly with File.open.
def open_data(&block)
  data_path = cache_dir_path + @file.name
  download(data_path, @file.url) unless data_path.exist?

  return extract_bz2(data_path, &block) if data_path.extname == ".bz2"

  File.open(data_path, &block)
end
|
111
|
+
|
112
|
+
# Turns a dataset name into an identifier fragment: parentheses are
# removed, each run of spaces, underscores and semicolons becomes a
# single "-", and the result is lower-cased.
def normalize_name(name)
  without_parens = name.delete("()")
  without_parens.gsub(/[ _;]+/, "-").downcase
end
|
115
|
+
|
116
|
+
# Parses a label column. The column is split on "," and every piece is
# converted with parse_value. Returns the single parsed value when there
# is exactly one piece, otherwise the Array of parsed values.
def parse_label(label)
  parsed = label.split(",").map { |piece| parse_value(piece) }
  parsed.size == 1 ? parsed[0] : parsed
end
|
126
|
+
|
127
|
+
# Converts a numeric string into a number.
#
# Strings containing a decimal point or an exponent marker ("e"/"E") are
# parsed with Float; everything else is parsed as a base-10 Integer.
# Exponent notation without a decimal point (e.g. "1e-05") previously
# reached Integer() and raised; it is now parsed as a Float.
#
# Raises ArgumentError when +value+ is not a valid number.
def parse_value(value)
  if value.match?(/[.eE]/)
    Float(value)
  else
    Integer(value, 10)
  end
end
|
134
|
+
end
|
135
|
+
end
|
data/lib/datasets/mnist.rb
CHANGED
@@ -0,0 +1,256 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class Mushroom < Dataset
|
7
|
+
# One row of the mushroom data file. Members are listed in the order the
# columns are passed to Record.new: the label first, then the attributes.
Record = Struct.new(*%i[
  label
  cap_shape
  cap_surface
  cap_color
  bruises
  odor
  gill_attachment
  gill_spacing
  gill_size
  gill_color
  stalk_shape
  stalk_root
  stalk_surface_above_ring
  stalk_surface_below_ring
  stalk_color_above_ring
  stalk_color_below_ring
  veil_type
  veil_color
  n_rings
  ring_type
  spore_print_color
  population
  habitat
])
|
32
|
+
|
33
|
+
# Sets the dataset metadata. The description is stored as a lambda that
# calls read_names, so the names file is only read when the lambda is
# invoked.
def initialize
  super()
  metadata = @metadata
  metadata.id = "mushroom"
  metadata.name = "Mushroom"
  metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
  metadata.description = -> { read_names }
end
|
42
|
+
|
43
|
+
# Yields one Record per data row, with every code replaced by the value
# CONVERTERS maps it to. Rows whose first column is nil are skipped.
# Returns an Enumerator when no block is given.
def each
  return to_enum(__method__) unless block_given?

  open_data do |csv|
    csv.each do |codes|
      next if codes[0].nil?

      record = Record.new(*codes)
      # Iterate over a snapshot of the raw codes while rewriting the record.
      record.to_h.each do |field, code|
        record[field] = CONVERTERS[field][code]
      end
      yield(record)
    end
  end
end
|
57
|
+
|
58
|
+
private
|
59
|
+
# Yields a CSV reader opened on the cached data file, downloading the
# file into the cache directory first when it is missing.
#
# The download uses HTTPS, consistent with the other URLs of this class,
# so the data is not fetched over a plain-text connection.
def open_data
  data_path = cache_dir_path + "agaricus-lepiota.data"
  unless data_path.exist?
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
    download(data_path, data_url)
  end
  CSV.open(data_path) do |csv|
    yield(csv)
  end
end
|
69
|
+
|
70
|
+
# Returns the content of the cached names file, downloading it into the
# cache directory first when it is missing.
#
# The URL path uses single slashes throughout, matching the data file URL
# for the same directory (the stray "//" before "mushroom" was removed).
def read_names
  names_path = cache_dir_path + "agaricus-lepiota.names"
  unless names_path.exist?
    names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
    download(names_path, names_url)
  end
  names_path.read
end
|
78
|
+
|
79
|
+
# Lookup tables keyed by Record member. Each table maps the code found in
# a data column to the value stored in the record; a code missing from
# its table yields nil. The outer Hash and every table are frozen so the
# shared constant cannot be modified by accident.
CONVERTERS = {
  label: {
    "p" => "poisonous",
    "e" => "edible",
  },
  cap_shape: {
    "b" => "bell",
    "c" => "conical",
    "x" => "convex",
    "f" => "flat",
    "k" => "knobbed",
    "s" => "sunken",
  },
  cap_surface: {
    "f" => "fibrous",
    "g" => "grooves",
    "y" => "scaly",
    "s" => "smooth",
  },
  cap_color: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "r" => "green",
    "p" => "pink",
    "u" => "purple",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  bruises: {
    "t" => "bruises",
    "f" => "no",
  },
  odor: {
    "a" => "almond",
    "l" => "anise",
    "c" => "creosote",
    "y" => "fishy",
    "f" => "foul",
    "m" => "musty",
    "n" => "none",
    "p" => "pungent",
    "s" => "spicy",
  },
  gill_attachment: {
    "a" => "attached",
    "d" => "descending",
    "f" => "free",
    "n" => "notched",
  },
  gill_spacing: {
    "c" => "close",
    "w" => "crowded",
    "d" => "distant",
  },
  gill_size: {
    "b" => "broad",
    "n" => "narrow",
  },
  gill_color: {
    "k" => "black",
    "n" => "brown",
    "b" => "buff",
    "h" => "chocolate",
    "g" => "gray",
    "r" => "green",
    "o" => "orange",
    "p" => "pink",
    "u" => "purple",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  stalk_shape: {
    "e" => "enlarging",
    "t" => "tapering",
  },
  stalk_root: {
    "b" => "bulbous",
    "c" => "club",
    "u" => "cup",
    "e" => "equal",
    "z" => "rhizomorphs",
    "r" => "rooted",
    "?" => "missing",
  },
  stalk_surface_above_ring: {
    "f" => "fibrous",
    "y" => "scaly",
    "k" => "silky",
    "s" => "smooth",
  },
  stalk_surface_below_ring: {
    "f" => "fibrous",
    "y" => "scaly",
    "k" => "silky",
    "s" => "smooth",
  },
  stalk_color_above_ring: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "o" => "orange",
    "p" => "pink",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  stalk_color_below_ring: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "o" => "orange",
    "p" => "pink",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  veil_type: {
    "p" => "partial",
    "u" => "universal",
  },
  veil_color: {
    "n" => "brown",
    "o" => "orange",
    "w" => "white",
    "y" => "yellow",
  },
  n_rings: {
    "n" => 0,
    "o" => 1,
    "t" => 2,
  },
  ring_type: {
    "c" => "cobwebby",
    "e" => "evanescent",
    "f" => "flaring",
    "l" => "large",
    "n" => "none",
    "p" => "pendant",
    "s" => "sheathing",
    "z" => "zone",
  },
  spore_print_color: {
    "k" => "black",
    "n" => "brown",
    "b" => "buff",
    "h" => "chocolate",
    "r" => "green",
    "o" => "orange",
    "u" => "purple",
    "w" => "white",
    "y" => "yellow",
  },
  population: {
    "a" => "abundant",
    "c" => "clustered",
    "n" => "numerous",
    "s" => "scattered",
    "v" => "several",
    "y" => "solitary",
  },
  habitat: {
    "g" => "grasses",
    "l" => "leaves",
    "m" => "meadows",
    "p" => "paths",
    "u" => "urban",
    "w" => "waste",
    "d" => "woods",
  },
}.each_value(&:freeze).freeze
|
255
|
+
end
|
256
|
+
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
module PenguinsRawData
|
5
|
+
# One row of a raw per-species CSV file. Members are listed in the order
# the row's fields are passed to Record.new.
Record = Struct.new(*%i[
  study_name
  sample_number
  species
  region
  island
  stage
  individual_id
  clutch_completion
  date_egg
  culmen_length_mm
  culmen_depth_mm
  flipper_length_mm
  body_mass_g
  sex
  delta_15_n_permil
  delta_13_c_permil
  comments
])
|
22
|
+
# Shared behaviour for the per-species raw datasets. A subclass only has
# to define a URL constant; the species name is taken from the last
# segment of the subclass name, lower-cased.
class SpeciesBase < Dataset
  # Location of the cached CSV file for this species.
  attr_reader :data_path

  def initialize
    super
    species = self.class.name.split("::")[-1].downcase
    @metadata.id = "palmerpenguins-raw-#{species}"
    @metadata.url = self.class::URL
    @metadata.licenses = ["CC0"]
    @data_path = cache_dir_path + "penguins" + "#{species}.csv"
  end

  # Yields one Record per CSV row, skipping rows whose first field is
  # nil. Returns an Enumerator when no block is given.
  def each
    return to_enum(__method__) unless block_given?

    open_data do |csv|
      csv.each do |row|
        next if row[0].nil?

        yield Record.new(*row.fields)
      end
    end
  end

  private

  # Yields a CSV reader on the cached file (first row is the header, all
  # built-in converters enabled), downloading the file first if needed.
  def open_data
    download unless data_path.exist?
    CSV.open(data_path, headers: :first_row, converters: :all) { |csv| yield csv }
  end

  # Downloads this species' file from the metadata URL into data_path.
  def download
    super(data_path, metadata.url)
  end
end
|
57
|
+
|
58
|
+
# Raw Adelie penguin data.
# Source: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
class Adelie < SpeciesBase
  # DOI of the data package (without URL scheme).
  DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
  # Download location; SpeciesBase reads it through self.class::URL.
  URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
end

# Raw Chinstrap penguin data.
# Source: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
class Chinstrap < SpeciesBase
  # DOI of the data package (without URL scheme).
  DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
  # Download location; SpeciesBase reads it through self.class::URL.
  URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
end

# Raw Gentoo penguin data.
# Source: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
class Gentoo < SpeciesBase
  # DOI of the data package (without URL scheme).
  DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
  # Download location; SpeciesBase reads it through self.class::URL.
  URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
end
|
75
|
+
end
|
76
|
+
|
77
|
+
# This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
|
78
|
+
class Penguins < Dataset
|
79
|
+
# One cleansed penguin observation, in the field order produced by
# cleanse_fields.
Record = Struct.new(*%i[
  species
  island
  bill_length_mm
  bill_depth_mm
  flipper_length_mm
  body_mass_g
  sex
  year
])
|
87
|
+
|
88
|
+
# Fills in the static metadata of the combined penguins dataset.
def initialize
  super
  metadata = @metadata
  metadata.id = "palmerpenguins"
  metadata.name = "palmerpenguins"
  metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
  metadata.licenses = ["CC0"]
  metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
end
|
96
|
+
|
97
|
+
# Yields one converted Record for every raw record of the Adelie,
# Chinstrap and Gentoo datasets, in that order.
# Returns an Enumerator when no block is given.
def each(&block)
  return to_enum(__method__) unless block_given?

  raw_datasets = [
    PenguinsRawData::Adelie,
    PenguinsRawData::Chinstrap,
    PenguinsRawData::Gentoo,
  ]
  raw_datasets.each do |raw_dataset|
    raw_dataset.new.each { |raw_record| yield convert_record(raw_record) }
  end
end
|
112
|
+
|
113
|
+
# Builds a Record from the cleansed fields of +raw_record+.
private def convert_record(raw_record)
  fields = cleanse_fields(raw_record)
  Record.new(*fields)
end
|
116
|
+
|
117
|
+
# Returns the eight values for a Record, derived from +raw_record+:
# the first whitespace-separated word of the species, the island, the
# culmen length and depth unchanged, flipper length and body mass
# converted with to_i (nil stays nil), the normalized sex, and the year
# of date_egg (nil stays nil).
private def cleanse_fields(raw_record)
  [
    raw_record.species.split(' ').first,
    raw_record.island,
    raw_record.culmen_length_mm,
    raw_record.culmen_depth_mm,
    raw_record.flipper_length_mm&.to_i,
    raw_record.body_mass_g&.to_i,
    normalize_sex(raw_record.sex),
    raw_record.date_egg&.year,
  ]
end
|
135
|
+
|
136
|
+
# Lower-cases +val+ and returns it when it is "female" or "male".
# Returns nil for nil and for any other value.
private def normalize_sex(val)
  lowered = val&.downcase
  %w[female male].include?(lowered) ? lowered : nil
end
|
145
|
+
end
|
146
|
+
end
|