red-datasets 0.0.7 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/doc/text/news.md +102 -0
- data/lib/datasets.rb +19 -9
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +277 -0
- data/lib/datasets/libsvm.rb +135 -0
- data/lib/datasets/mnist.rb +0 -2
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar_gz_readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +4 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-postal-code-japan.rb +69 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-table.rb +123 -18
- metadata +88 -11
@@ -0,0 +1,135 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class LIBSVM < Dataset
|
7
|
+
# One LIBSVM sample: a label together with its list of feature values.
class Record
  attr_reader :label, :features

  # label    - the parsed label of the sample.
  # features - the list of feature values of the sample.
  def initialize(label, features)
    @label = label
    @features = features
  end

  # Returns the feature stored at position +index+.
  def [](index)
    features[index]
  end

  # Returns a Hash holding the label under :label followed by every
  # feature keyed by its position in the features list.
  def to_h
    seed = {label: label}
    features.each_with_index.each_with_object(seed) do |(feature, position), row|
      row[position] = feature
    end
  end

  # Returns a new Array: the label first, then all features.
  def values
    [label].concat(features)
  end
end
|
33
|
+
|
34
|
+
# Prepares the LIBSVM dataset called +name+.
#
# name                  - dataset name, resolved with fetch_dataset_info.
# note                  - optional file note handed to choose_file to
#                         select one of the dataset's files.
# default_feature_value - value stored in @default_feature_value and used
#                         by #each for features a row does not list.
#
# Any error raised by fetch_dataset_info or choose_file propagates.
def initialize(name, note: nil, default_feature_value: 0)
  super()
  # The dataset entry must be resolved first: choose_file reads it.
  @libsvm_dataset_metadata = fetch_dataset_info(name)
  @file = choose_file(note)
  @default_feature_value = default_feature_value

  normalized_name = normalize_name(name)
  @metadata.id = "libsvm-#{normalized_name}"
  @metadata.name = "LIBSVM dataset: #{name}"
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
end
|
45
|
+
|
46
|
+
# Yields one Record per row of the data file.
#
# Every row is split on single spaces. The first field is the label; each
# remaining non-nil field is an "index:value" pair. The index is shifted
# down by one to get the position in the features list; positions a row
# does not mention keep @default_feature_value.
#
# Returns an Enumerator when no block is given.
def each
  return to_enum(__method__) unless block_given?

  open_data do |input|
    feature_count = @libsvm_dataset_metadata.n_features
    CSV.new(input, col_sep: " ").each do |fields|
      label = parse_label(fields.shift)
      features = [@default_feature_value] * feature_count
      # nil fields are skipped, exactly like the pairs they would hold.
      fields.compact.each do |pair|
        position, raw_value = pair.split(":", 2)
        features[Integer(position, 10) - 1] = parse_value(raw_value)
      end
      yield(Record.new(label, features))
    end
  end
end
|
64
|
+
|
65
|
+
private
|
66
|
+
# Looks up the entry of LIBSVMDatasetList whose name equals +name+.
#
# Returns the first matching entry.
# Raises ArgumentError listing every entry name when nothing matches.
def fetch_dataset_info(name)
  known_names = []
  LIBSVMDatasetList.new.each do |candidate|
    return candidate if candidate.name == name

    known_names << candidate.name
  end

  message = [
    "unavailable LIBSVM dataset: #{name.inspect}: ",
    "available datasets: [",
    known_names.map(&:inspect).join(", "),
    "]",
  ].join
  raise ArgumentError, message
end
|
81
|
+
|
82
|
+
# Picks the data file of the current dataset entry that carries +note+.
#
# note - the note to look for; nil selects the first file.
#
# Returns the first file whose note equals +note+.
# Raises ArgumentError listing the non-nil notes when no file matches.
def choose_file(note)
  files = @libsvm_dataset_metadata.files
  return files.first if note.nil?

  available_notes = []
  # Plain iteration on purpose: inside `find`, the truthy result of `<<`
  # would end the search at the first non-matching file that has a note,
  # so later files would never be examined.
  files.each do |file|
    return file if file.note == note
    available_notes << file.note if file.note
  end

  name = @libsvm_dataset_metadata.name
  message = "unavailable note: #{name}: #{note.inspect}: "
  message << "available notes: ["
  message << available_notes.collect(&:inspect).join(", ")
  message << "]"
  raise ArgumentError, message
end
|
99
|
+
|
100
|
+
# Opens the selected data file and passes the opened input to the block.
#
# The file lives under cache_dir_path, named after @file.name; it is
# downloaded from @file.url when it does not exist yet. Files with a
# ".bz2" extension go through extract_bz2, all others through File.open.
def open_data(&block)
  local_path = cache_dir_path + @file.name
  download(local_path, @file.url) unless local_path.exist?

  case local_path.extname
  when ".bz2"
    extract_bz2(local_path, &block)
  else
    File.open(local_path, &block)
  end
end
|
111
|
+
|
112
|
+
# Turns a dataset name into an identifier fragment: parentheses are
# dropped, each run of spaces, underscores and semicolons becomes a single
# "-", and the result is lower-cased.
def normalize_name(name)
  without_parentheses = name.delete("()")
  without_parentheses.gsub(/[ _;]+/, "-").downcase
end
|
115
|
+
|
116
|
+
# Parses the label field of a row.
#
# The field is split on "," and every part goes through parse_value.
# Returns the single parsed value when there is exactly one part,
# otherwise the Array of parsed values.
def parse_label(label)
  parsed = label.split(",").map { |part| parse_value(part) }
  parsed.size == 1 ? parsed[0] : parsed
end
|
126
|
+
|
127
|
+
# Converts one textual number into a Float or an Integer.
#
# A value containing "." or an exponent marker ("e"/"E") is parsed with
# Float, so exponent-only notation such as "1e-05" is accepted instead of
# being rejected by the base-10 Integer parser. Everything else is parsed
# as a base-10 Integer.
#
# Raises ArgumentError when the text is not a valid number of that kind.
def parse_value(value)
  if value.match?(/[.eE]/)
    Float(value)
  else
    Integer(value, 10)
  end
end
|
134
|
+
end
|
135
|
+
end
|
data/lib/datasets/mnist.rb
CHANGED
@@ -0,0 +1,256 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class Mushroom < Dataset
|
7
|
+
# One row of the mushroom data: the label followed by its 22 attributes,
# in the order the fields are handed to Record.new.
Record = Struct.new(
  *%i[
    label
    cap_shape
    cap_surface
    cap_color
    bruises
    odor
    gill_attachment
    gill_spacing
    gill_size
    gill_color
    stalk_shape
    stalk_root
    stalk_surface_above_ring
    stalk_surface_below_ring
    stalk_color_above_ring
    stalk_color_below_ring
    veil_type
    veil_color
    n_rings
    ring_type
    spore_print_color
    population
    habitat
  ]
)
|
32
|
+
|
33
|
+
# Fills in the dataset metadata. The description is stored as a lambda
# that calls read_names, so read_names only runs when the lambda is called.
def initialize
  super()
  @metadata.id = "mushroom"
  @metadata.name = "Mushroom"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
  @metadata.description = -> { read_names }
end
|
42
|
+
|
43
|
+
# Yields one Record per data row, with every field replaced by the value
# its CONVERTERS table holds for the field's code (nil when the table has
# no entry for that code). Rows whose first field is nil are skipped.
#
# Returns an Enumerator when no block is given.
def each
  return to_enum(__method__) unless block_given?

  open_data do |csv|
    csv.each do |codes|
      next if codes[0].nil?

      record = Record.new(*codes)
      # Iterate over a snapshot of the raw codes while rewriting the record.
      record.to_h.each do |attribute, code|
        record[attribute] = CONVERTERS[attribute][code]
      end
      yield(record)
    end
  end
end
|
57
|
+
|
58
|
+
private
|
59
|
+
# Opens the cached "agaricus-lepiota.data" file as CSV and yields the CSV
# object. The file is downloaded first when it is not in the cache
# directory yet.
def open_data
  data_path = cache_dir_path + "agaricus-lepiota.data"
  unless data_path.exist?
    # Fetched over HTTPS, like the other archive.ics.uci.edu URLs used by
    # this dataset, instead of plain HTTP.
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
    download(data_path, data_url)
  end
  CSV.open(data_path) do |csv|
    yield(csv)
  end
end
|
69
|
+
|
70
|
+
# Returns the content of the cached "agaricus-lepiota.names" file,
# downloading the file first when it is not in the cache directory yet.
def read_names
  names_path = cache_dir_path + "agaricus-lepiota.names"
  unless names_path.exist?
    # Single slash before "mushroom": same directory layout as the URL of
    # the data file.
    names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
    download(names_path, names_url)
  end
  names_path.read
end
|
78
|
+
|
79
|
+
# Lookup tables used to decode a raw row: for every Record member, a Hash
# from the single-character code found in the data file to the decoded
# value (a String, or an Integer for :n_rings).
#
# The outer Hash and every per-member table are frozen so the shared
# tables cannot be modified by accident. The String values are left
# unfrozen.
CONVERTERS = {
  label: {
    "p" => "poisonous",
    "e" => "edible",
  },
  cap_shape: {
    "b" => "bell",
    "c" => "conical",
    "x" => "convex",
    "f" => "flat",
    "k" => "knobbed",
    "s" => "sunken",
  },
  cap_surface: {
    "f" => "fibrous",
    "g" => "grooves",
    "y" => "scaly",
    "s" => "smooth",
  },
  cap_color: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "r" => "green",
    "p" => "pink",
    "u" => "purple",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  bruises: {
    "t" => "bruises",
    "f" => "no",
  },
  odor: {
    "a" => "almond",
    "l" => "anise",
    "c" => "creosote",
    "y" => "fishy",
    "f" => "foul",
    "m" => "musty",
    "n" => "none",
    "p" => "pungent",
    "s" => "spicy",
  },
  gill_attachment: {
    "a" => "attached",
    "d" => "descending",
    "f" => "free",
    "n" => "notched",
  },
  gill_spacing: {
    "c" => "close",
    "w" => "crowded",
    "d" => "distant",
  },
  gill_size: {
    "b" => "broad",
    "n" => "narrow",
  },
  gill_color: {
    "k" => "black",
    "n" => "brown",
    "b" => "buff",
    "h" => "chocolate",
    "g" => "gray",
    "r" => "green",
    "o" => "orange",
    "p" => "pink",
    "u" => "purple",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  stalk_shape: {
    "e" => "enlarging",
    "t" => "tapering",
  },
  stalk_root: {
    "b" => "bulbous",
    "c" => "club",
    "u" => "cup",
    "e" => "equal",
    "z" => "rhizomorphs",
    "r" => "rooted",
    "?" => "missing",
  },
  stalk_surface_above_ring: {
    "f" => "fibrous",
    "y" => "scaly",
    "k" => "silky",
    "s" => "smooth",
  },
  stalk_surface_below_ring: {
    "f" => "fibrous",
    "y" => "scaly",
    "k" => "silky",
    "s" => "smooth",
  },
  stalk_color_above_ring: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "o" => "orange",
    "p" => "pink",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  stalk_color_below_ring: {
    "n" => "brown",
    "b" => "buff",
    "c" => "cinnamon",
    "g" => "gray",
    "o" => "orange",
    "p" => "pink",
    "e" => "red",
    "w" => "white",
    "y" => "yellow",
  },
  veil_type: {
    "p" => "partial",
    "u" => "universal",
  },
  veil_color: {
    "n" => "brown",
    "o" => "orange",
    "w" => "white",
    "y" => "yellow",
  },
  n_rings: {
    "n" => 0,
    "o" => 1,
    "t" => 2,
  },
  ring_type: {
    "c" => "cobwebby",
    "e" => "evanescent",
    "f" => "flaring",
    "l" => "large",
    "n" => "none",
    "p" => "pendant",
    "s" => "sheathing",
    "z" => "zone",
  },
  spore_print_color: {
    "k" => "black",
    "n" => "brown",
    "b" => "buff",
    "h" => "chocolate",
    "r" => "green",
    "o" => "orange",
    "u" => "purple",
    "w" => "white",
    "y" => "yellow",
  },
  population: {
    "a" => "abundant",
    "c" => "clustered",
    "n" => "numerous",
    "s" => "scattered",
    "v" => "several",
    "y" => "solitary",
  },
  habitat: {
    "g" => "grasses",
    "l" => "leaves",
    "m" => "meadows",
    "p" => "paths",
    "u" => "urban",
    "w" => "waste",
    "d" => "woods",
  }
}.each_value(&:freeze).freeze
|
255
|
+
end
|
256
|
+
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
module PenguinsRawData
|
5
|
+
# One row of a raw per-species CSV file; members are listed in the order
# the row's fields are handed to Record.new.
Record = Struct.new(
  *%i[
    study_name
    sample_number
    species
    region
    island
    stage
    individual_id
    clutch_completion
    date_egg
    culmen_length_mm
    culmen_depth_mm
    flipper_length_mm
    body_mass_g
    sex
    delta_15_n_permil
    delta_13_c_permil
    comments
  ]
)
|
22
|
+
class SpeciesBase < Dataset
|
23
|
+
# Derives the species from the last segment of the class name
# (lower-cased) and uses it for the metadata id and the cached CSV file
# name. The download URL comes from the class's URL constant.
def initialize
  super
  species = self.class.name.split("::")[-1].downcase
  @metadata.id = "palmerpenguins-raw-#{species}"
  @metadata.url = self.class::URL
  @metadata.licenses = ["CC0"]
  # Kept after the metadata assignments, as in the original statement order.
  @data_path = cache_dir_path + "penguins" + "#{species}.csv"
end
|
31
|
+
|
32
|
+
attr_reader :data_path
|
33
|
+
|
34
|
+
# Yields one Record per CSV row, built from the row's fields. Rows whose
# first field is nil are skipped.
#
# Returns an Enumerator when no block is given.
def each
  return to_enum(__method__) unless block_given?

  open_data do |csv|
    csv.each do |row|
      next if row[0].nil?

      yield Record.new(*row.fields)
    end
  end
end
|
45
|
+
|
46
|
+
# Opens the species CSV file (first row as headers, all built-in CSV
# converters enabled) and yields the CSV object. The file is downloaded
# first when data_path does not exist.
# NOTE(review): the only require visible for this file is
# require_relative "dataset"; presumably CSV is loaded from there - confirm.
private def open_data
  download unless data_path.exist?

  csv_options = {headers: :first_row, converters: :all}
  CSV.open(data_path, **csv_options) { |csv| yield csv }
end
|
52
|
+
|
53
|
+
# Argument-less download: forwards to the inherited download with this
# dataset's data_path as the destination and metadata.url as the source.
private def download
  destination = data_path
  source_url = metadata.url
  super(destination, source_url)
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# Raw Adelie penguin records.
# Source: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
class Adelie < SpeciesBase
  # Location the raw CSV is downloaded from (read through self.class::URL).
  URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
  # DOI of the source data package, without the URL scheme.
  DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
end
|
63
|
+
|
64
|
+
# Raw Chinstrap penguin records.
# Source: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
class Chinstrap < SpeciesBase
  # Location the raw CSV is downloaded from (read through self.class::URL).
  URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
  # DOI of the source data package, without the URL scheme.
  DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
end
|
69
|
+
|
70
|
+
# Raw Gentoo penguin records.
# Source: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
class Gentoo < SpeciesBase
  # Location the raw CSV is downloaded from (read through self.class::URL).
  URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
  # DOI of the source data package, without the URL scheme.
  DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
end
|
75
|
+
end
|
76
|
+
|
77
|
+
# This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
|
78
|
+
class Penguins < Dataset
|
79
|
+
# One cleaned-up penguin observation; members are in the order
# cleanse_fields returns its values.
Record = Struct.new(
  *%i[
    species
    island
    bill_length_mm
    bill_depth_mm
    flipper_length_mm
    body_mass_g
    sex
    year
  ]
)
|
87
|
+
|
88
|
+
# Fills in the static metadata (id, name, url, licenses, description) of
# the combined penguins dataset.
def initialize
  super
  @metadata.tap do |meta|
    meta.id = "palmerpenguins"
    meta.name = "palmerpenguins"
    meta.url = "https://allisonhorst.github.io/palmerpenguins/"
    meta.licenses = ["CC0"]
    meta.description = "A great dataset for data exploration & visualization, as an alternative to iris"
  end
end
|
96
|
+
|
97
|
+
# Walks the three raw species datasets in the order Adelie, Chinstrap,
# Gentoo and yields every raw record after passing it through
# convert_record.
#
# Returns an Enumerator when no block is given.
def each(&block)
  return to_enum(__method__) unless block_given?

  [
    PenguinsRawData::Adelie,
    PenguinsRawData::Chinstrap,
    PenguinsRawData::Gentoo,
  ].each do |species_class|
    raw_dataset = species_class.new
    raw_dataset.each { |raw_record| yield convert_record(raw_record) }
  end
end
|
112
|
+
|
113
|
+
# Builds a Record from the values cleanse_fields returns for +raw_record+.
private def convert_record(raw_record)
  cleaned = cleanse_fields(raw_record)
  Record.new(*cleaned)
end
|
116
|
+
|
117
|
+
# Reduces a raw record to the eight values of a Penguins::Record, in
# member order:
#   species           - first whitespace-separated word of the raw species
#   island            - unchanged
#   bill length/depth - the raw culmen length/depth, unchanged
#   flipper length    - converted with to_i unless nil
#   body mass         - converted with to_i unless nil
#   sex               - passed through normalize_sex
#   year              - year of the egg date, nil when the date is nil
private def cleanse_fields(raw_record)
  [
    raw_record.species.split(' ').first,
    raw_record.island,
    raw_record.culmen_length_mm,
    raw_record.culmen_depth_mm,
    raw_record.flipper_length_mm&.to_i,
    raw_record.body_mass_g&.to_i,
    normalize_sex(raw_record.sex),
    raw_record.date_egg&.year,
  ]
end
|
135
|
+
|
136
|
+
# Lower-cases +val+ and returns it only when the result is "female" or
# "male"; nil and every other value give nil.
private def normalize_sex(val)
  sex = val&.downcase
  %w[female male].include?(sex) ? sex : nil
end
|
145
|
+
end
|
146
|
+
end
|