red-datasets 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +33 -0
- data/lib/datasets.rb +1 -0
- data/lib/datasets/dataset.rb +12 -0
- data/lib/datasets/libsvm-dataset-list.rb +194 -54
- data/lib/datasets/libsvm.rb +1 -9
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/test/test-mushroom.rb +80 -0
- data/test/test-table.rb +123 -18
- metadata +16 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81ed53e83d75d517052aaf07c66fe177f12f986584141c951ac1dcfa2fc88646
|
4
|
+
data.tar.gz: 94b9f3b8042eaad65304bf7c3d2fc35519f8328b0ca4e9f8a7ad9be13781a91e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c73561ed005e4b58f27fc6de969605a22d57adf4bc5b5184e5cdb65739f1ac6b86f6ed67794bfe61164859fc4a1b0f80430bc819b2ea37ac455a560a6f008b13
|
7
|
+
data.tar.gz: 07560b09d68272dc7a959c16ec03975d1fa752f9d6930f0fd746c46e9236995606694f5899bad5bf770812c5a2d81e6f013353f680fc8adf65ad42bae514f57c
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,38 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.0.9 - 2019-09-09
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::LIBSVMDatasetList`: Improved performance.
|
8
|
+
|
9
|
+
* `Datasets::Mushroom`: Added.
|
10
|
+
[GitHub#33][Patch by Yasuo Honda]
|
11
|
+
|
12
|
+
* `Datasets::Table#n_columns`: Added.
|
13
|
+
|
14
|
+
* `Datasets::Table#n_rows`: Added.
|
15
|
+
|
16
|
+
* `Datasets::Table#[]`: Added support for index access.
|
17
|
+
|
18
|
+
* `Datasets::Table#column_names`: Added.
|
19
|
+
|
20
|
+
* `Datasets::Table#size`: Added.
|
21
|
+
|
22
|
+
* `Datasets::Table#length`: Added.
|
23
|
+
|
24
|
+
* `Datasets::Table#each_column`: Added.
|
25
|
+
|
26
|
+
* `Datasets::Table#each_record`: Added.
|
27
|
+
|
28
|
+
* `Datasets::Table#find_record`: Added.
|
29
|
+
|
30
|
+
### Thanks
|
31
|
+
|
32
|
+
* Yasuo Honda
|
33
|
+
|
34
|
+
### Improvements
|
35
|
+
|
3
36
|
## 0.0.8 - 2019-03-24
|
4
37
|
|
5
38
|
### Improvements
|
data/lib/datasets.rb
CHANGED
@@ -7,6 +7,7 @@ require_relative "datasets/iris"
|
|
7
7
|
require_relative "datasets/libsvm"
|
8
8
|
require_relative "datasets/libsvm-dataset-list"
|
9
9
|
require_relative "datasets/mnist"
|
10
|
+
require_relative "datasets/mushroom"
|
10
11
|
require_relative "datasets/penn-treebank"
|
11
12
|
require_relative "datasets/postal-code-japan"
|
12
13
|
require_relative "datasets/wikipedia"
|
data/lib/datasets/dataset.rb
CHANGED
@@ -34,5 +34,17 @@ module Datasets
|
|
34
34
|
downloader = Downloader.new(url)
|
35
35
|
downloader.download(output_path)
|
36
36
|
end
|
37
|
+
|
38
|
+
def extract_bz2(path)
|
39
|
+
input, output = IO.pipe
|
40
|
+
pid = spawn("bzcat", path.to_s, {:out => output})
|
41
|
+
begin
|
42
|
+
output.close
|
43
|
+
yield(input)
|
44
|
+
ensure
|
45
|
+
input.close
|
46
|
+
Process.waitpid(pid)
|
47
|
+
end
|
48
|
+
end
|
37
49
|
end
|
38
50
|
end
|
@@ -1,5 +1,6 @@
|
|
1
|
-
require "
|
2
|
-
require "rexml/
|
1
|
+
require "rexml/streamlistener"
|
2
|
+
require "rexml/parsers/baseparser"
|
3
|
+
require "rexml/parsers/streamparser"
|
3
4
|
|
4
5
|
require_relative "dataset"
|
5
6
|
|
@@ -32,26 +33,17 @@ module Datasets
|
|
32
33
|
end
|
33
34
|
end
|
34
35
|
|
35
|
-
def each
|
36
|
+
def each(&block)
|
36
37
|
return to_enum(__method__) unless block_given?
|
37
38
|
|
38
39
|
open_data do |input|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
if is_header
|
44
|
-
is_header = false
|
45
|
-
next
|
40
|
+
catch do |abort_tag|
|
41
|
+
listener = IndexListener.new(abort_tag) do |href, record|
|
42
|
+
parse_detail(href, record)
|
43
|
+
yield(record)
|
46
44
|
end
|
47
|
-
|
48
|
-
|
49
|
-
href = a.attributes["href"]
|
50
|
-
record = Record.new
|
51
|
-
record.name = a.text
|
52
|
-
record.files = []
|
53
|
-
parse_detail(href, record)
|
54
|
-
yield(record)
|
45
|
+
parser = REXML::Parsers::StreamParser.new(input, listener)
|
46
|
+
parser.parse
|
55
47
|
end
|
56
48
|
end
|
57
49
|
end
|
@@ -69,17 +61,11 @@ module Datasets
|
|
69
61
|
|
70
62
|
def extract_description
|
71
63
|
open_data do |input|
|
72
|
-
document = REXML::Document.new(input)
|
73
64
|
description = []
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
next
|
79
|
-
end
|
80
|
-
break if element.name == "hr"
|
81
|
-
content = extract_text(element)
|
82
|
-
description << content unless content.empty?
|
65
|
+
catch do |abort_tag|
|
66
|
+
listener = DescriptionListener.new(abort_tag, description)
|
67
|
+
parser = REXML::Parsers::StreamParser.new(input, listener)
|
68
|
+
parser.parse
|
83
69
|
end
|
84
70
|
description.join("\n\n")
|
85
71
|
end
|
@@ -102,36 +88,190 @@ module Datasets
|
|
102
88
|
|
103
89
|
def parse_detail(href, record)
|
104
90
|
path, id = href.split("#")
|
105
|
-
open_detail(path) do |
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
91
|
+
open_detail(path) do |input|
|
92
|
+
catch do |abort_tag|
|
93
|
+
listener = DetailListener.new(abort_tag, id, @metadata.url, record)
|
94
|
+
parser = REXML::Parsers::StreamParser.new(input, listener)
|
95
|
+
parser.parse
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
class IndexListener
|
101
|
+
include REXML::StreamListener
|
102
|
+
|
103
|
+
def initialize(abort_tag, &block)
|
104
|
+
@abort_tag = abort_tag
|
105
|
+
@block = block
|
106
|
+
@row = nil
|
107
|
+
@in_td = false
|
108
|
+
end
|
109
|
+
|
110
|
+
def tag_start(name, attributes)
|
111
|
+
case name
|
112
|
+
when "tr"
|
113
|
+
@row = []
|
114
|
+
when "td"
|
115
|
+
@in_td = true
|
116
|
+
@row << {:text => ""}
|
117
|
+
when "a"
|
118
|
+
@row.last[:href] = attributes["href"] if @in_td
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
def tag_end(name)
|
123
|
+
case name
|
124
|
+
when "table"
|
125
|
+
throw(@abort_tag)
|
126
|
+
when "tr"
|
127
|
+
name_column = @row[0]
|
128
|
+
return unless name_column
|
129
|
+
record = Record.new
|
130
|
+
record.name = name_column[:text]
|
131
|
+
record.files = []
|
132
|
+
@block.call(name_column[:href], record)
|
133
|
+
when "td"
|
134
|
+
@in_td = false
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
def text(data)
|
139
|
+
@row.last[:text] << data if @in_td
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
class DetailListener
|
144
|
+
include REXML::StreamListener
|
145
|
+
|
146
|
+
def initialize(abort_tag, id, base_url, record)
|
147
|
+
@abort_tag = abort_tag
|
148
|
+
@id = id
|
149
|
+
@base_url = base_url
|
150
|
+
@record = record
|
151
|
+
@in_target = false
|
152
|
+
@target_li_level = nil
|
153
|
+
@key = nil
|
154
|
+
@data = nil
|
155
|
+
@file = nil
|
156
|
+
end
|
157
|
+
|
158
|
+
def tag_start(name, attributes)
|
159
|
+
if @in_target
|
160
|
+
case name
|
161
|
+
when "li"
|
162
|
+
@target_li_level += 1
|
163
|
+
case @target_li_level
|
164
|
+
when 0
|
165
|
+
@key = nil
|
166
|
+
@data = nil
|
167
|
+
@file = nil
|
168
|
+
when 1
|
169
|
+
@file = File.new
|
131
170
|
end
|
171
|
+
when "a"
|
172
|
+
@file.url = @base_url + attributes["href"] if @file
|
173
|
+
end
|
174
|
+
else
|
175
|
+
if attributes["name"] == @id
|
176
|
+
@in_target = true
|
177
|
+
@target_li_level = -1
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
def tag_end(name)
|
183
|
+
if @in_target
|
184
|
+
case name
|
185
|
+
when "ul"
|
186
|
+
throw(@abort_tag) if @target_li_level == -1
|
187
|
+
when "li"
|
188
|
+
case @target_li_level
|
189
|
+
when 0
|
190
|
+
if @key
|
191
|
+
data = @data
|
192
|
+
data = data.gsub(/[ \t\n]+/, " ").strip if data.is_a?(String)
|
193
|
+
@record[@key] = data
|
194
|
+
end
|
195
|
+
when 1
|
196
|
+
@data << @file if @data and @file
|
197
|
+
end
|
198
|
+
@target_li_level -= 1
|
199
|
+
end
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
203
|
+
def text(data)
|
204
|
+
case @target_li_level
|
205
|
+
when 0
|
206
|
+
if @key
|
207
|
+
@data << data
|
208
|
+
else
|
209
|
+
case data.gsub(/[ \t\n]+/, " ")
|
210
|
+
when /\ASource: /
|
211
|
+
@key = :source
|
212
|
+
@data = $POSTMATCH
|
213
|
+
when /\APreprocessing: /
|
214
|
+
@key = :preprocessing
|
215
|
+
@data = $POSTMATCH
|
216
|
+
when /\A\# of classes: (\d+)/
|
217
|
+
@key = :n_classes
|
218
|
+
@data = Integer($1, 10)
|
219
|
+
when /\A\# of data: ([\d,]+)/
|
220
|
+
@key = :n_data
|
221
|
+
@data = Integer($1.gsub(/,/, ""), 10)
|
222
|
+
when /\A\# of features: ([\d,]+)/
|
223
|
+
@key = :n_features
|
224
|
+
@data = Integer($1.gsub(/,/, ""), 10)
|
225
|
+
when /\AFiles:/
|
226
|
+
@key = :files
|
227
|
+
@data = []
|
228
|
+
end
|
229
|
+
end
|
230
|
+
when 1
|
231
|
+
if @file.name.nil?
|
232
|
+
@file.name = data
|
233
|
+
else
|
234
|
+
@file.note = data.strip.gsub(/[()]/, "")
|
132
235
|
end
|
133
236
|
end
|
134
237
|
end
|
135
238
|
end
|
239
|
+
|
240
|
+
class DescriptionListener
|
241
|
+
include REXML::StreamListener
|
242
|
+
|
243
|
+
def initialize(abort_tag, description)
|
244
|
+
@abort_tag = abort_tag
|
245
|
+
@description = description
|
246
|
+
@in_content = false
|
247
|
+
@p = nil
|
248
|
+
end
|
249
|
+
|
250
|
+
def tag_start(name, attributes)
|
251
|
+
case name
|
252
|
+
when "p"
|
253
|
+
@in_content = true
|
254
|
+
@p = []
|
255
|
+
when "br"
|
256
|
+
@description << @p.join(" ")
|
257
|
+
@p = []
|
258
|
+
when "hr"
|
259
|
+
throw(@abort_tag)
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
def tag_end(name)
|
264
|
+
case name
|
265
|
+
when "p"
|
266
|
+
@description << @p.join(" ")
|
267
|
+
end
|
268
|
+
end
|
269
|
+
|
270
|
+
def text(data)
|
271
|
+
return unless @in_content
|
272
|
+
content = data.gsub(/[ \t\n]+/, " ").strip
|
273
|
+
@p << content unless content.empty?
|
274
|
+
end
|
275
|
+
end
|
136
276
|
end
|
137
277
|
end
|
data/lib/datasets/libsvm.rb
CHANGED
@@ -103,15 +103,7 @@ module Datasets
|
|
103
103
|
download(data_path, @file.url)
|
104
104
|
end
|
105
105
|
if data_path.extname == ".bz2"
|
106
|
-
|
107
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
108
|
-
begin
|
109
|
-
output.close
|
110
|
-
yield(input)
|
111
|
-
ensure
|
112
|
-
input.close
|
113
|
-
Process.waitpid(pid)
|
114
|
-
end
|
106
|
+
extract_bz2(data_path, &block)
|
115
107
|
else
|
116
108
|
File.open(data_path, &block)
|
117
109
|
end
|
@@ -0,0 +1,256 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class Mushroom < Dataset
|
7
|
+
Record = Struct.new(
|
8
|
+
:label,
|
9
|
+
:cap_shape,
|
10
|
+
:cap_surface,
|
11
|
+
:cap_color,
|
12
|
+
:bruises,
|
13
|
+
:odor,
|
14
|
+
:gill_attachment,
|
15
|
+
:gill_spacing,
|
16
|
+
:gill_size,
|
17
|
+
:gill_color,
|
18
|
+
:stalk_shape,
|
19
|
+
:stalk_root,
|
20
|
+
:stalk_surface_above_ring,
|
21
|
+
:stalk_surface_below_ring,
|
22
|
+
:stalk_color_above_ring,
|
23
|
+
:stalk_color_below_ring,
|
24
|
+
:veil_type,
|
25
|
+
:veil_color,
|
26
|
+
:n_rings,
|
27
|
+
:ring_type,
|
28
|
+
:spore_print_color,
|
29
|
+
:population,
|
30
|
+
:habitat,
|
31
|
+
)
|
32
|
+
|
33
|
+
def initialize
|
34
|
+
super()
|
35
|
+
@metadata.id = "mushroom"
|
36
|
+
@metadata.name = "Mushroom"
|
37
|
+
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
|
38
|
+
@metadata.description = lambda do
|
39
|
+
read_names
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def each
|
44
|
+
return to_enum(__method__) unless block_given?
|
45
|
+
|
46
|
+
open_data do |csv|
|
47
|
+
csv.each do |row|
|
48
|
+
next if row[0].nil?
|
49
|
+
record = Record.new(*row)
|
50
|
+
record.members.each do |member|
|
51
|
+
record[member] = CONVERTERS[member][record[member]]
|
52
|
+
end
|
53
|
+
yield(record)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
def open_data
|
60
|
+
data_path = cache_dir_path + "agaricus-lepiota.data"
|
61
|
+
unless data_path.exist?
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
|
63
|
+
download(data_path, data_url)
|
64
|
+
end
|
65
|
+
CSV.open(data_path) do |csv|
|
66
|
+
yield(csv)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def read_names
|
71
|
+
names_path = cache_dir_path + "agaricus-lepiota.names"
|
72
|
+
unless names_path.exist?
|
73
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
|
74
|
+
download(names_path, names_url)
|
75
|
+
end
|
76
|
+
names_path.read
|
77
|
+
end
|
78
|
+
|
79
|
+
CONVERTERS = {
|
80
|
+
label: {
|
81
|
+
"p" => "poisonous",
|
82
|
+
"e" => "edible",
|
83
|
+
},
|
84
|
+
cap_shape: {
|
85
|
+
"b" => "bell",
|
86
|
+
"c" => "conical",
|
87
|
+
"x" => "convex",
|
88
|
+
"f" => "flat",
|
89
|
+
"k" => "knobbed",
|
90
|
+
"s" => "sunken",
|
91
|
+
},
|
92
|
+
cap_surface: {
|
93
|
+
"f" => "fibrous",
|
94
|
+
"g" => "grooves",
|
95
|
+
"y" => "scaly",
|
96
|
+
"s" => "smooth",
|
97
|
+
},
|
98
|
+
cap_color: {
|
99
|
+
"n" => "brown",
|
100
|
+
"b" => "buff",
|
101
|
+
"c" => "cinnamon",
|
102
|
+
"g" => "gray",
|
103
|
+
"r" => "green",
|
104
|
+
"p" => "pink",
|
105
|
+
"u" => "purple",
|
106
|
+
"e" => "red",
|
107
|
+
"w" => "white",
|
108
|
+
"y" => "yellow",
|
109
|
+
},
|
110
|
+
bruises: {
|
111
|
+
"t" => "bruises",
|
112
|
+
"f" => "no",
|
113
|
+
},
|
114
|
+
odor: {
|
115
|
+
"a" => "almond",
|
116
|
+
"l" => "anise",
|
117
|
+
"c" => "creosote",
|
118
|
+
"y" => "fishy",
|
119
|
+
"f" => "foul",
|
120
|
+
"m" => "musty",
|
121
|
+
"n" => "none",
|
122
|
+
"p" => "pungent",
|
123
|
+
"s" => "spicy",
|
124
|
+
},
|
125
|
+
gill_attachment: {
|
126
|
+
"a" => "attached",
|
127
|
+
"d" => "descending",
|
128
|
+
"f" => "free",
|
129
|
+
"n" => "notched",
|
130
|
+
},
|
131
|
+
gill_spacing: {
|
132
|
+
"c" => "close",
|
133
|
+
"w" => "crowded",
|
134
|
+
"d" => "distant",
|
135
|
+
},
|
136
|
+
gill_size: {
|
137
|
+
"b" => "broad",
|
138
|
+
"n" => "narrow",
|
139
|
+
},
|
140
|
+
gill_color: {
|
141
|
+
"k" => "black",
|
142
|
+
"n" => "brown",
|
143
|
+
"b" => "buff",
|
144
|
+
"h" => "chocolate",
|
145
|
+
"g" => "gray",
|
146
|
+
"r" => "green",
|
147
|
+
"o" => "orange",
|
148
|
+
"p" => "pink",
|
149
|
+
"u" => "purple",
|
150
|
+
"e" => "red",
|
151
|
+
"w" => "white",
|
152
|
+
"y" => "yellow",
|
153
|
+
},
|
154
|
+
stalk_shape: {
|
155
|
+
"e" => "enlarging",
|
156
|
+
"t" => "tapering",
|
157
|
+
},
|
158
|
+
stalk_root: {
|
159
|
+
"b" => "bulbous",
|
160
|
+
"c" => "club",
|
161
|
+
"u" => "cup",
|
162
|
+
"e" => "equal",
|
163
|
+
"z" => "rhizomorphs",
|
164
|
+
"r" => "rooted",
|
165
|
+
"?" => "missing",
|
166
|
+
},
|
167
|
+
stalk_surface_above_ring: {
|
168
|
+
"f" => "fibrous",
|
169
|
+
"y" => "scaly",
|
170
|
+
"k" => "silky",
|
171
|
+
"s" => "smooth",
|
172
|
+
},
|
173
|
+
stalk_surface_below_ring: {
|
174
|
+
"f" => "fibrous",
|
175
|
+
"y" => "scaly",
|
176
|
+
"k" => "silky",
|
177
|
+
"s" => "smooth",
|
178
|
+
},
|
179
|
+
stalk_color_above_ring: {
|
180
|
+
"n" => "brown",
|
181
|
+
"b" => "buff",
|
182
|
+
"c" => "cinnamon",
|
183
|
+
"g" => "gray",
|
184
|
+
"o" => "orange",
|
185
|
+
"p" => "pink",
|
186
|
+
"e" => "red",
|
187
|
+
"w" => "white",
|
188
|
+
"y" => "yellow",
|
189
|
+
},
|
190
|
+
stalk_color_below_ring: {
|
191
|
+
"n" => "brown",
|
192
|
+
"b" => "buff",
|
193
|
+
"c" => "cinnamon",
|
194
|
+
"g" => "gray",
|
195
|
+
"o" => "orange",
|
196
|
+
"p" => "pink",
|
197
|
+
"e" => "red",
|
198
|
+
"w" => "white",
|
199
|
+
"y" => "yellow",
|
200
|
+
},
|
201
|
+
veil_type: {
|
202
|
+
"p" => "partial",
|
203
|
+
"u" => "universal",
|
204
|
+
},
|
205
|
+
veil_color: {
|
206
|
+
"n" => "brown",
|
207
|
+
"o" => "orange",
|
208
|
+
"w" => "white",
|
209
|
+
"y" => "yellow",
|
210
|
+
},
|
211
|
+
n_rings: {
|
212
|
+
"n" => 0,
|
213
|
+
"o" => 1,
|
214
|
+
"t" => 2,
|
215
|
+
},
|
216
|
+
ring_type: {
|
217
|
+
"c" => "cobwebby",
|
218
|
+
"e" => "evanescent",
|
219
|
+
"f" => "flaring",
|
220
|
+
"l" => "large",
|
221
|
+
"n" => "none",
|
222
|
+
"p" => "pendant",
|
223
|
+
"s" => "sheathing",
|
224
|
+
"z" => "zone",
|
225
|
+
},
|
226
|
+
spore_print_color: {
|
227
|
+
"k" => "black",
|
228
|
+
"n" => "brown",
|
229
|
+
"b" => "buff",
|
230
|
+
"h" => "chocolate",
|
231
|
+
"r" => "green",
|
232
|
+
"o" => "orange",
|
233
|
+
"u" => "purple",
|
234
|
+
"w" => "white",
|
235
|
+
"y" => "yellow",
|
236
|
+
},
|
237
|
+
population: {
|
238
|
+
"a" => "abundant",
|
239
|
+
"c" => "clustered",
|
240
|
+
"n" => "numerous",
|
241
|
+
"s" => "scattered",
|
242
|
+
"v" => "several",
|
243
|
+
"y" => "solitary",
|
244
|
+
},
|
245
|
+
habitat: {
|
246
|
+
"g" => "grasses",
|
247
|
+
"l" => "leaves",
|
248
|
+
"m" => "meadows",
|
249
|
+
"p" => "paths",
|
250
|
+
"u" => "urban",
|
251
|
+
"w" => "waste",
|
252
|
+
"d" => "woods",
|
253
|
+
}
|
254
|
+
}
|
255
|
+
end
|
256
|
+
end
|
data/lib/datasets/table.rb
CHANGED
@@ -2,19 +2,99 @@ require "datasets/dictionary"
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class Table
|
5
|
+
class Record
|
6
|
+
include Enumerable
|
7
|
+
|
8
|
+
def initialize(table, index)
|
9
|
+
@table = table
|
10
|
+
@index = index
|
11
|
+
end
|
12
|
+
|
13
|
+
def [](column_name_or_column_index)
|
14
|
+
@table[column_name_or_column_index][@index]
|
15
|
+
end
|
16
|
+
|
17
|
+
def each
|
18
|
+
return to_enum(__method__) unless block_given?
|
19
|
+
@table.each_column.each do |column_name, column_values|
|
20
|
+
yield(column_name, column_values[@index])
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def values
|
25
|
+
@table.each_column.collect do |_column_name, column_values|
|
26
|
+
column_values[@index]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_h
|
31
|
+
hash = {}
|
32
|
+
each do |column_name, column_value|
|
33
|
+
hash[column_name] = column_value
|
34
|
+
end
|
35
|
+
hash
|
36
|
+
end
|
37
|
+
|
38
|
+
def inspect
|
39
|
+
"#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
5
43
|
include Enumerable
|
6
44
|
|
45
|
+
attr_reader :dataset
|
7
46
|
def initialize(dataset)
|
8
47
|
@dataset = dataset
|
9
48
|
@dictionaries = {}
|
10
49
|
end
|
11
50
|
|
12
|
-
def
|
51
|
+
def n_columns
|
52
|
+
columner_data.size
|
53
|
+
end
|
54
|
+
alias_method :size, :n_columns
|
55
|
+
alias_method :length, :n_columns
|
56
|
+
|
57
|
+
def n_rows
|
58
|
+
first_column = columner_data.first
|
59
|
+
return 0 if first_column.nil?
|
60
|
+
first_column[1].size
|
61
|
+
end
|
62
|
+
|
63
|
+
def column_names
|
64
|
+
columner_data.keys
|
65
|
+
end
|
66
|
+
|
67
|
+
def each_column(&block)
|
13
68
|
columner_data.each(&block)
|
14
69
|
end
|
70
|
+
alias_method :each, :each_column
|
15
71
|
|
16
|
-
def
|
17
|
-
|
72
|
+
def each_record
|
73
|
+
return to_enum(__method__) unless block_given?
|
74
|
+
n_rows.times do |i|
|
75
|
+
yield(Record.new(self, i))
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_record(row)
|
80
|
+
row += n_rows if row < 0
|
81
|
+
return nil if row < 0
|
82
|
+
return nil if row >= n_rows
|
83
|
+
Record.new(self, row)
|
84
|
+
end
|
85
|
+
|
86
|
+
def [](name_or_index)
|
87
|
+
case name_or_index
|
88
|
+
when Integer
|
89
|
+
index = name_or_index
|
90
|
+
columner_data.each_with_index do |(_name, values), i|
|
91
|
+
return values if i == index
|
92
|
+
end
|
93
|
+
nil
|
94
|
+
else
|
95
|
+
name = name_or_index
|
96
|
+
columner_data[normalize_name(name)]
|
97
|
+
end
|
18
98
|
end
|
19
99
|
|
20
100
|
def dictionary_encode(name)
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -52,7 +52,7 @@ module Datasets
|
|
52
52
|
end
|
53
53
|
|
54
54
|
private
|
55
|
-
def open_data
|
55
|
+
def open_data(&block)
|
56
56
|
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
|
57
57
|
data_path = cache_dir_path + base_name
|
58
58
|
unless data_path.exist?
|
@@ -60,15 +60,7 @@ module Datasets
|
|
60
60
|
download(data_path, data_url)
|
61
61
|
end
|
62
62
|
|
63
|
-
|
64
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
65
|
-
begin
|
66
|
-
output.close
|
67
|
-
yield(input)
|
68
|
-
ensure
|
69
|
-
input.close
|
70
|
-
Process.waitpid(pid)
|
71
|
-
end
|
63
|
+
extract_bz2(data_path, &block)
|
72
64
|
end
|
73
65
|
|
74
66
|
def type_in_path
|
@@ -0,0 +1,80 @@
|
|
1
|
+
class MushroomTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::Mushroom.new
|
4
|
+
end
|
5
|
+
|
6
|
+
def record(*args)
|
7
|
+
Datasets::Mushroom::Record.new(*args)
|
8
|
+
end
|
9
|
+
|
10
|
+
test("#each") do
|
11
|
+
records = @dataset.each.to_a
|
12
|
+
assert_equal([
|
13
|
+
8124,
|
14
|
+
{
|
15
|
+
:label => "poisonous",
|
16
|
+
:cap_shape => "convex",
|
17
|
+
:cap_surface => "smooth",
|
18
|
+
:cap_color => "brown",
|
19
|
+
:bruises => "bruises",
|
20
|
+
:odor => "pungent",
|
21
|
+
:gill_attachment => "free",
|
22
|
+
:gill_spacing => "close",
|
23
|
+
:gill_size => "narrow",
|
24
|
+
:gill_color => "black",
|
25
|
+
:stalk_shape => "enlarging",
|
26
|
+
:stalk_root => "equal",
|
27
|
+
:stalk_surface_above_ring => "smooth",
|
28
|
+
:stalk_surface_below_ring => "smooth",
|
29
|
+
:stalk_color_above_ring => "white",
|
30
|
+
:stalk_color_below_ring => "white",
|
31
|
+
:veil_type => "partial",
|
32
|
+
:veil_color => "white",
|
33
|
+
:n_rings => 1,
|
34
|
+
:ring_type => "pendant",
|
35
|
+
:spore_print_color => "black",
|
36
|
+
:population => "scattered",
|
37
|
+
:habitat => "urban"
|
38
|
+
},
|
39
|
+
{
|
40
|
+
:label => "edible",
|
41
|
+
:cap_shape => "convex",
|
42
|
+
:cap_surface => "smooth",
|
43
|
+
:cap_color => "brown",
|
44
|
+
:bruises => "no",
|
45
|
+
:odor => "none",
|
46
|
+
:gill_attachment => "attached",
|
47
|
+
:gill_spacing => "close",
|
48
|
+
:gill_size => "broad",
|
49
|
+
:gill_color => "yellow",
|
50
|
+
:stalk_shape => "enlarging",
|
51
|
+
:stalk_root => "missing",
|
52
|
+
:stalk_surface_above_ring => "smooth",
|
53
|
+
:stalk_surface_below_ring => "smooth",
|
54
|
+
:stalk_color_above_ring => "orange",
|
55
|
+
:stalk_color_below_ring => "orange",
|
56
|
+
:veil_type => "partial",
|
57
|
+
:veil_color => "orange",
|
58
|
+
:n_rings => 1,
|
59
|
+
:ring_type => "pendant",
|
60
|
+
:spore_print_color => "orange",
|
61
|
+
:population => "clustered",
|
62
|
+
:habitat => "leaves"
|
63
|
+
}
|
64
|
+
],
|
65
|
+
[
|
66
|
+
records.size,
|
67
|
+
records[0].to_h,
|
68
|
+
records[-1].to_h
|
69
|
+
])
|
70
|
+
end
|
71
|
+
|
72
|
+
sub_test_case("#metadata") do
|
73
|
+
test("#description") do
|
74
|
+
description = @dataset.metadata.description
|
75
|
+
assert do
|
76
|
+
description.start_with?("1. Title: Mushroom Database")
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
data/test/test-table.rb
CHANGED
@@ -3,9 +3,129 @@ class TableTest < Test::Unit::TestCase
|
|
3
3
|
@table = Datasets::Iris.new.to_table
|
4
4
|
end
|
5
5
|
|
6
|
-
test("#
|
7
|
-
assert_equal(
|
8
|
-
|
6
|
+
test("#n_columns") do
|
7
|
+
assert_equal(5, @table.n_columns)
|
8
|
+
end
|
9
|
+
|
10
|
+
test("#n_rows") do
|
11
|
+
assert_equal(150, @table.n_rows)
|
12
|
+
end
|
13
|
+
|
14
|
+
test("#column_names") do
|
15
|
+
assert_equal([
|
16
|
+
:sepal_length,
|
17
|
+
:sepal_width,
|
18
|
+
:petal_length,
|
19
|
+
:petal_width,
|
20
|
+
:label,
|
21
|
+
],
|
22
|
+
@table.column_names)
|
23
|
+
end
|
24
|
+
|
25
|
+
test("#each") do
|
26
|
+
shorten_hash = {}
|
27
|
+
@table.each do |name, values|
|
28
|
+
shorten_hash[name] = values.first(5)
|
29
|
+
end
|
30
|
+
assert_equal({
|
31
|
+
:label => ["Iris-setosa"] * 5,
|
32
|
+
:petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
|
33
|
+
:petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
|
34
|
+
:sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
|
35
|
+
:sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
|
36
|
+
},
|
37
|
+
shorten_hash)
|
38
|
+
end
|
39
|
+
|
40
|
+
test("#each_column") do
|
41
|
+
shorten_hash = {}
|
42
|
+
@table.each_column do |name, values|
|
43
|
+
shorten_hash[name] = values.first(5)
|
44
|
+
end
|
45
|
+
assert_equal({
|
46
|
+
:label => ["Iris-setosa"] * 5,
|
47
|
+
:petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
|
48
|
+
:petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
|
49
|
+
:sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
|
50
|
+
:sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
|
51
|
+
},
|
52
|
+
shorten_hash)
|
53
|
+
end
|
54
|
+
|
55
|
+
test("#each_record") do
|
56
|
+
records = []
|
57
|
+
@table.each_record do |record|
|
58
|
+
records << record
|
59
|
+
break if records.size == 3
|
60
|
+
end
|
61
|
+
assert_equal([
|
62
|
+
{
|
63
|
+
label: "Iris-setosa",
|
64
|
+
petal_length: 1.4,
|
65
|
+
petal_width: 0.2,
|
66
|
+
sepal_length: 5.1,
|
67
|
+
sepal_width: 3.5,
|
68
|
+
},
|
69
|
+
{
|
70
|
+
label: "Iris-setosa",
|
71
|
+
petal_length: 1.4,
|
72
|
+
petal_width: 0.2,
|
73
|
+
sepal_length: 4.9,
|
74
|
+
sepal_width: 3.0,
|
75
|
+
},
|
76
|
+
{
|
77
|
+
label: "Iris-setosa",
|
78
|
+
petal_length: 1.3,
|
79
|
+
petal_width: 0.2,
|
80
|
+
sepal_length: 4.7,
|
81
|
+
sepal_width: 3.2,
|
82
|
+
},
|
83
|
+
],
|
84
|
+
records.collect(&:to_h))
|
85
|
+
end
|
86
|
+
|
87
|
+
sub_test_case("#find_record") do
|
88
|
+
test("positive") do
|
89
|
+
assert_equal({
|
90
|
+
label: "Iris-setosa",
|
91
|
+
petal_length: 1.4,
|
92
|
+
petal_width: 0.2,
|
93
|
+
sepal_length: 4.9,
|
94
|
+
sepal_width: 3.0,
|
95
|
+
},
|
96
|
+
@table.find_record(1).to_h)
|
97
|
+
end
|
98
|
+
|
99
|
+
test("positive - over") do
|
100
|
+
assert_nil(@table.find_record(151))
|
101
|
+
end
|
102
|
+
|
103
|
+
test("negative") do
|
104
|
+
assert_equal({
|
105
|
+
label: "Iris-virginica",
|
106
|
+
petal_length: 5.1,
|
107
|
+
petal_width: 1.8,
|
108
|
+
sepal_length: 5.9,
|
109
|
+
sepal_width: 3.0,
|
110
|
+
},
|
111
|
+
@table.find_record(-1).to_h)
|
112
|
+
end
|
113
|
+
|
114
|
+
test("negative - over") do
|
115
|
+
assert_nil(@table.find_record(-151))
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
sub_test_case("#[]") do
|
120
|
+
test("index") do
|
121
|
+
assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
|
122
|
+
@table[2].first(5))
|
123
|
+
end
|
124
|
+
|
125
|
+
test("name") do
|
126
|
+
assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
|
127
|
+
@table[:petal_length].first(5))
|
128
|
+
end
|
9
129
|
end
|
10
130
|
|
11
131
|
test("#dictionary_encode") do
|
@@ -58,21 +178,6 @@ class TableTest < Test::Unit::TestCase
|
|
58
178
|
end
|
59
179
|
end
|
60
180
|
|
61
|
-
test("#each") do
|
62
|
-
shorten_hash = {}
|
63
|
-
@table.each do |name, values|
|
64
|
-
shorten_hash[name] = values.first(5)
|
65
|
-
end
|
66
|
-
assert_equal({
|
67
|
-
:label => ["Iris-setosa"] * 5,
|
68
|
-
:petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
|
69
|
-
:petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
|
70
|
-
:sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
|
71
|
-
:sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
|
72
|
-
},
|
73
|
-
shorten_hash)
|
74
|
-
end
|
75
|
-
|
76
181
|
test("#to_h") do
|
77
182
|
shorten_hash = {}
|
78
183
|
@table.to_h.each do |name, values|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-09-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: csv
|
@@ -138,6 +138,7 @@ files:
|
|
138
138
|
- lib/datasets/libsvm.rb
|
139
139
|
- lib/datasets/metadata.rb
|
140
140
|
- lib/datasets/mnist.rb
|
141
|
+
- lib/datasets/mushroom.rb
|
141
142
|
- lib/datasets/penn-treebank.rb
|
142
143
|
- lib/datasets/postal-code-japan.rb
|
143
144
|
- lib/datasets/table.rb
|
@@ -155,6 +156,7 @@ files:
|
|
155
156
|
- test/test-libsvm-dataset-list.rb
|
156
157
|
- test/test-libsvm.rb
|
157
158
|
- test/test-mnist.rb
|
159
|
+
- test/test-mushroom.rb
|
158
160
|
- test/test-penn-treebank.rb
|
159
161
|
- test/test-postal-code-japan.rb
|
160
162
|
- test/test-table.rb
|
@@ -180,23 +182,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
182
|
version: '0'
|
181
183
|
requirements: []
|
182
184
|
rubyforge_project:
|
183
|
-
rubygems_version: 2.7.6
|
185
|
+
rubygems_version: 2.7.6.2
|
184
186
|
signing_key:
|
185
187
|
specification_version: 4
|
186
188
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
187
189
|
test_files:
|
188
|
-
- test/test-iris.rb
|
189
|
-
- test/test-wikipedia.rb
|
190
|
-
- test/test-fashion-mnist.rb
|
191
|
-
- test/test-wine.rb
|
192
|
-
- test/test-postal-code-japan.rb
|
193
|
-
- test/test-mnist.rb
|
194
|
-
- test/helper.rb
|
195
190
|
- test/test-adult.rb
|
196
191
|
- test/test-libsvm.rb
|
197
|
-
- test/
|
198
|
-
- test/test-table.rb
|
199
|
-
- test/test-cifar.rb
|
192
|
+
- test/test-wikipedia.rb
|
200
193
|
- test/test-libsvm-dataset-list.rb
|
194
|
+
- test/helper.rb
|
195
|
+
- test/test-iris.rb
|
196
|
+
- test/test-table.rb
|
197
|
+
- test/run-test.rb
|
198
|
+
- test/test-wine.rb
|
201
199
|
- test/test-penn-treebank.rb
|
200
|
+
- test/test-postal-code-japan.rb
|
201
|
+
- test/test-cifar.rb
|
202
|
+
- test/test-mnist.rb
|
203
|
+
- test/test-mushroom.rb
|
202
204
|
- test/test-dictionary.rb
|
205
|
+
- test/test-fashion-mnist.rb
|