red-datasets 0.0.7 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/doc/text/news.md +102 -0
- data/lib/datasets.rb +19 -9
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +277 -0
- data/lib/datasets/libsvm.rb +135 -0
- data/lib/datasets/mnist.rb +0 -2
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar_gz_readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +4 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-postal-code-japan.rb +69 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-table.rb +123 -18
- metadata +88 -11
@@ -0,0 +1,207 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class Hepatitis < Dataset
|
7
|
+
# One row of the hepatitis data file.
#
# Every member has a custom writer that converts the raw field (a String
# when it comes from the CSV reader) into a Ruby value:
#
# * label: "1" => :die, "2" => :live
# * sex: "1" => :male, "2" => :female
# * yes/no members: "?" => nil, "1" => false, "2" => true
# * integer members: "?" => nil, otherwise Integer(value, 10)
# * float members: "?" => nil, otherwise Float(value)
#
# Values that are not covered by the label, sex and yes/no tables are
# stored unchanged.
class Record < Struct.new(:label, :age, :sex, :steroid, :antivirals,
                          :fatigue, :malaise, :anorexia, :liver_big,
                          :liver_firm, :spleen_palpable, :spiders,
                          :ascites, :varices, :bilirubin,
                          :alkaline_phosphate, :sgot, :albumin,
                          :protime, :histology)
  LABELS = {"1" => :die, "2" => :live}
  SEXES = {"1" => :male, "2" => :female}
  BOOLEANS = {"?" => nil, "1" => false, "2" => true}

  # Assigns +values+ to the members in declaration order, going through
  # the converting writers. Members without a value receive nil; surplus
  # values are ignored.
  def initialize(*values)
    super()
    members.each_with_index do |member, index|
      public_send(:"#{member}=", values[index])
    end
  end

  def label=(label)
    super(LABELS.fetch(label, label))
  end

  def sex=(sex)
    super(SEXES.fetch(sex, sex))
  end

  # Yes/no members.
  %i[
    steroid antivirals fatigue malaise anorexia liver_big liver_firm
    spleen_palpable spiders ascites varices histology
  ].each do |member|
    define_method(:"#{member}=") do |raw|
      super(normalize_boolean(raw))
    end
  end

  # Integer members.
  %i[age alkaline_phosphate sgot protime].each do |member|
    define_method(:"#{member}=") do |raw|
      super(normalize_integer(raw))
    end
  end

  # Float members.
  %i[bilirubin albumin].each do |member|
    define_method(:"#{member}=") do |raw|
      super(normalize_float(raw))
    end
  end

  private

  # "?" => nil, "1" => false, "2" => true, anything else unchanged.
  def normalize_boolean(value)
    BOOLEANS.fetch(value, value)
  end

  # "?" => nil; otherwise Float(value), which raises on invalid input.
  def normalize_float(value)
    return nil if value == "?"

    Float(value)
  end

  # "?" => nil; otherwise Integer(value, 10), which raises on invalid input.
  def normalize_integer(value)
    return nil if value == "?"

    Integer(value, 10)
  end
end
|
160
|
+
|
161
|
+
# Fills in the dataset metadata. The description is stored as a lambda
# that calls read_names, so the names file is only read when the lambda
# is invoked.
def initialize
  super()
  @metadata.id = "hepatitis"
  @metadata.name = "Hepatitis"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
  @metadata.description = -> { read_names }
end
|
170
|
+
|
171
|
+
# Yields one Record per row of the data file.
# Returns an Enumerator when no block is given.
def each
  return enum_for(:each) unless block_given?

  open_data do |csv|
    csv.each { |fields| yield(Record.new(*fields)) }
  end
end
|
181
|
+
|
182
|
+
private
|
183
|
+
# Base URL that the data and names file URLs are built from
# ("#{base_url}/hepatitis.data" and "#{base_url}/hepatitis.names").
def base_url
"https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis"
end
|
186
|
+
|
187
|
+
# Opens the cached data file as CSV and yields the CSV reader.
# When the cache file does not exist yet, download is called first with
# the cache path and "#{base_url}/hepatitis.data".
def open_data
  csv_path = cache_dir_path + "hepatitis.csv"
  download(csv_path, "#{base_url}/hepatitis.data") unless csv_path.exist?
  CSV.open(csv_path) { |csv| yield(csv) }
end
|
197
|
+
|
198
|
+
# Returns the content of the cached names file.
# When the cache file does not exist yet, download is called first with
# the cache path and "#{base_url}/hepatitis.names".
def read_names
  path = cache_dir_path + "hepatitis.names"
  download(path, "#{base_url}/hepatitis.names") unless path.exist?
  path.read
end
|
206
|
+
end
|
207
|
+
end
|
@@ -0,0 +1,277 @@
|
|
1
|
+
require "rexml/streamlistener"
|
2
|
+
require "rexml/parsers/baseparser"
|
3
|
+
require "rexml/parsers/streamparser"
|
4
|
+
|
5
|
+
require_relative "dataset"
|
6
|
+
|
7
|
+
module Datasets
|
8
|
+
class LIBSVMDatasetList < Dataset
|
9
|
+
# A downloadable file of a dataset: its name, its URL and an optional note.
# Inside this class the constant shadows ::File, which is why file I/O in
# this class is written as ::File.
File = Struct.new(:name, :url, :note)
|
12
|
+
# One dataset entry of the list.
class Record < Struct.new(:name, :source, :preprocessing, :n_classes,
                          :n_data, :n_features, :files)
  # Like Struct#to_h, but every element of :files is converted with to_h
  # as well, so the result contains no file objects.
  def to_h
    super.tap do |attributes|
      attributes[:files] = attributes[:files].collect(&:to_h)
    end
  end
end
|
25
|
+
|
26
|
+
# Fills in the dataset metadata. The description is stored as a lambda
# that calls extract_description, so the index page is only parsed for
# it when the lambda is invoked.
def initialize
  super()
  @metadata.id = "libsvm-dataset-list"
  @metadata.name = "LIBSVM dataset list"
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
  @metadata.description = -> { extract_description }
end
|
35
|
+
|
36
|
+
# Yields one record per row reported by IndexListener while stream-parsing
# the index page. Each record is completed with parse_detail before it is
# yielded. Returns an Enumerator when no block is given.
def each(&block)
  return enum_for(:each) unless block_given?

  open_data do |input|
    # The listener throws abort_tag to stop parsing early.
    catch do |abort_tag|
      index_listener = IndexListener.new(abort_tag) do |href, record|
        parse_detail(href, record)
        block.call(record)
      end
      REXML::Parsers::StreamParser.new(input, index_listener).parse
    end
  end
end
|
50
|
+
|
51
|
+
private
|
52
|
+
# Opens the cached index page and yields the open file.
# When the cache file does not exist yet, download is called first with
# the cache path and @metadata.url.
def open_data
  index_path = cache_dir_path + "index.html"
  download(index_path, @metadata.url) unless index_path.exist?
  ::File.open(index_path) { |input| yield(input) }
end
|
61
|
+
|
62
|
+
# Stream-parses the index page with DescriptionListener and returns the
# collected paragraphs joined by a blank line.
def extract_description
  open_data do |input|
    paragraphs = []
    # The listener throws abort_tag to stop parsing early.
    catch do |abort_tag|
      description_listener = DescriptionListener.new(abort_tag, paragraphs)
      REXML::Parsers::StreamParser.new(input, description_listener).parse
    end
    paragraphs.join("\n\n")
  end
end
|
73
|
+
|
74
|
+
# Returns the text of all text nodes below +element+, concatenated, with
# every run of spaces, tabs and newlines collapsed to one space and the
# result stripped.
# NOTE(review): relies on REXML::XPath, which the requires visible at the
# top of this file do not appear to load; confirm before calling.
def extract_text(element)
  REXML::XPath.match(element, ".//text()")
              .join("")
              .gsub(/[ \t\n]+/, " ")
              .strip
end
|
78
|
+
|
79
|
+
# Opens the cached copy of the detail page +detail+ (a path relative to
# @metadata.url) and yields the open file.
# When the cache file does not exist yet, download is called first with
# the cache path and @metadata.url + detail.
def open_detail(detail)
  detail_path = cache_dir_path + detail
  download(detail_path, @metadata.url + detail) unless detail_path.exist?
  ::File.open(detail_path) { |input| yield(input) }
end
|
88
|
+
|
89
|
+
# Completes +record+ from a detail page.
# +href+ is split at "#": the first part is the page handed to
# open_detail, the second part is the id handed to DetailListener, which
# fills in +record+ while the page is stream-parsed.
def parse_detail(href, record)
  page, anchor = href.split("#")
  open_detail(page) do |input|
    # The listener throws abort_tag to stop parsing early.
    catch do |abort_tag|
      detail_listener =
        DetailListener.new(abort_tag, anchor, @metadata.url, record)
      REXML::Parsers::StreamParser.new(input, detail_listener).parse
    end
  end
end
|
99
|
+
|
100
|
+
# Stream listener for the index page.
#
# Collects the <td> cells of each <tr>. At the end of a row that has at
# least one cell, the block is called with the href found in the first
# cell and a new Record whose name is the first cell's text and whose
# files list is empty. Parsing is aborted by throwing abort_tag when a
# </table> is seen.
class IndexListener
  include REXML::StreamListener

  def initialize(abort_tag, &block)
    @abort_tag = abort_tag
    @block = block
    @row = nil
    @in_td = false
  end

  def tag_start(name, attributes)
    if name == "tr"
      @row = []
    elsif name == "td"
      @in_td = true
      @row << {text: ""}
    elsif name == "a" && @in_td
      # Remember the link target of the current cell.
      @row.last[:href] = attributes["href"]
    end
  end

  def tag_end(name)
    case name
    when "table" then throw(@abort_tag)
    when "td" then @in_td = false
    when "tr" then emit_row
    end
  end

  def text(data)
    return unless @in_td

    @row.last[:text] << data
  end

  private

  # Reports the finished row; rows without any <td> are skipped.
  def emit_row
    first_cell = @row.first
    return unless first_cell

    record = Record.new
    record.name = first_cell[:text]
    record.files = []
    @block.call(first_cell[:href], record)
  end
end
|
142
|
+
|
143
|
+
# Stream listener for a detail page.
#
# Nothing is processed until an element whose "name" attribute equals
# +id+ is seen. From then on <li> nesting is tracked:
#
# * a level-0 <li> is one entry ("Source: ...", "Preprocessing: ...",
#   "# of classes: N", "# of data: N", "# of features: N", "Files:"),
#   stored into +record+ when the <li> ends;
# * a level-1 <li> describes one file, appended to the current entry's
#   data when the <li> ends.
#
# Parsing is aborted by throwing abort_tag when a </ul> is seen while no
# <li> is open.
class DetailListener
  include REXML::StreamListener

  # abort_tag:: tag thrown to stop parsing
  # id:: value of the "name" attribute that marks the target section
  # base_url:: prefix prepended to file hrefs
  # record:: object that receives the parsed entries via []=
  def initialize(abort_tag, id, base_url, record)
    @abort_tag = abort_tag
    @id = id
    @base_url = base_url
    @record = record
    @in_target = false
    @target_li_level = nil
    @key = nil
    @data = nil
    @file = nil
  end

  def tag_start(name, attributes)
    unless @in_target
      if attributes["name"] == @id
        @in_target = true
        @target_li_level = -1
      end
      return
    end

    case name
    when "li"
      @target_li_level += 1
      case @target_li_level
      when 0
        # A new entry starts: forget the previous one.
        @key = nil
        @data = nil
        @file = nil
      when 1
        @file = File.new
      end
    when "a"
      @file.url = @base_url + attributes["href"] if @file
    end
  end

  def tag_end(name)
    return unless @in_target

    case name
    when "ul"
      throw(@abort_tag) if @target_li_level == -1
    when "li"
      case @target_li_level
      when 0
        store_entry if @key
      when 1
        @data << @file if @data && @file
      end
      @target_li_level -= 1
    end
  end

  def text(data)
    case @target_li_level
    when 0
      if @key
        # Only textual entries continue across several text chunks.
        # Appending to an Integer would raise and appending to the files
        # list would mix raw strings into it.
        @data << data if @data.is_a?(String)
      else
        start_entry(data.gsub(/[ \t\n]+/, " "))
      end
    when 1
      if @file.name.nil?
        @file.name = data
      else
        @file.note = data.strip.gsub(/[()]/, "")
      end
    end
  end

  private

  # Recognizes the label at the beginning of an entry and initializes
  # @key and @data for it. Unrecognized text leaves both untouched.
  def start_entry(text)
    case text
    when /\ASource: /
      @key = :source
      # Regexp.last_match is used instead of $POSTMATCH because the
      # latter is only defined after require "English".
      @data = Regexp.last_match.post_match
    when /\APreprocessing: /
      @key = :preprocessing
      @data = Regexp.last_match.post_match
    when /\A\# of classes: (\d+)/
      @key = :n_classes
      @data = Integer(Regexp.last_match(1), 10)
    when /\A\# of data: ([\d,]+)/
      @key = :n_data
      @data = Integer(Regexp.last_match(1).gsub(/,/, ""), 10)
    when /\A\# of features: ([\d,]+)/
      @key = :n_features
      @data = Integer(Regexp.last_match(1).gsub(/,/, ""), 10)
    when /\AFiles:/
      @key = :files
      @data = []
    end
  end

  # Stores the finished entry; String data is whitespace-normalized.
  def store_entry
    value = @data
    value = value.gsub(/[ \t\n]+/, " ").strip if value.is_a?(String)
    @record[@key] = value
  end
end
|
239
|
+
|
240
|
+
# Stream listener that extracts the description paragraphs of the index
# page.
#
# Text is collected from the first <p> on. The collected pieces are
# joined with a space and appended to +description+ at every <br> and
# at every </p>. Parsing is aborted by throwing abort_tag when an <hr>
# is seen.
class DescriptionListener
  include REXML::StreamListener

  # abort_tag:: tag thrown to stop parsing
  # description:: array that receives the extracted paragraphs
  def initialize(abort_tag, description)
    @abort_tag = abort_tag
    @description = description
    @in_content = false
    @p = nil
  end

  def tag_start(name, attributes)
    case name
    when "p"
      @in_content = true
      @p = []
    when "br"
      # A <br> before the first <p> has nothing to flush: @p is still nil
      # and calling join on it would raise NoMethodError.
      return unless @p

      @description << @p.join(" ")
      @p = []
    when "hr"
      throw(@abort_tag)
    end
  end

  def tag_end(name)
    case name
    when "p"
      @description << @p.join(" ")
    end
  end

  def text(data)
    return unless @in_content

    content = data.gsub(/[ \t\n]+/, " ").strip
    @p << content unless content.empty?
  end
end
|
276
|
+
end
|
277
|
+
end
|