red-datasets 0.0.8 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/doc/text/news.md +93 -0
- data/lib/datasets.rb +9 -0
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +194 -54
- data/lib/datasets/libsvm.rb +1 -9
- data/lib/datasets/mnist.rb +6 -4
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar-gz-readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- data/test/test-table.rb +123 -18
- metadata +61 -15
@@ -0,0 +1,207 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class Hepatitis < Dataset
|
7
|
+
# One row of the hepatitis data file.
#
# Values are expected as the raw strings read from the CSV file. Each
# attribute writer converts its raw value:
#
# * label: "1" => :die, "2" => :live
# * sex: "1" => :male, "2" => :female
# * boolean attributes: "1" => false, "2" => true, "?" => nil
# * integer/float attributes: parsed strictly, "?" => nil
#
# A nil raw value (a column that is absent from the row) is kept as nil
# by every writer.
class Record < Struct.new(:label,
                          :age,
                          :sex,
                          :steroid,
                          :antivirals,
                          :fatigue,
                          :malaise,
                          :anorexia,
                          :liver_big,
                          :liver_firm,
                          :spleen_palpable,
                          :spiders,
                          :ascites,
                          :varices,
                          :bilirubin,
                          :alkaline_phosphate,
                          :sgot,
                          :albumin,
                          :protime,
                          :histology)
  # Builds a record from positional raw values.
  #
  # The values are routed through the attribute writers (instead of being
  # handed to Struct#initialize) so that each one gets normalized. Members
  # without a corresponding value are assigned nil.
  def initialize(*values)
    super()
    members.zip(values) do |member, value|
      __send__("#{member}=", value)
    end
  end

  def label=(label)
    case label
    when "1"
      super(:die)
    when "2"
      super(:live)
    else
      # Unknown codes (and nil) are stored unchanged.
      super(label)
    end
  end

  def age=(age)
    super(normalize_integer(age))
  end

  def sex=(sex)
    case sex
    when "1"
      super(:male)
    when "2"
      super(:female)
    else
      # Unknown codes (and nil) are stored unchanged.
      super(sex)
    end
  end

  def steroid=(steroid)
    super(normalize_boolean(steroid))
  end

  def antivirals=(antivirals)
    super(normalize_boolean(antivirals))
  end

  def fatigue=(fatigue)
    super(normalize_boolean(fatigue))
  end

  def malaise=(malaise)
    super(normalize_boolean(malaise))
  end

  def anorexia=(anorexia)
    super(normalize_boolean(anorexia))
  end

  def liver_big=(liver_big)
    super(normalize_boolean(liver_big))
  end

  def liver_firm=(liver_firm)
    super(normalize_boolean(liver_firm))
  end

  def spleen_palpable=(spleen_palpable)
    super(normalize_boolean(spleen_palpable))
  end

  def spiders=(spiders)
    super(normalize_boolean(spiders))
  end

  def ascites=(ascites)
    super(normalize_boolean(ascites))
  end

  def varices=(varices)
    super(normalize_boolean(varices))
  end

  def bilirubin=(bilirubin)
    super(normalize_float(bilirubin))
  end

  def alkaline_phosphate=(alkaline_phosphate)
    super(normalize_integer(alkaline_phosphate))
  end

  def sgot=(sgot)
    super(normalize_integer(sgot))
  end

  def albumin=(albumin)
    super(normalize_float(albumin))
  end

  def protime=(protime)
    super(normalize_integer(protime))
  end

  def histology=(histology)
    super(normalize_boolean(histology))
  end

  private

  # Maps "1" to false, "2" to true and "?" to nil. Anything else
  # (including nil) is returned unchanged.
  def normalize_boolean(value)
    case value
    when "?"
      nil
    when "1"
      false
    when "2"
      true
    else
      value
    end
  end

  # Returns nil for "?" or nil, otherwise the value parsed by Float().
  #
  # Raises ArgumentError when the value is not a valid float literal.
  def normalize_float(value)
    case value
    when nil, "?"
      # nil is treated like "?" so that a short row does not crash here
      # while the other writers accept nil.
      nil
    else
      Float(value)
    end
  end

  # Returns nil for "?" or nil, otherwise the value parsed as a base 10
  # integer by Integer().
  #
  # Raises ArgumentError when the value is not a valid decimal integer.
  def normalize_integer(value)
    case value
    when nil, "?"
      # nil is treated like "?" so that a short row does not crash here
      # while the other writers accept nil.
      nil
    else
      Integer(value, 10)
    end
  end
end
|
160
|
+
|
161
|
+
# Sets the identifying metadata of the data set.
#
# The description is assigned as a lambda wrapping read_names, so the
# names file is not read by this method itself.
def initialize
  super()
  @metadata.id = "hepatitis"
  @metadata.name = "Hepatitis"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
  @metadata.description = -> { read_names }
end
|
170
|
+
|
171
|
+
# Yields one Record per row of the data file.
#
# Returns an Enumerator over the same records when no block is given.
def each
  return to_enum(__method__) unless block_given?

  open_data do |csv|
    csv.each { |row| yield(Record.new(*row)) }
  end
end
|
181
|
+
|
182
|
+
private
|
183
|
+
# Returns the URL prefix under which the data and names files are
# downloaded (see the "#{base_url}/..." interpolations in this class).
def base_url
  "https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis"
end
|
186
|
+
|
187
|
+
# Opens the cached data file as CSV and yields the CSV object.
#
# The file is stored as "hepatitis.csv" in the cache directory and is
# downloaded from "#{base_url}/hepatitis.data" first when it does not
# exist yet.
def open_data
  csv_path = cache_dir_path + "hepatitis.csv"
  download(csv_path, "#{base_url}/hepatitis.data") unless csv_path.exist?
  CSV.open(csv_path) { |csv| yield(csv) }
end
|
197
|
+
|
198
|
+
# Returns the content of the cached "hepatitis.names" file.
#
# The file is downloaded from "#{base_url}/hepatitis.names" into the
# cache directory first when it does not exist yet.
def read_names
  cached_names = cache_dir_path + "hepatitis.names"
  download(cached_names, "#{base_url}/hepatitis.names") unless cached_names.exist?
  cached_names.read
end
|
206
|
+
end
|
207
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
|
-
require "
|
2
|
-
require "rexml/
|
1
|
+
require "rexml/streamlistener"
|
2
|
+
require "rexml/parsers/baseparser"
|
3
|
+
require "rexml/parsers/streamparser"
|
3
4
|
|
4
5
|
require_relative "dataset"
|
5
6
|
|
@@ -32,26 +33,17 @@ module Datasets
|
|
32
33
|
end
|
33
34
|
end
|
34
35
|
|
35
|
-
def each
|
36
|
+
# Yields one record per data set row found on the index page.
#
# The index is parsed as a stream: for every row reported by
# IndexListener the detail page is parsed into the record
# (parse_detail) before the record is yielded. The listener aborts
# parsing by throwing the catch tag.
#
# Returns an Enumerator when no block is given.
def each(&block)
  return to_enum(__method__) unless block_given?

  open_data do |input|
    catch do |abort_tag|
      listener = IndexListener.new(abort_tag) do |href, record|
        parse_detail(href, record)
        yield(record)
      end
      REXML::Parsers::StreamParser.new(input, listener).parse
    end
  end
end
|
@@ -69,17 +61,11 @@ module Datasets
|
|
69
61
|
|
70
62
|
def extract_description
|
71
63
|
open_data do |input|
|
72
|
-
document = REXML::Document.new(input)
|
73
64
|
description = []
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
next
|
79
|
-
end
|
80
|
-
break if element.name == "hr"
|
81
|
-
content = extract_text(element)
|
82
|
-
description << content unless content.empty?
|
65
|
+
catch do |abort_tag|
|
66
|
+
listener = DescriptionListener.new(abort_tag, description)
|
67
|
+
parser = REXML::Parsers::StreamParser.new(input, listener)
|
68
|
+
parser.parse
|
83
69
|
end
|
84
70
|
description.join("\n\n")
|
85
71
|
end
|
@@ -102,36 +88,190 @@ module Datasets
|
|
102
88
|
|
103
89
|
# Fills record from the detail page referenced by href.
#
# href is split at "#" into the page path and the anchor id. The page is
# stream-parsed with a DetailListener, which aborts parsing by throwing
# the catch tag.
def parse_detail(href, record)
  detail_path, anchor_id = href.split("#")
  open_detail(detail_path) do |input|
    catch do |abort_tag|
      listener = DetailListener.new(abort_tag, anchor_id, @metadata.url, record)
      REXML::Parsers::StreamParser.new(input, listener).parse
    end
  end
end
|
99
|
+
|
100
|
+
# Stream listener for the index page.
#
# Collects the <td> cells of each <tr> and, when a row ends with at
# least one cell, calls the block with the first cell's link target and
# a new Record whose name is that cell's text and whose files list is
# empty. Parsing is aborted by throwing abort_tag when a </table> is
# seen.
#
# NOTE(review): Record is resolved from the enclosing scope; it is not
# defined in this class.
class IndexListener
  include REXML::StreamListener

  def initialize(abort_tag, &block)
    @abort_tag = abort_tag
    @block = block
    @cells = nil
    @inside_td = false
  end

  def tag_start(name, attributes)
    if name == "tr"
      @cells = []
    elsif name == "td"
      @inside_td = true
      @cells << {text: ""}
    elsif name == "a" && @inside_td
      # Links are only recorded for the cell currently being read.
      @cells.last[:href] = attributes["href"]
    end
  end

  def tag_end(name)
    case name
    when "table"
      throw(@abort_tag)
    when "td"
      @inside_td = false
    when "tr"
      emit_row
    end
  end

  def text(data)
    return unless @inside_td
    @cells.last[:text] << data
  end

  private

  # Reports the finished row through the block. Rows without any <td>
  # cell are skipped.
  def emit_row
    name_cell = @cells.first
    return if name_cell.nil?
    record = Record.new
    record.name = name_cell[:text]
    record.files = []
    @block.call(name_cell[:href], record)
  end
end
|
142
|
+
|
143
|
+
# Stream listener for a detail page.
#
# Nothing is processed until an element whose "name" attribute equals id
# is seen. From then on <li> nesting is tracked in @target_li_level:
# -1 means outside any <li>, 0 is a top-level item, 1 is an item nested
# in a top-level item.
#
# A top-level item whose text starts with one of the known labels
# ("Source: ", "Preprocessing: ", "# of classes: ", "# of data: ",
# "# of features: ", "Files:") is stored into record under the matching
# key when the item ends. Nested items are collected as file entries.
# Parsing is aborted by throwing abort_tag when a </ul> is seen outside
# any <li>.
class DetailListener
  include REXML::StreamListener

  def initialize(abort_tag, id, base_url, record)
    @abort_tag = abort_tag
    @id = id
    @base_url = base_url
    @record = record
    @in_target = false
    @target_li_level = nil
    @key = nil
    @data = nil
    @file = nil
  end

  def tag_start(name, attributes)
    if @in_target
      case name
      when "li"
        @target_li_level += 1
        case @target_li_level
        when 0
          # A new top-level item starts: forget the previous item.
          @key = nil
          @data = nil
          @file = nil
        when 1
          # NOTE(review): File is resolved from the enclosing scope;
          # presumably the file entry type of this data set -- confirm.
          @file = File.new
        end
      when "a"
        @file.url = @base_url + attributes["href"] if @file
      end
    elsif attributes["name"] == @id
      @in_target = true
      @target_li_level = -1
    end
  end

  def tag_end(name)
    return unless @in_target

    case name
    when "ul"
      throw(@abort_tag) if @target_li_level == -1
    when "li"
      case @target_li_level
      when 0
        if @key
          data = @data
          data = data.gsub(/[ \t\n]+/, " ").strip if data.is_a?(String)
          @record[@key] = data
        end
      when 1
        @data << @file if @data && @file
      end
      @target_li_level -= 1
    end
  end

  def text(data)
    case @target_li_level
    when 0
      if @key
        # Only textual values are continued by following text nodes.
        # Appending to an Integer would raise and appending to the files
        # list would add stray text to it.
        @data << data if @data.is_a?(String)
      else
        # Regexp.last_match is used instead of $POSTMATCH so that this
        # code does not depend on the English library being loaded.
        case data.gsub(/[ \t\n]+/, " ")
        when /\ASource: /
          @key = :source
          @data = Regexp.last_match.post_match
        when /\APreprocessing: /
          @key = :preprocessing
          @data = Regexp.last_match.post_match
        when /\A\# of classes: (\d+)/
          @key = :n_classes
          @data = Integer(Regexp.last_match(1), 10)
        when /\A\# of data: ([\d,]+)/
          @key = :n_data
          @data = Integer(Regexp.last_match(1).gsub(/,/, ""), 10)
        when /\A\# of features: ([\d,]+)/
          @key = :n_features
          @data = Integer(Regexp.last_match(1).gsub(/,/, ""), 10)
        when /\AFiles:/
          @key = :files
          @data = []
        end
      end
    when 1
      # The first text of a nested item is the file name, later text is
      # stored as the note with parentheses removed.
      if @file.name.nil?
        @file.name = data
      else
        @file.note = data.strip.gsub(/[()]/, "")
      end
    end
  end
end
|
239
|
+
|
240
|
+
# Stream listener that collects description paragraphs.
#
# Text is gathered from the first <p> on. Whitespace runs inside each
# text node are collapsed to single spaces, and the non-empty pieces of
# a paragraph are joined with a space. A paragraph is appended to
# description at each <br> and at </p>. Parsing is aborted by throwing
# abort_tag when an <hr> is seen.
class DescriptionListener
  include REXML::StreamListener

  def initialize(abort_tag, description)
    @abort_tag = abort_tag
    @description = description
    @in_content = false
    @p = nil
  end

  def tag_start(name, attributes)
    case name
    when "p"
      @in_content = true
      @p = []
    when "br"
      # A <br> before the first <p> has no paragraph buffer to flush.
      return unless @p
      @description << @p.join(" ")
      @p = []
    when "hr"
      throw(@abort_tag)
    end
  end

  def tag_end(name)
    @description << @p.join(" ") if name == "p"
  end

  def text(data)
    return unless @in_content
    content = data.gsub(/[ \t\n]+/, " ").strip
    @p << content unless content.empty?
  end
end
|
136
276
|
end
|
137
277
|
end
|