red-datasets 0.0.7 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
module Datasets
  # Library-specific error type: a plain StandardError subclass that lives
  # in the Datasets namespace.
  class Error < StandardError; end
end
@@ -0,0 +1,207 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Hepatitis < Dataset
7
# One row of the hepatitis data. Each member has a writer that converts the
# raw field string into a Ruby value before storing it.
class Record < Struct.new(:label, :age, :sex, :steroid, :antivirals,
                          :fatigue, :malaise, :anorexia, :liver_big,
                          :liver_firm, :spleen_palpable, :spiders,
                          :ascites, :varices, :bilirubin,
                          :alkaline_phosphate, :sgot, :albumin,
                          :protime, :histology)
  # Assigns +values+ to the members by position. Every assignment goes
  # through the member's writer so the conversions below are applied.
  def initialize(*values)
    super()
    members.each_with_index do |member, index|
      __send__("#{member}=", values[index])
    end
  end

  # "1" is stored as :die and "2" as :live; anything else is stored as given.
  def label=(label)
    super({"1" => :die, "2" => :live}.fetch(label, label))
  end

  # "1" is stored as :male and "2" as :female; anything else is stored as given.
  def sex=(sex)
    super({"1" => :male, "2" => :female}.fetch(sex, sex))
  end

  # Writers for the yes/no members (see normalize_boolean).
  [
    :steroid, :antivirals, :fatigue, :malaise, :anorexia, :liver_big,
    :liver_firm, :spleen_palpable, :spiders, :ascites, :varices, :histology,
  ].each do |member|
    define_method("#{member}=") do |value|
      super(normalize_boolean(value))
    end
  end

  # Writers for the integer members (see normalize_integer).
  [:age, :alkaline_phosphate, :sgot, :protime].each do |member|
    define_method("#{member}=") do |value|
      super(normalize_integer(value))
    end
  end

  # Writers for the floating point members (see normalize_float).
  [:bilirubin, :albumin].each do |member|
    define_method("#{member}=") do |value|
      super(normalize_float(value))
    end
  end

  private

  # "?" becomes nil, "1" becomes false, "2" becomes true; any other value is
  # returned unchanged.
  def normalize_boolean(value)
    {"?" => nil, "1" => false, "2" => true}.fetch(value, value)
  end

  # "?" becomes nil; everything else is converted with Kernel#Float, which
  # raises on input it cannot convert.
  def normalize_float(value)
    "?" == value ? nil : Float(value)
  end

  # "?" becomes nil; everything else is parsed as a base-10 integer with
  # Kernel#Integer, which raises on input it cannot convert.
  def normalize_integer(value)
    "?" == value ? nil : Integer(value, 10)
  end
end
160
+
161
# Sets the dataset's id, name and URL on @metadata. The description is
# assigned as a lambda that calls read_names when invoked.
def initialize
  super()
  @metadata.id = "hepatitis"
  @metadata.name = "Hepatitis"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
  @metadata.description = -> { read_names }
end
170
+
171
# Yields a Record built from each row that open_data provides. Without a
# block, returns an Enumerator for this method instead.
def each
  if block_given?
    open_data do |csv|
      csv.each { |fields| yield(Record.new(*fields)) }
    end
  else
    to_enum(__method__)
  end
end
181
+
182
+ private
183
# URL prefix under which open_data and read_names look for the
# hepatitis.data and hepatitis.names files.
def base_url
  "https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis"
end
186
+
187
# Downloads hepatitis.data into the cache directory as hepatitis.csv when it
# is not already there, then opens it with CSV and yields the CSV object.
def open_data
  data_path = cache_dir_path + "hepatitis.csv"
  download(data_path, "#{base_url}/hepatitis.data") unless data_path.exist?
  CSV.open(data_path) { |csv| yield(csv) }
end
197
+
198
# Downloads hepatitis.names into the cache directory when it is not already
# there, then returns the file's content.
def read_names
  names_path = cache_dir_path + "hepatitis.names"
  download(names_path, "#{base_url}/hepatitis.names") unless names_path.exist?
  names_path.read
end
206
+ end
207
+ end
@@ -0,0 +1,277 @@
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
4
+
5
+ require_relative "dataset"
6
+
7
+ module Datasets
8
+ class LIBSVMDatasetList < Dataset
9
# A downloadable file entry: its name, its URL and an optional note.
File = Struct.new(:name, :url, :note)
12
# The fields collected for one listed dataset.
class Record < Struct.new(:name,
                          :source,
                          :preprocessing,
                          :n_classes,
                          :n_data,
                          :n_features,
                          :files)
  # Returns the record as a Hash. Each entry of +files+ is converted with
  # its own +to_h+ as well, so the result contains no nested structs.
  #
  # A record whose +files+ member was never assigned keeps nil under :files
  # instead of raising NoMethodError.
  def to_h
    hash = super
    files = hash[:files]
    hash[:files] = files.collect(&:to_h) if files
    hash
  end
end
25
+
26
# Sets the dataset's id, name and URL on @metadata. The description is
# assigned as a lambda that calls extract_description when invoked.
def initialize
  super()
  @metadata.id = "libsvm-dataset-list"
  @metadata.name = "LIBSVM dataset list"
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
  @metadata.description = -> { extract_description }
end
35
+
36
# Stream-parses the index page and yields one record per row reported by
# IndexListener, after parse_detail has been called for that row's link.
# Without a block, returns an Enumerator for this method instead.
def each(&block)
  unless block_given?
    return to_enum(__method__)
  end

  open_data do |input|
    # The listener throws abort_tag to stop parsing early.
    catch do |abort_tag|
      listener = IndexListener.new(abort_tag) do |href, record|
        parse_detail(href, record)
        yield(record)
      end
      REXML::Parsers::StreamParser.new(input, listener).parse
    end
  end
end
50
+
51
+ private
52
# Downloads the page at @metadata.url into the cache directory as index.html
# when it is not already there, then opens it and yields the IO.
def open_data
  index_path = cache_dir_path + "index.html"
  download(index_path, @metadata.url) unless index_path.exist?
  # ::File is the core class; plain File in this class is a Struct.
  ::File.open(index_path) { |input| yield(input) }
end
61
+
62
# Stream-parses the index page with DescriptionListener and returns the
# collected pieces joined by blank lines.
def extract_description
  open_data do |input|
    paragraphs = []
    # The listener throws abort_tag to stop parsing early.
    catch do |abort_tag|
      listener = DescriptionListener.new(abort_tag, paragraphs)
      REXML::Parsers::StreamParser.new(input, listener).parse
    end
    paragraphs.join("\n\n")
  end
end
73
+
74
# Returns all text found under +element+, concatenated, with each run of
# spaces, tabs and newlines collapsed to one space and the ends stripped.
def extract_text(element)
  raw = REXML::XPath.match(element, ".//text()").join("")
  raw.gsub(/[ \t\n]+/, " ").strip
end
78
+
79
# Downloads the page +detail+ (a path relative to @metadata.url) into the
# cache directory when it is not already there, then opens it and yields
# the IO.
def open_detail(detail)
  detail_path = cache_dir_path + detail
  download(detail_path, @metadata.url + detail) unless detail_path.exist?
  # ::File is the core class; plain File in this class is a Struct.
  ::File.open(detail_path) { |input| yield(input) }
end
88
+
89
# Splits +href+ at "#" into a page path and an anchor id, then stream-parses
# that page with a DetailListener that is given +record+ to fill in.
def parse_detail(href, record)
  page, anchor = href.split("#")
  open_detail(page) do |input|
    # The listener throws abort_tag to stop parsing early.
    catch do |abort_tag|
      listener = DetailListener.new(abort_tag, anchor, @metadata.url, record)
      REXML::Parsers::StreamParser.new(input, listener).parse
    end
  end
end
99
+
100
# Stream listener for the index page. For each table row that has at least
# one cell, it builds a Record named after the first cell's text and calls
# the block with that cell's link target and the record. Parsing is aborted
# when the first table closes.
class IndexListener
  include REXML::StreamListener

  # abort_tag:: tag to throw to stop parsing.
  # block::     called as block.call(href, record) once per row.
  def initialize(abort_tag, &block)
    @abort_tag = abort_tag
    @block = block
    @row = nil
    @in_td = false
  end

  def tag_start(name, attributes)
    if name == "tr"
      @row = []
    elsif name == "td"
      @in_td = true
      @row << {:text => ""}
    elsif name == "a" && @in_td
      # Remember the link target on the cell currently being read.
      @row.last[:href] = attributes["href"]
    end
  end

  def tag_end(name)
    case name
    when "table" then throw(@abort_tag)
    when "tr" then emit_row
    when "td" then @in_td = false
    end
  end

  # Text is accumulated only while inside a cell.
  def text(data)
    return unless @in_td
    @row.last[:text] << data
  end

  private

  # Reports the finished row to the block; rows without any cell are skipped.
  def emit_row
    first_cell = @row[0]
    return unless first_cell
    record = Record.new
    record.name = first_cell[:text]
    record.files = []
    @block.call(first_cell[:href], record)
  end
end
142
+
143
# Stream listener for a detail page. It waits for the element whose "name"
# attribute equals +id+, then reads the <li> items that follow and stores
# what it recognizes ("Source: ", "Preprocessing: ", "# of classes: ",
# "# of data: ", "# of features: ", "Files:") on +record+. Nested <li> items
# become File entries. Parsing is aborted when a <ul> closes while no <li>
# is open.
class DetailListener
  include REXML::StreamListener

  # abort_tag:: tag to throw to stop parsing.
  # id::        value of the "name" attribute that marks the target section.
  # base_url::  prefix prepended to each file link's href.
  # record::    object filled in through record[key] = value.
  def initialize(abort_tag, id, base_url, record)
    @abort_tag = abort_tag
    @id = id
    @base_url = base_url
    @record = record
    @in_target = false
    # nil before the target is found, -1 outside any <li>, 0 inside a
    # top-level <li>, 1 inside a nested <li>.
    @target_li_level = nil
    @key = nil
    @data = nil
    @file = nil
  end

  def tag_start(name, attributes)
    if @in_target
      case name
      when "li"
        @target_li_level += 1
        case @target_li_level
        when 0
          # A new top-level item starts: forget the previous one.
          @key = nil
          @data = nil
          @file = nil
        when 1
          @file = File.new
        end
      when "a"
        @file.url = @base_url + attributes["href"] if @file
      end
    elsif attributes["name"] == @id
      @in_target = true
      @target_li_level = -1
    end
  end

  def tag_end(name)
    return unless @in_target

    case name
    when "ul"
      throw(@abort_tag) if @target_li_level == -1
    when "li"
      case @target_li_level
      when 0
        if @key
          data = @data
          data = data.gsub(/[ \t\n]+/, " ").strip if data.is_a?(String)
          @record[@key] = data
        end
      when 1
        @data << @file if @data && @file
      end
      @target_li_level -= 1
    end
  end

  def text(data)
    case @target_li_level
    when 0
      if @key
        # More text for an item whose key is already known.
        @data << data
      else
        # Regexp.last_match is used rather than $POSTMATCH so this does not
        # depend on the "English" library having been loaded; without it
        # $POSTMATCH is nil and the value would be lost.
        case data.gsub(/[ \t\n]+/, " ")
        when /\ASource: /
          @key = :source
          @data = Regexp.last_match.post_match
        when /\APreprocessing: /
          @key = :preprocessing
          @data = Regexp.last_match.post_match
        when /\A\# of classes: (\d+)/
          @key = :n_classes
          @data = Integer(Regexp.last_match(1), 10)
        when /\A\# of data: ([\d,]+)/
          @key = :n_data
          # Thousands separators are removed before parsing.
          @data = Integer(Regexp.last_match(1).gsub(/,/, ""), 10)
        when /\A\# of features: ([\d,]+)/
          @key = :n_features
          @data = Integer(Regexp.last_match(1).gsub(/,/, ""), 10)
        when /\AFiles:/
          @key = :files
          @data = []
        end
      end
    when 1
      # The first text in a nested item is the file name; later text is kept
      # as the note with surrounding whitespace and parentheses removed.
      if @file.name.nil?
        @file.name = data
      else
        @file.note = data.strip.gsub(/[()]/, "")
      end
    end
  end
end
239
+
240
# Stream listener that collects text into +description+. Text is gathered
# from the first <p> onwards; the pieces gathered so far are joined with a
# space and appended whenever a <br> is seen or a <p> closes. Parsing is
# aborted at the first <hr>.
class DescriptionListener
  include REXML::StreamListener

  # abort_tag::   tag to throw to stop parsing.
  # description:: array that receives the collected strings.
  def initialize(abort_tag, description)
    @abort_tag = abort_tag
    @description = description
    @in_content = false
    @p = nil
  end

  def tag_start(name, attributes)
    case name
    when "p"
      @in_content = true
      @p = []
    when "br"
      # Before the first <p> nothing has been gathered yet (@p is still
      # nil), so there is nothing to flush.
      return unless @p
      @description << @p.join(" ")
      @p = []
    when "hr"
      throw(@abort_tag)
    end
  end

  def tag_end(name)
    case name
    when "p"
      @description << @p.join(" ")
    end
  end

  # Collapses whitespace runs and keeps the text unless it ends up empty.
  def text(data)
    return unless @in_content
    content = data.gsub(/[ \t\n]+/, " ").strip
    @p << content unless content.empty?
  end
end
276
+ end
277
+ end