red-datasets 0.0.8 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
module Datasets
  # Error class of the Datasets namespace: a plain StandardError subclass
  # that adds no behavior of its own.
  Error = Class.new(StandardError)
end
@@ -0,0 +1,207 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Hepatitis < Dataset
7
# One row of the hepatitis data file. Values are converted from the raw CSV
# strings as they are assigned, both by the constructor and by the writers.
#
# Conversions:
# * label: "1" -> :die, "2" -> :live; anything else is stored unchanged.
# * sex: "1" -> :male, "2" -> :female; anything else is stored unchanged.
# * BOOLEAN_MEMBERS: "1" -> false, "2" -> true, "?" -> nil; anything else
#   (including nil) is stored unchanged.
# * INTEGER_MEMBERS / FLOAT_MEMBERS: "?" or nil -> nil; otherwise
#   Integer(value, 10) / Float(value), which raise on malformed input.
class Record < Struct.new(:label,
                          :age,
                          :sex,
                          :steroid,
                          :antivirals,
                          :fatigue,
                          :malaise,
                          :anorexia,
                          :liver_big,
                          :liver_firm,
                          :spleen_palpable,
                          :spiders,
                          :ascites,
                          :varices,
                          :bilirubin,
                          :alkaline_phosphate,
                          :sgot,
                          :albumin,
                          :protime,
                          :histology)
  # Members stored as true/false/nil.
  BOOLEAN_MEMBERS = [
    :steroid,
    :antivirals,
    :fatigue,
    :malaise,
    :anorexia,
    :liver_big,
    :liver_firm,
    :spleen_palpable,
    :spiders,
    :ascites,
    :varices,
    :histology,
  ].freeze

  # Members stored as Integer or nil.
  INTEGER_MEMBERS = [:age, :alkaline_phosphate, :sgot, :protime].freeze

  # Members stored as Float or nil.
  FLOAT_MEMBERS = [:bilirubin, :albumin].freeze

  # Builds a record from positional raw values, in member order.
  # Missing trailing values are assigned as nil; surplus values are ignored.
  def initialize(*values)
    # Start from an empty struct and go through the writers so that every
    # value is normalized.
    super()
    members.zip(values) do |member, value|
      __send__("#{member}=", value)
    end
  end

  def label=(label)
    case label
    when "1"
      super(:die)
    when "2"
      super(:live)
    else
      super(label)
    end
  end

  def sex=(sex)
    case sex
    when "1"
      super(:male)
    when "2"
      super(:female)
    else
      super(sex)
    end
  end

  # define_method requires super to be called with explicit arguments.
  BOOLEAN_MEMBERS.each do |member|
    define_method("#{member}=") do |value|
      super(normalize_boolean(value))
    end
  end

  INTEGER_MEMBERS.each do |member|
    define_method("#{member}=") do |value|
      super(normalize_integer(value))
    end
  end

  FLOAT_MEMBERS.each do |member|
    define_method("#{member}=") do |value|
      super(normalize_float(value))
    end
  end

  private
  def normalize_boolean(value)
    case value
    when "?"
      nil
    when "1"
      false
    when "2"
      true
    else
      value
    end
  end

  # nil is treated like the "?" missing-value marker so that short rows
  # (or Record.new without arguments) do not raise.
  def normalize_float(value)
    case value
    when "?", nil
      nil
    else
      Float(value)
    end
  end

  # See normalize_float for the nil handling.
  def normalize_integer(value)
    case value
    when "?", nil
      nil
    else
      Integer(value, 10)
    end
  end
end
160
+
161
# Sets the dataset metadata: id, name, URL, and a description that is
# produced lazily by calling read_names when the lambda is invoked.
def initialize
  super()
  @metadata.id = "hepatitis"
  @metadata.name = "Hepatitis"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
  @metadata.description = -> { read_names }
end
170
+
171
# Yields one Record per CSV row of the data file.
# Returns an enumerator for this method when called without a block.
def each
  return enum_for(__method__) unless block_given?

  open_data do |csv|
    csv.each { |fields| yield(Record.new(*fields)) }
  end
end
181
+
182
+ private
183
# URL prefix under which the data and names files are fetched
# (see the "#{base_url}/..." URLs built by open_data and read_names).
def base_url
  "https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis"
end
186
+
187
# Opens the cached "hepatitis.csv" with CSV and yields the CSV object.
# The file is downloaded from "#{base_url}/hepatitis.data" first when it is
# not yet present in the cache directory.
def open_data
  cached_csv = cache_dir_path + "hepatitis.csv"
  download(cached_csv, "#{base_url}/hepatitis.data") unless cached_csv.exist?
  CSV.open(cached_csv) { |csv| yield(csv) }
end
197
+
198
# Returns the content of the cached "hepatitis.names" file, downloading it
# from "#{base_url}/hepatitis.names" first when it is not cached yet.
def read_names
  cached_names = cache_dir_path + "hepatitis.names"
  download(cached_names, "#{base_url}/hepatitis.names") unless cached_names.exist?
  cached_names.read
end
206
+ end
207
+ end
@@ -1,5 +1,6 @@
1
- require "English"
2
- require "rexml/document"
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
3
4
 
4
5
  require_relative "dataset"
5
6
 
@@ -32,26 +33,17 @@ module Datasets
32
33
  end
33
34
  end
34
35
 
35
# Diff hunk for `each`. Lines prefixed "-" are the previous implementation,
# which loaded the whole page into a REXML::Document and walked "//tr".
# Lines prefixed "+" are its replacement, which streams the page.
#
# New version: returns an enumerator when no block is given. Otherwise the
# input from open_data is parsed by REXML::Parsers::StreamParser with an
# IndexListener; for every (href, record) pair the listener reports,
# parse_detail(href, record) is called and the record is then yielded.
# The surrounding `catch` supplies the tag the listener throws to stop
# parsing before the end of the input.
- def each
36
+ def each(&block)
36
37
  return to_enum(__method__) unless block_given?
37
38
 
38
39
  open_data do |input|
39
- # TODO: Improve performance
40
- document = REXML::Document.new(input)
41
- is_header = true
42
- document.each_element("//tr") do |tr|
43
- if is_header
44
- is_header = false
45
- next
40
+ catch do |abort_tag|
41
+ listener = IndexListener.new(abort_tag) do |href, record|
42
+ parse_detail(href, record)
43
+ yield(record)
46
44
  end
47
- name = tr.elements.first
48
- a = name.elements.first
49
- href = a.attributes["href"]
50
- record = Record.new
51
- record.name = a.text
52
- record.files = []
53
- parse_detail(href, record)
54
- yield(record)
45
+ parser = REXML::Parsers::StreamParser.new(input, listener)
46
+ parser.parse
55
47
  end
56
48
  end
57
49
  end
@@ -69,17 +61,11 @@ module Datasets
69
61
 
70
62
  def extract_description
71
63
  open_data do |input|
72
- document = REXML::Document.new(input)
73
64
  description = []
74
- in_content = false
75
- document.each_element("//body/*") do |element|
76
- unless in_content
77
- in_content = (element.name == "h1")
78
- next
79
- end
80
- break if element.name == "hr"
81
- content = extract_text(element)
82
- description << content unless content.empty?
65
+ catch do |abort_tag|
66
+ listener = DescriptionListener.new(abort_tag, description)
67
+ parser = REXML::Parsers::StreamParser.new(input, listener)
68
+ parser.parse
83
69
  end
84
70
  description.join("\n\n")
85
71
  end
@@ -102,36 +88,190 @@ module Datasets
102
88
 
103
89
# Diff hunk for `parse_detail`. Lines prefixed "-" are the previous
# implementation, which used REXML::Document and XPath. Lines prefixed "+"
# are its streaming replacement.
#
# Both versions split href at "#" into the detail page path and an anchor
# id, and fill `record` from the detail page opened by open_detail(path).
# New version: the page is parsed by REXML::Parsers::StreamParser with a
# DetailListener built from the anchor id, @metadata.url and the record.
# The surrounding `catch` supplies the tag the listener throws to stop
# parsing early.
  def parse_detail(href, record)
104
90
  path, id = href.split("#")
105
- open_detail(path) do |detail|
106
- detail_document = REXML::Document.new(detail)
107
- anchor = REXML::XPath.match(detail_document, "//*[@name='#{id}']")[0]
108
- ul = anchor.next_sibling
109
- ul.each_element do |li|
110
- text = extract_text(li)
111
- case text
112
- when /\ASource: /
113
- record.source = $POSTMATCH
114
- when /\APreprocessing: /
115
- record.preprocessing = $POSTMATCH
116
- when /\A\# of classes: (\d+)/
117
- record.n_classes = Integer($1, 10)
118
- when /\A\# of data: ([\d,]+)/
119
- record.n_data = Integer($1.gsub(/,/, ""), 10)
120
- when /\A\# of features: ([\d,]+)/
121
- record.n_features = Integer($1.gsub(/,/, ""), 10)
122
- when /\AFiles:/
123
- li.elements.first.each_element do |file_li|
124
- file_a = file_li.elements.first
125
- file = File.new
126
- file.name = file_a.text
127
- file.url = @metadata.url + file_a.attributes["href"]
128
- file_note = file_li.text
129
- file.note = file_note.strip.gsub(/[()]/, "") if file_note
130
- record.files << file
91
+ open_detail(path) do |input|
92
+ catch do |abort_tag|
93
+ listener = DetailListener.new(abort_tag, id, @metadata.url, record)
94
+ parser = REXML::Parsers::StreamParser.new(input, listener)
95
+ parser.parse
96
+ end
97
+ end
98
+ end
99
+
100
# Stream listener for the index table.
#
# Collects the <td> cells of each <tr> as hashes holding the cell text and,
# when the cell contains an <a>, its href. At </tr> a Record is built from
# the first cell (text -> name, files -> empty array) and the block given
# to the constructor is called with (href, record). Rows without any <td>
# are skipped. At </table> the abort tag is thrown to stop the parser.
class IndexListener
  include REXML::StreamListener

  def initialize(abort_tag, &block)
    @abort_tag = abort_tag
    @block = block
    @row = nil
    @in_td = false
  end

  def tag_start(name, attributes)
    if name == "tr"
      @row = []
    elsif name == "td"
      @in_td = true
      @row << {:text => ""}
    elsif name == "a"
      @row.last[:href] = attributes["href"] if @in_td
    end
  end

  def tag_end(name)
    throw(@abort_tag) if name == "table"

    if name == "td"
      @in_td = false
    elsif name == "tr"
      first_cell = @row.first
      # No <td> was seen in this row, so there is nothing to report.
      return unless first_cell
      entry = Record.new
      entry.name = first_cell[:text]
      entry.files = []
      @block.call(first_cell[:href], entry)
    end
  end

  def text(data)
    return unless @in_td
    @row.last[:text] << data
  end
end
142
+
143
# Stream listener that fills `record` from one section of a detail page.
#
# The section starts at the first element whose "name" attribute equals
# `id`. From there on, <li> nesting is tracked:
# * level 0 items are "Key: value" entries. The first text of an item
#   selects the key (Source, Preprocessing, # of classes, # of data,
#   # of features, Files); at </li> the value is stored with
#   record[key] = value, string values with whitespace runs collapsed.
# * level 1 items are files of the "Files:" entry. Each gets a new File
#   whose url is base_url + the href of the <a> inside it, whose name is the
#   first text and whose note is later text with parentheses removed.
#   NOTE(review): `File` is resolved from the enclosing namespace -
#   presumably the dataset's file struct rather than ::File; confirm.
# The abort tag is thrown at the first </ul> seen outside any tracked <li>.
class DetailListener
  include REXML::StreamListener

  def initialize(abort_tag, id, base_url, record)
    @abort_tag = abort_tag
    @id = id
    @base_url = base_url
    @record = record
    @in_target = false
    @target_li_level = nil
    @key = nil
    @data = nil
    @file = nil
  end

  def tag_start(name, attributes)
    unless @in_target
      if attributes["name"] == @id
        @in_target = true
        # -1 means "inside the section, but not inside any <li> yet".
        @target_li_level = -1
      end
      return
    end

    case name
    when "li"
      @target_li_level += 1
      case @target_li_level
      when 0
        @key = nil
        @data = nil
        @file = nil
      when 1
        @file = File.new
      end
    when "a"
      @file.url = @base_url + attributes["href"] if @file
    end
  end

  def tag_end(name)
    return unless @in_target

    case name
    when "ul"
      throw(@abort_tag) if @target_li_level == -1
    when "li"
      case @target_li_level
      when 0
        if @key
          data = @data
          data = data.gsub(/[ \t\n]+/, " ").strip if data.is_a?(String)
          @record[@key] = data
        end
      when 1
        @data << @file if @data and @file
      end
      @target_li_level -= 1
    end
  end

  def text(data)
    case @target_li_level
    when 0
      if @key
        # Only string values continue across text nodes. Appending to an
        # Integer would raise, and appending to the files array would mix
        # stray text in with the file entries.
        @data << data if @data.is_a?(String)
      else
        start_item(data)
      end
    when 1
      if @file.name.nil?
        @file.name = data
      else
        @file.note = data.strip.gsub(/[()]/, "")
      end
    end
  end

  private
  # Selects @key and the initial @data from the first text of a level 0
  # item. Regexp.last_match is used instead of $POSTMATCH so that this does
  # not depend on the "English" library being loaded.
  def start_item(data)
    case data.gsub(/[ \t\n]+/, " ")
    when /\ASource: /
      @key = :source
      @data = Regexp.last_match.post_match
    when /\APreprocessing: /
      @key = :preprocessing
      @data = Regexp.last_match.post_match
    when /\A\# of classes: (\d+)/
      @key = :n_classes
      @data = Integer($1, 10)
    when /\A\# of data: ([\d,]+)/
      @key = :n_data
      @data = Integer($1.gsub(/,/, ""), 10)
    when /\A\# of features: ([\d,]+)/
      @key = :n_features
      @data = Integer($1.gsub(/,/, ""), 10)
    when /\AFiles:/
      @key = :files
      @data = []
    end
  end
end
239
+
240
# Stream listener that collects paragraph text into `description`.
#
# Text is ignored until the first <p>. From then on every text node is
# whitespace-normalized and, unless empty, collected. The collected pieces
# are joined with a space and appended to `description` at each <br> and at
# each </p>. An <hr> throws the abort tag to stop the parser.
class DescriptionListener
  include REXML::StreamListener

  def initialize(abort_tag, description)
    @abort_tag = abort_tag
    @description = description
    @in_content = false
    @p = nil
  end

  def tag_start(name, attributes)
    case name
    when "p"
      @in_content = true
      @p = []
    when "br"
      # A <br> before the first <p> has nothing collected yet; ignore it
      # instead of calling join on nil.
      return if @p.nil?
      @description << @p.join(" ")
      @p = []
    when "hr"
      throw(@abort_tag)
    end
  end

  def tag_end(name)
    case name
    when "p"
      @description << @p.join(" ")
    end
  end

  def text(data)
    return unless @in_content
    content = data.gsub(/[ \t\n]+/, " ").strip
    @p << content unless content.empty?
  end
end
136
276
  end
137
277
  end