red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
@@ -0,0 +1,4 @@
1
module Datasets
  # Exception class defined in the Datasets namespace; a direct subclass of
  # StandardError, so a bare `rescue` catches it.
  class Error < StandardError; end
end
@@ -0,0 +1,12 @@
1
+ require_relative 'mnist'
2
+
3
module Datasets
  # MNIST subclass that supplies its own BASE_URL constant and dataset name.
  # NOTE(review): how MNIST consumes BASE_URL and #dataset_name is not visible
  # in this file - confirm against mnist.rb.
  class FashionMNIST < MNIST
    BASE_URL = "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"

    # Name of this dataset as a plain string.
    private def dataset_name
      "Fashion-MNIST"
    end
  end
end
@@ -0,0 +1,207 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Hepatitis < Dataset
7
# One row of the hepatitis data file. Raw CSV cells (strings) are converted
# to Ruby values by the attribute writers:
#
# * label:  "1" => :die,  "2" => :live
# * sex:    "1" => :male, "2" => :female
# * flag attributes (steroid .. varices, histology):
#           "1" => false, "2" => true, "?" => nil
# * bilirubin, albumin: Float, "?" => nil
# * age, alkaline_phosphate, sgot, protime: base-10 Integer, "?" => nil
#
# Any other value given to label, sex or a flag attribute is stored as is.
# nil given to a numeric attribute is stored as nil (it is treated like a
# missing value), so a short row no longer raises from the constructor.
class Record < Struct.new(:label,
                          :age,
                          :sex,
                          :steroid,
                          :antivirals,
                          :fatigue,
                          :malaise,
                          :anorexia,
                          :liver_big,
                          :liver_firm,
                          :spleen_palpable,
                          :spiders,
                          :ascites,
                          :varices,
                          :bilirubin,
                          :alkaline_phosphate,
                          :sgot,
                          :albumin,
                          :protime,
                          :histology)
  # Builds a record from positional raw values, in member order.
  #
  # Every value goes through its writer so the conversions above apply.
  # Members without a corresponding value are assigned nil.
  def initialize(*values)
    super()
    members.zip(values) do |member, value|
      __send__("#{member}=", value)
    end
  end

  def label=(label)
    case label
    when "1"
      super(:die)
    when "2"
      super(:live)
    else
      super(label)
    end
  end

  def sex=(sex)
    case sex
    when "1"
      super(:male)
    when "2"
      super(:female)
    else
      super(sex)
    end
  end

  # Flag attributes share one conversion; generate their writers.
  # super needs explicit arguments inside define_method.
  [
    :steroid,
    :antivirals,
    :fatigue,
    :malaise,
    :anorexia,
    :liver_big,
    :liver_firm,
    :spleen_palpable,
    :spiders,
    :ascites,
    :varices,
    :histology,
  ].each do |flag|
    define_method("#{flag}=") do |value|
      super(normalize_boolean(value))
    end
  end

  # Integer-valued attributes.
  [:age, :alkaline_phosphate, :sgot, :protime].each do |count|
    define_method("#{count}=") do |value|
      super(normalize_integer(value))
    end
  end

  # Float-valued attributes.
  [:bilirubin, :albumin].each do |measure|
    define_method("#{measure}=") do |value|
      super(normalize_float(value))
    end
  end

  private
  # "?" => nil, "1" => false, "2" => true; anything else is returned as is.
  def normalize_boolean(value)
    case value
    when "?"
      nil
    when "1"
      false
    when "2"
      true
    else
      value
    end
  end

  # nil and "?" => nil; otherwise Float(value), which raises on bad input.
  def normalize_float(value)
    return nil if value.nil? || value == "?"
    Float(value)
  end

  # nil and "?" => nil; otherwise Integer(value, 10), which raises on bad
  # input.
  def normalize_integer(value)
    return nil if value.nil? || value == "?"
    Integer(value, 10)
  end
end
160
+
161
# Fills in the dataset metadata. The description is a lambda that calls
# read_names, so the names file is only read when the lambda is invoked.
def initialize
  super()
  @metadata.id = "hepatitis"
  @metadata.name = "Hepatitis"
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
  @metadata.description = -> { read_names }
end
170
+
171
# Yields one Record per row of the data file.
# Returns an enumerator for this method when called without a block.
def each
  return to_enum(__method__) unless block_given?

  open_data do |csv|
    csv.each { |row| yield(Record.new(*row)) }
  end
end
181
+
182
+ private
183
# URL prefix (no trailing slash) to which the data and names file names
# are appended by open_data and read_names.
def base_url
  "https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis"
end
186
+
187
# Yields a CSV reader opened on the cached data file ("hepatitis.csv" in
# cache_dir_path). When the cached file does not exist yet it is first
# fetched from "#{base_url}/hepatitis.data" via download.
def open_data
  data_path = cache_dir_path + "hepatitis.csv"
  download(data_path, "#{base_url}/hepatitis.data") unless data_path.exist?
  CSV.open(data_path) { |csv| yield(csv) }
end
197
+
198
# Returns the content of the cached "hepatitis.names" file. When the cached
# file does not exist yet it is first fetched from
# "#{base_url}/hepatitis.names" via download.
def read_names
  names_path = cache_dir_path + "hepatitis.names"
  download(names_path, "#{base_url}/hepatitis.names") unless names_path.exist?
  names_path.read
end
206
+ end
207
+ end
data/lib/datasets/iris.rb CHANGED
@@ -8,7 +8,7 @@ module Datasets
8
8
  :sepal_width,
9
9
  :petal_length,
10
10
  :petal_width,
11
- :class)
11
+ :label)
12
12
 
13
13
  def initialize
14
14
  super()
@@ -0,0 +1,277 @@
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
4
+
5
+ require_relative "dataset"
6
+
7
+ module Datasets
8
+ class LIBSVMDatasetList < Dataset
9
# A file entry attached to a dataset record: name, URL and a free-form note.
# This constant shadows ::File inside the enclosing class, which is why the
# I/O helpers in this class spell out ::File.
File = Struct.new(:name, :url, :note)
12
# One dataset entry: its name, the fields collected from the detail page
# (source, preprocessing, counts) and the list of file entries.
class Record < Struct.new(:name,
                          :source,
                          :preprocessing,
                          :n_classes,
                          :n_data,
                          :n_features,
                          :files)
  # Returns the record as a Hash with each entry of :files converted by its
  # own to_h, so the result contains no Struct instances.
  #
  # :files is left as nil when no file list was set, instead of raising
  # NoMethodError.
  def to_h
    hash = super
    files = hash[:files]
    hash[:files] = files.map(&:to_h) if files
    hash
  end
end
25
+
26
# Fills in the dataset metadata. The description is a lambda that calls
# extract_description, so the index page is only parsed for it when the
# lambda is invoked.
def initialize
  super()
  @metadata.id = "libsvm-dataset-list"
  @metadata.name = "LIBSVM dataset list"
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
  @metadata.description = -> { extract_description }
end
35
+
36
# Stream-parses the cached index page and yields one Record per table row
# reported by IndexListener, after parse_detail has filled the record from
# the page its link points to.
# Returns an enumerator for this method when called without a block.
def each(&block)
  return to_enum(__method__) unless block_given?

  open_data do |input|
    # IndexListener throws abort_tag to stop parsing at the end of the table.
    catch do |abort_tag|
      index_listener = IndexListener.new(abort_tag) do |href, record|
        parse_detail(href, record)
        yield(record)
      end
      REXML::Parsers::StreamParser.new(input, index_listener).parse
    end
  end
end
50
+
51
+ private
52
# Yields an IO opened on the cached index page ("index.html" in
# cache_dir_path). When the cached file does not exist yet it is first
# fetched from @metadata.url via download.
def open_data
  data_path = cache_dir_path + "index.html"
  download(data_path, @metadata.url) unless data_path.exist?
  ::File.open(data_path) { |input| yield(input) }
end
61
+
62
# Returns the paragraphs collected by DescriptionListener from the index
# page, joined with blank lines.
def extract_description
  open_data do |input|
    paragraphs = []
    # DescriptionListener throws abort_tag once it has seen enough.
    catch do |abort_tag|
      listener = DescriptionListener.new(abort_tag, paragraphs)
      REXML::Parsers::StreamParser.new(input, listener).parse
    end
    paragraphs.join("\n\n")
  end
end
73
+
74
# Concatenates every text node below element (in XPath ".//text()" order),
# collapses each run of spaces, tabs and newlines into one space and strips
# leading/trailing whitespace.
def extract_text(element)
  raw_text = REXML::XPath.match(element, ".//text()").join("")
  raw_text.gsub(/[ \t\n]+/, " ").strip
end
78
+
79
# Yields an IO opened on the cached copy of a detail page.
#
# detail is a path relative to @metadata.url; the same relative path is used
# below cache_dir_path. When the cached file does not exist yet it is first
# fetched via download.
def open_detail(detail)
  data_path = cache_dir_path + detail
  download(data_path, @metadata.url + detail) unless data_path.exist?
  ::File.open(data_path) { |input| yield(input) }
end
88
+
89
# Fills record from the detail page referenced by href.
#
# href has the form "page#anchor": the page part is opened through
# open_detail and the anchor part tells DetailListener which section of that
# page belongs to the record.
#
# Does nothing (returns nil) when href is nil or has no page part, e.g. for
# an index row whose first cell contains no link; previously this raised.
def parse_detail(href, record)
  path, id = href.to_s.split("#")
  return if path.nil? || path.empty?

  open_detail(path) do |input|
    # DetailListener throws abort_tag when the target section ends.
    catch do |abort_tag|
      listener = DetailListener.new(abort_tag, id, @metadata.url, record)
      parser = REXML::Parsers::StreamParser.new(input, listener)
      parser.parse
    end
  end
end
99
+
100
# Stream listener for the index page table.
#
# For every closed <tr> that contains at least one <td>, a Record is created
# with its name taken from the text of the first cell and an empty files
# list, and the block given to #initialize is called with
# (href of the link in the first cell, record). Rows without <td> cells are
# skipped. Parsing is aborted by throwing abort_tag when a </table> is seen.
class IndexListener
  include REXML::StreamListener

  # abort_tag:: tag of the enclosing catch, thrown at </table>
  # block::     called with (href, record) for each data row
  def initialize(abort_tag, &block)
    @abort_tag = abort_tag
    @block = block
    @row = nil
    @in_td = false
  end

  def tag_start(name, attributes)
    case name
    when "tr"
      @row = []
    when "td"
      @in_td = true
      # Tolerate a <td> that shows up before any <tr> instead of failing on
      # a nil row.
      @row ||= []
      @row << {:text => ""}
    when "a"
      @row.last[:href] = attributes["href"] if @in_td
    end
  end

  def tag_end(name)
    case name
    when "table"
      throw(@abort_tag)
    when "tr"
      name_column = @row && @row[0]
      return unless name_column
      record = Record.new
      record.name = name_column[:text]
      record.files = []
      @block.call(name_column[:href], record)
    when "td"
      @in_td = false
    end
  end

  # Appends character data to the text of the cell currently open.
  def text(data)
    @row.last[:text] << data if @in_td
  end
end
142
+
143
# Stream listener that fills one record from a detail page.
#
# Nothing is processed until a tag whose "name" attribute equals id is seen.
# From then on <li> nesting is tracked in @target_li_level (-1: inside the
# target section but outside any item, 0: a field item, 1: a file item
# nested in a field item):
#
# * level 0 items start with "Source: ", "Preprocessing: ",
#   "# of classes: N", "# of data: N", "# of features: N" or "Files:" and
#   are stored in the record under :source, :preprocessing, :n_classes,
#   :n_data, :n_features and :files when the item closes.
# * level 1 items become File entries (name from the first text, URL from
#   the link, note from later text with parentheses removed) that are
#   appended to the current list value.
#
# The first </ul> seen at level -1 throws abort_tag to stop parsing.
class DetailListener
  include REXML::StreamListener

  # abort_tag:: tag of the enclosing catch
  # id::        value of the "name" attribute that marks the target section
  # base_url::  prefix prepended to each file link
  # record::    object filled through record[key] = value
  def initialize(abort_tag, id, base_url, record)
    @abort_tag = abort_tag
    @id = id
    @base_url = base_url
    @record = record
    @in_target = false
    @target_li_level = nil
    @key = nil
    @data = nil
    @file = nil
  end

  def tag_start(name, attributes)
    unless @in_target
      if attributes["name"] == @id
        @in_target = true
        @target_li_level = -1
      end
      return
    end

    case name
    when "li"
      @target_li_level += 1
      case @target_li_level
      when 0
        @key = nil
        @data = nil
        @file = nil
      when 1
        @file = File.new
      end
    when "a"
      # Anchors without href (e.g. <a name="...">) carry no URL.
      href = attributes["href"]
      @file.url = @base_url + href if @file && href
    end
  end

  def tag_end(name)
    return unless @in_target

    case name
    when "ul"
      throw(@abort_tag) if @target_li_level == -1
    when "li"
      case @target_li_level
      when 0
        if @key
          data = @data
          data = data.gsub(/[ \t\n]+/, " ").strip if data.is_a?(String)
          @record[@key] = data
        end
      when 1
        # Only a list value ("Files:") can take file entries.
        @data << @file if @data.is_a?(Array) && @file
      end
      @target_li_level -= 1
    end
  end

  def text(data)
    case @target_li_level
    when 0
      if @key
        # Continuation text only extends string values; appending it to a
        # number or to the file list would raise or corrupt the value.
        @data << data if @data.is_a?(String)
      else
        case data.gsub(/[ \t\n]+/, " ")
        when /\ASource: /
          @key = :source
          # Regexp.last_match works without the "English" library, unlike
          # $POSTMATCH.
          @data = Regexp.last_match.post_match
        when /\APreprocessing: /
          @key = :preprocessing
          @data = Regexp.last_match.post_match
        when /\A\# of classes: (\d+)/
          @key = :n_classes
          @data = Integer(Regexp.last_match(1), 10)
        when /\A\# of data: ([\d,]+)/
          @key = :n_data
          @data = Integer(Regexp.last_match(1).delete(","), 10)
        when /\A\# of features: ([\d,]+)/
          @key = :n_features
          @data = Integer(Regexp.last_match(1).delete(","), 10)
        when /\AFiles:/
          @key = :files
          @data = []
        end
      end
    when 1
      if @file.name.nil?
        @file.name = data
      else
        @file.note = data.strip.gsub(/[()]/, "")
      end
    end
  end
end
239
+
240
# Stream listener that collects paragraph text into the description array.
#
# Text is gathered from the first <p> on, with whitespace runs collapsed and
# empty fragments dropped. The fragments gathered so far are joined with a
# space and appended to description at every <br> and at every </p>.
# An <hr> throws abort_tag to stop parsing.
class DescriptionListener
  include REXML::StreamListener

  # abort_tag::   tag of the enclosing catch, thrown at <hr>
  # description:: array that receives the collected paragraphs
  def initialize(abort_tag, description)
    @abort_tag = abort_tag
    @description = description
    @in_content = false
    @p = nil
  end

  def tag_start(name, attributes)
    case name
    when "p"
      @in_content = true
      @p = []
    when "br"
      # A <br> before the first <p> has nothing to flush.
      if @p
        @description << @p.join(" ")
        @p = []
      end
    when "hr"
      throw(@abort_tag)
    end
  end

  def tag_end(name)
    case name
    when "p"
      @description << @p.join(" ") if @p
    end
  end

  def text(data)
    return unless @in_content
    content = data.gsub(/[ \t\n]+/, " ").strip
    @p << content unless content.empty?
  end
end
276
+ end
277
+ end