red-datasets 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c7a9199546e7a001c97e45c6fa28db15c0d96b748e527d9705dfee4e4b1db6fd
4
- data.tar.gz: c659f6ae1e658ad91210e4427be063463124d89ef90388d34ebfb73ceb49068a
3
+ metadata.gz: 81ed53e83d75d517052aaf07c66fe177f12f986584141c951ac1dcfa2fc88646
4
+ data.tar.gz: 94b9f3b8042eaad65304bf7c3d2fc35519f8328b0ca4e9f8a7ad9be13781a91e
5
5
  SHA512:
6
- metadata.gz: d8a23c4a165a596df22ce5bbe1f8f0cd5c0f002deecafbb26cd5e5f75abb3c0224c1013898162a67787159258d1b801395fc4d949c17939d95940664cffd5600
7
- data.tar.gz: f2fd4eb733e6205f138c4005627e815e3787040a8a4b6cce7eca9fd5d4adaa12263e17e8f5bd9394a851e5210f28736ee3c682c81e110da304ae17fb3f0bedba
6
+ metadata.gz: c73561ed005e4b58f27fc6de969605a22d57adf4bc5b5184e5cdb65739f1ac6b86f6ed67794bfe61164859fc4a1b0f80430bc819b2ea37ac455a560a6f008b13
7
+ data.tar.gz: 07560b09d68272dc7a959c16ec03975d1fa752f9d6930f0fd746c46e9236995606694f5899bad5bf770812c5a2d81e6f013353f680fc8adf65ad42bae514f57c
@@ -1,5 +1,38 @@
1
1
  # News
2
2
 
3
+ ## 0.0.9 - 2019-09-09
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::LIBSVMDatasetList`: Improved performance.
8
+
9
+ * `Datasets::Mushroom`: Added.
10
+ [GitHub#33][Patch by Yasuo Honda]
11
+
12
+ * `Datasets::Table#n_columns`: Added.
13
+
14
+ * `Datasets::Table#n_rows`: Added.
15
+
16
+ * `Datasets::Table#[]`: Added support for index access.
17
+
18
+ * `Datasets::Table#column_names`: Added.
19
+
20
+ * `Datasets::Table#size`: Added.
21
+
22
+ * `Datasets::Table#length`: Added.
23
+
24
+ * `Datasets::Table#each_column`: Added.
25
+
26
+ * `Datasets::Table#each_record`: Added.
27
+
28
+ * `Datasets::Table#find_record`: Added.
29
+
30
+ ### Thanks
31
+
32
+ * Yasuo Honda
33
+
34
+ ### Improvements
35
+
3
36
  ## 0.0.8 - 2019-03-24
4
37
 
5
38
  ### Improvements
@@ -7,6 +7,7 @@ require_relative "datasets/iris"
7
7
  require_relative "datasets/libsvm"
8
8
  require_relative "datasets/libsvm-dataset-list"
9
9
  require_relative "datasets/mnist"
10
+ require_relative "datasets/mushroom"
10
11
  require_relative "datasets/penn-treebank"
11
12
  require_relative "datasets/postal-code-japan"
12
13
  require_relative "datasets/wikipedia"
@@ -34,5 +34,17 @@ module Datasets
34
34
  downloader = Downloader.new(url)
35
35
  downloader.download(output_path)
36
36
  end
37
+
38
+ def extract_bz2(path)
39
+ input, output = IO.pipe
40
+ pid = spawn("bzcat", path.to_s, {:out => output})
41
+ begin
42
+ output.close
43
+ yield(input)
44
+ ensure
45
+ input.close
46
+ Process.waitpid(pid)
47
+ end
48
+ end
37
49
  end
38
50
  end
@@ -1,5 +1,6 @@
1
- require "English"
2
- require "rexml/document"
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
3
4
 
4
5
  require_relative "dataset"
5
6
 
@@ -32,26 +33,17 @@ module Datasets
32
33
  end
33
34
  end
34
35
 
35
- def each
36
+ def each(&block)
36
37
  return to_enum(__method__) unless block_given?
37
38
 
38
39
  open_data do |input|
39
- # TODO: Improve performance
40
- document = REXML::Document.new(input)
41
- is_header = true
42
- document.each_element("//tr") do |tr|
43
- if is_header
44
- is_header = false
45
- next
40
+ catch do |abort_tag|
41
+ listener = IndexListener.new(abort_tag) do |href, record|
42
+ parse_detail(href, record)
43
+ yield(record)
46
44
  end
47
- name = tr.elements.first
48
- a = name.elements.first
49
- href = a.attributes["href"]
50
- record = Record.new
51
- record.name = a.text
52
- record.files = []
53
- parse_detail(href, record)
54
- yield(record)
45
+ parser = REXML::Parsers::StreamParser.new(input, listener)
46
+ parser.parse
55
47
  end
56
48
  end
57
49
  end
@@ -69,17 +61,11 @@ module Datasets
69
61
 
70
62
  def extract_description
71
63
  open_data do |input|
72
- document = REXML::Document.new(input)
73
64
  description = []
74
- in_content = false
75
- document.each_element("//body/*") do |element|
76
- unless in_content
77
- in_content = (element.name == "h1")
78
- next
79
- end
80
- break if element.name == "hr"
81
- content = extract_text(element)
82
- description << content unless content.empty?
65
+ catch do |abort_tag|
66
+ listener = DescriptionListener.new(abort_tag, description)
67
+ parser = REXML::Parsers::StreamParser.new(input, listener)
68
+ parser.parse
83
69
  end
84
70
  description.join("\n\n")
85
71
  end
@@ -102,36 +88,190 @@ module Datasets
102
88
 
103
89
  def parse_detail(href, record)
104
90
  path, id = href.split("#")
105
- open_detail(path) do |detail|
106
- detail_document = REXML::Document.new(detail)
107
- anchor = REXML::XPath.match(detail_document, "//*[@name='#{id}']")[0]
108
- ul = anchor.next_sibling
109
- ul.each_element do |li|
110
- text = extract_text(li)
111
- case text
112
- when /\ASource: /
113
- record.source = $POSTMATCH
114
- when /\APreprocessing: /
115
- record.preprocessing = $POSTMATCH
116
- when /\A\# of classes: (\d+)/
117
- record.n_classes = Integer($1, 10)
118
- when /\A\# of data: ([\d,]+)/
119
- record.n_data = Integer($1.gsub(/,/, ""), 10)
120
- when /\A\# of features: ([\d,]+)/
121
- record.n_features = Integer($1.gsub(/,/, ""), 10)
122
- when /\AFiles:/
123
- li.elements.first.each_element do |file_li|
124
- file_a = file_li.elements.first
125
- file = File.new
126
- file.name = file_a.text
127
- file.url = @metadata.url + file_a.attributes["href"]
128
- file_note = file_li.text
129
- file.note = file_note.strip.gsub(/[()]/, "") if file_note
130
- record.files << file
91
+ open_detail(path) do |input|
92
+ catch do |abort_tag|
93
+ listener = DetailListener.new(abort_tag, id, @metadata.url, record)
94
+ parser = REXML::Parsers::StreamParser.new(input, listener)
95
+ parser.parse
96
+ end
97
+ end
98
+ end
99
+
100
+ class IndexListener
101
+ include REXML::StreamListener
102
+
103
+ def initialize(abort_tag, &block)
104
+ @abort_tag = abort_tag
105
+ @block = block
106
+ @row = nil
107
+ @in_td = false
108
+ end
109
+
110
+ def tag_start(name, attributes)
111
+ case name
112
+ when "tr"
113
+ @row = []
114
+ when "td"
115
+ @in_td = true
116
+ @row << {:text => ""}
117
+ when "a"
118
+ @row.last[:href] = attributes["href"] if @in_td
119
+ end
120
+ end
121
+
122
+ def tag_end(name)
123
+ case name
124
+ when "table"
125
+ throw(@abort_tag)
126
+ when "tr"
127
+ name_column = @row[0]
128
+ return unless name_column
129
+ record = Record.new
130
+ record.name = name_column[:text]
131
+ record.files = []
132
+ @block.call(name_column[:href], record)
133
+ when "td"
134
+ @in_td = false
135
+ end
136
+ end
137
+
138
+ def text(data)
139
+ @row.last[:text] << data if @in_td
140
+ end
141
+ end
142
+
143
+ class DetailListener
144
+ include REXML::StreamListener
145
+
146
+ def initialize(abort_tag, id, base_url, record)
147
+ @abort_tag = abort_tag
148
+ @id = id
149
+ @base_url = base_url
150
+ @record = record
151
+ @in_target = false
152
+ @target_li_level = nil
153
+ @key = nil
154
+ @data = nil
155
+ @file = nil
156
+ end
157
+
158
+ def tag_start(name, attributes)
159
+ if @in_target
160
+ case name
161
+ when "li"
162
+ @target_li_level += 1
163
+ case @target_li_level
164
+ when 0
165
+ @key = nil
166
+ @data = nil
167
+ @file = nil
168
+ when 1
169
+ @file = File.new
131
170
  end
171
+ when "a"
172
+ @file.url = @base_url + attributes["href"] if @file
173
+ end
174
+ else
175
+ if attributes["name"] == @id
176
+ @in_target = true
177
+ @target_li_level = -1
178
+ end
179
+ end
180
+ end
181
+
182
+ def tag_end(name)
183
+ if @in_target
184
+ case name
185
+ when "ul"
186
+ throw(@abort_tag) if @target_li_level == -1
187
+ when "li"
188
+ case @target_li_level
189
+ when 0
190
+ if @key
191
+ data = @data
192
+ data = data.gsub(/[ \t\n]+/, " ").strip if data.is_a?(String)
193
+ @record[@key] = data
194
+ end
195
+ when 1
196
+ @data << @file if @data and @file
197
+ end
198
+ @target_li_level -= 1
199
+ end
200
+ end
201
+ end
202
+
203
+ def text(data)
204
+ case @target_li_level
205
+ when 0
206
+ if @key
207
+ @data << data
208
+ else
209
+ case data.gsub(/[ \t\n]+/, " ")
210
+ when /\ASource: /
211
+ @key = :source
212
+ @data = $POSTMATCH
213
+ when /\APreprocessing: /
214
+ @key = :preprocessing
215
+ @data = $POSTMATCH
216
+ when /\A\# of classes: (\d+)/
217
+ @key = :n_classes
218
+ @data = Integer($1, 10)
219
+ when /\A\# of data: ([\d,]+)/
220
+ @key = :n_data
221
+ @data = Integer($1.gsub(/,/, ""), 10)
222
+ when /\A\# of features: ([\d,]+)/
223
+ @key = :n_features
224
+ @data = Integer($1.gsub(/,/, ""), 10)
225
+ when /\AFiles:/
226
+ @key = :files
227
+ @data = []
228
+ end
229
+ end
230
+ when 1
231
+ if @file.name.nil?
232
+ @file.name = data
233
+ else
234
+ @file.note = data.strip.gsub(/[()]/, "")
132
235
  end
133
236
  end
134
237
  end
135
238
  end
239
+
240
+ class DescriptionListener
241
+ include REXML::StreamListener
242
+
243
+ def initialize(abort_tag, description)
244
+ @abort_tag = abort_tag
245
+ @description = description
246
+ @in_content = false
247
+ @p = nil
248
+ end
249
+
250
+ def tag_start(name, attributes)
251
+ case name
252
+ when "p"
253
+ @in_content = true
254
+ @p = []
255
+ when "br"
256
+ @description << @p.join(" ")
257
+ @p = []
258
+ when "hr"
259
+ throw(@abort_tag)
260
+ end
261
+ end
262
+
263
+ def tag_end(name)
264
+ case name
265
+ when "p"
266
+ @description << @p.join(" ")
267
+ end
268
+ end
269
+
270
+ def text(data)
271
+ return unless @in_content
272
+ content = data.gsub(/[ \t\n]+/, " ").strip
273
+ @p << content unless content.empty?
274
+ end
275
+ end
136
276
  end
137
277
  end
@@ -103,15 +103,7 @@ module Datasets
103
103
  download(data_path, @file.url)
104
104
  end
105
105
  if data_path.extname == ".bz2"
106
- input, output = IO.pipe
107
- pid = spawn("bzcat", data_path.to_s, {:out => output})
108
- begin
109
- output.close
110
- yield(input)
111
- ensure
112
- input.close
113
- Process.waitpid(pid)
114
- end
106
+ extract_bz2(data_path, &block)
115
107
  else
116
108
  File.open(data_path, &block)
117
109
  end
@@ -0,0 +1,256 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Mushroom < Dataset
7
+ Record = Struct.new(
8
+ :label,
9
+ :cap_shape,
10
+ :cap_surface,
11
+ :cap_color,
12
+ :bruises,
13
+ :odor,
14
+ :gill_attachment,
15
+ :gill_spacing,
16
+ :gill_size,
17
+ :gill_color,
18
+ :stalk_shape,
19
+ :stalk_root,
20
+ :stalk_surface_above_ring,
21
+ :stalk_surface_below_ring,
22
+ :stalk_color_above_ring,
23
+ :stalk_color_below_ring,
24
+ :veil_type,
25
+ :veil_color,
26
+ :n_rings,
27
+ :ring_type,
28
+ :spore_print_color,
29
+ :population,
30
+ :habitat,
31
+ )
32
+
33
+ def initialize
34
+ super()
35
+ @metadata.id = "mushroom"
36
+ @metadata.name = "Mushroom"
37
+ @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
38
+ @metadata.description = lambda do
39
+ read_names
40
+ end
41
+ end
42
+
43
+ def each
44
+ return to_enum(__method__) unless block_given?
45
+
46
+ open_data do |csv|
47
+ csv.each do |row|
48
+ next if row[0].nil?
49
+ record = Record.new(*row)
50
+ record.members.each do |member|
51
+ record[member] = CONVERTERS[member][record[member]]
52
+ end
53
+ yield(record)
54
+ end
55
+ end
56
+ end
57
+
58
+ private
59
+ def open_data
60
+ data_path = cache_dir_path + "agaricus-lepiota.data"
61
+ unless data_path.exist?
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
+ download(data_path, data_url)
64
+ end
65
+ CSV.open(data_path) do |csv|
66
+ yield(csv)
67
+ end
68
+ end
69
+
70
+ def read_names
71
+ names_path = cache_dir_path + "agaricus-lepiota.names"
72
+ unless names_path.exist?
73
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
74
+ download(names_path, names_url)
75
+ end
76
+ names_path.read
77
+ end
78
+
79
+ CONVERTERS = {
80
+ label: {
81
+ "p" => "poisonous",
82
+ "e" => "edible",
83
+ },
84
+ cap_shape: {
85
+ "b" => "bell",
86
+ "c" => "conical",
87
+ "x" => "convex",
88
+ "f" => "flat",
89
+ "k" => "knobbed",
90
+ "s" => "sunken",
91
+ },
92
+ cap_surface: {
93
+ "f" => "fibrous",
94
+ "g" => "grooves",
95
+ "y" => "scaly",
96
+ "s" => "smooth",
97
+ },
98
+ cap_color: {
99
+ "n" => "brown",
100
+ "b" => "buff",
101
+ "c" => "cinnamon",
102
+ "g" => "gray",
103
+ "r" => "green",
104
+ "p" => "pink",
105
+ "u" => "purple",
106
+ "e" => "red",
107
+ "w" => "white",
108
+ "y" => "yellow",
109
+ },
110
+ bruises: {
111
+ "t" => "bruises",
112
+ "f" => "no",
113
+ },
114
+ odor: {
115
+ "a" => "almond",
116
+ "l" => "anise",
117
+ "c" => "creosote",
118
+ "y" => "fishy",
119
+ "f" => "foul",
120
+ "m" => "musty",
121
+ "n" => "none",
122
+ "p" => "pungent",
123
+ "s" => "spicy",
124
+ },
125
+ gill_attachment: {
126
+ "a" => "attached",
127
+ "d" => "descending",
128
+ "f" => "free",
129
+ "n" => "notched",
130
+ },
131
+ gill_spacing: {
132
+ "c" => "close",
133
+ "w" => "crowded",
134
+ "d" => "distant",
135
+ },
136
+ gill_size: {
137
+ "b" => "broad",
138
+ "n" => "narrow",
139
+ },
140
+ gill_color: {
141
+ "k" => "black",
142
+ "n" => "brown",
143
+ "b" => "buff",
144
+ "h" => "chocolate",
145
+ "g" => "gray",
146
+ "r" => "green",
147
+ "o" => "orange",
148
+ "p" => "pink",
149
+ "u" => "purple",
150
+ "e" => "red",
151
+ "w" => "white",
152
+ "y" => "yellow",
153
+ },
154
+ stalk_shape: {
155
+ "e" => "enlarging",
156
+ "t" => "tapering",
157
+ },
158
+ stalk_root: {
159
+ "b" => "bulbous",
160
+ "c" => "club",
161
+ "u" => "cup",
162
+ "e" => "equal",
163
+ "z" => "rhizomorphs",
164
+ "r" => "rooted",
165
+ "?" => "missing",
166
+ },
167
+ stalk_surface_above_ring: {
168
+ "f" => "fibrous",
169
+ "y" => "scaly",
170
+ "k" => "silky",
171
+ "s" => "smooth",
172
+ },
173
+ stalk_surface_below_ring: {
174
+ "f" => "fibrous",
175
+ "y" => "scaly",
176
+ "k" => "silky",
177
+ "s" => "smooth",
178
+ },
179
+ stalk_color_above_ring: {
180
+ "n" => "brown",
181
+ "b" => "buff",
182
+ "c" => "cinnamon",
183
+ "g" => "gray",
184
+ "o" => "orange",
185
+ "p" => "pink",
186
+ "e" => "red",
187
+ "w" => "white",
188
+ "y" => "yellow",
189
+ },
190
+ stalk_color_below_ring: {
191
+ "n" => "brown",
192
+ "b" => "buff",
193
+ "c" => "cinnamon",
194
+ "g" => "gray",
195
+ "o" => "orange",
196
+ "p" => "pink",
197
+ "e" => "red",
198
+ "w" => "white",
199
+ "y" => "yellow",
200
+ },
201
+ veil_type: {
202
+ "p" => "partial",
203
+ "u" => "universal",
204
+ },
205
+ veil_color: {
206
+ "n" => "brown",
207
+ "o" => "orange",
208
+ "w" => "white",
209
+ "y" => "yellow",
210
+ },
211
+ n_rings: {
212
+ "n" => 0,
213
+ "o" => 1,
214
+ "t" => 2,
215
+ },
216
+ ring_type: {
217
+ "c" => "cobwebby",
218
+ "e" => "evanescent",
219
+ "f" => "flaring",
220
+ "l" => "large",
221
+ "n" => "none",
222
+ "p" => "pendant",
223
+ "s" => "sheathing",
224
+ "z" => "zone",
225
+ },
226
+ spore_print_color: {
227
+ "k" => "black",
228
+ "n" => "brown",
229
+ "b" => "buff",
230
+ "h" => "chocolate",
231
+ "r" => "green",
232
+ "o" => "orange",
233
+ "u" => "purple",
234
+ "w" => "white",
235
+ "y" => "yellow",
236
+ },
237
+ population: {
238
+ "a" => "abundant",
239
+ "c" => "clustered",
240
+ "n" => "numerous",
241
+ "s" => "scattered",
242
+ "v" => "several",
243
+ "y" => "solitary",
244
+ },
245
+ habitat: {
246
+ "g" => "grasses",
247
+ "l" => "leaves",
248
+ "m" => "meadows",
249
+ "p" => "paths",
250
+ "u" => "urban",
251
+ "w" => "waste",
252
+ "d" => "woods",
253
+ }
254
+ }
255
+ end
256
+ end
@@ -2,19 +2,99 @@ require "datasets/dictionary"
2
2
 
3
3
  module Datasets
4
4
  class Table
5
+ class Record
6
+ include Enumerable
7
+
8
+ def initialize(table, index)
9
+ @table = table
10
+ @index = index
11
+ end
12
+
13
+ def [](column_name_or_column_index)
14
+ @table[column_name_or_column_index][@index]
15
+ end
16
+
17
+ def each
18
+ return to_enum(__method__) unless block_given?
19
+ @table.each_column.each do |column_name, column_values|
20
+ yield(column_name, column_values[@index])
21
+ end
22
+ end
23
+
24
+ def values
25
+ @table.each_column.collect do |_column_name, column_values|
26
+ column_values[@index]
27
+ end
28
+ end
29
+
30
+ def to_h
31
+ hash = {}
32
+ each do |column_name, column_value|
33
+ hash[column_name] = column_value
34
+ end
35
+ hash
36
+ end
37
+
38
+ def inspect
39
+ "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
40
+ end
41
+ end
42
+
5
43
  include Enumerable
6
44
 
45
+ attr_reader :dataset
7
46
  def initialize(dataset)
8
47
  @dataset = dataset
9
48
  @dictionaries = {}
10
49
  end
11
50
 
12
- def each(&block)
51
+ def n_columns
52
+ columner_data.size
53
+ end
54
+ alias_method :size, :n_columns
55
+ alias_method :length, :n_columns
56
+
57
+ def n_rows
58
+ first_column = columner_data.first
59
+ return 0 if first_column.nil?
60
+ first_column[1].size
61
+ end
62
+
63
+ def column_names
64
+ columner_data.keys
65
+ end
66
+
67
+ def each_column(&block)
13
68
  columner_data.each(&block)
14
69
  end
70
+ alias_method :each, :each_column
15
71
 
16
- def [](name)
17
- columner_data[normalize_name(name)]
72
+ def each_record
73
+ return to_enum(__method__) unless block_given?
74
+ n_rows.times do |i|
75
+ yield(Record.new(self, i))
76
+ end
77
+ end
78
+
79
+ def find_record(row)
80
+ row += n_rows if row < 0
81
+ return nil if row < 0
82
+ return nil if row >= n_rows
83
+ Record.new(self, row)
84
+ end
85
+
86
+ def [](name_or_index)
87
+ case name_or_index
88
+ when Integer
89
+ index = name_or_index
90
+ columner_data.each_with_index do |(_name, values), i|
91
+ return values if i == index
92
+ end
93
+ nil
94
+ else
95
+ name = name_or_index
96
+ columner_data[normalize_name(name)]
97
+ end
18
98
  end
19
99
 
20
100
  def dictionary_encode(name)
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path
@@ -0,0 +1,80 @@
1
+ class MushroomTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Mushroom.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Mushroom::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 8124,
14
+ {
15
+ :label => "poisonous",
16
+ :cap_shape => "convex",
17
+ :cap_surface => "smooth",
18
+ :cap_color => "brown",
19
+ :bruises => "bruises",
20
+ :odor => "pungent",
21
+ :gill_attachment => "free",
22
+ :gill_spacing => "close",
23
+ :gill_size => "narrow",
24
+ :gill_color => "black",
25
+ :stalk_shape => "enlarging",
26
+ :stalk_root => "equal",
27
+ :stalk_surface_above_ring => "smooth",
28
+ :stalk_surface_below_ring => "smooth",
29
+ :stalk_color_above_ring => "white",
30
+ :stalk_color_below_ring => "white",
31
+ :veil_type => "partial",
32
+ :veil_color => "white",
33
+ :n_rings => 1,
34
+ :ring_type => "pendant",
35
+ :spore_print_color => "black",
36
+ :population => "scattered",
37
+ :habitat => "urban"
38
+ },
39
+ {
40
+ :label => "edible",
41
+ :cap_shape => "convex",
42
+ :cap_surface => "smooth",
43
+ :cap_color => "brown",
44
+ :bruises => "no",
45
+ :odor => "none",
46
+ :gill_attachment => "attached",
47
+ :gill_spacing => "close",
48
+ :gill_size => "broad",
49
+ :gill_color => "yellow",
50
+ :stalk_shape => "enlarging",
51
+ :stalk_root => "missing",
52
+ :stalk_surface_above_ring => "smooth",
53
+ :stalk_surface_below_ring => "smooth",
54
+ :stalk_color_above_ring => "orange",
55
+ :stalk_color_below_ring => "orange",
56
+ :veil_type => "partial",
57
+ :veil_color => "orange",
58
+ :n_rings => 1,
59
+ :ring_type => "pendant",
60
+ :spore_print_color => "orange",
61
+ :population => "clustered",
62
+ :habitat => "leaves"
63
+ }
64
+ ],
65
+ [
66
+ records.size,
67
+ records[0].to_h,
68
+ records[-1].to_h
69
+ ])
70
+ end
71
+
72
+ sub_test_case("#metadata") do
73
+ test("#description") do
74
+ description = @dataset.metadata.description
75
+ assert do
76
+ description.start_with?("1. Title: Mushroom Database")
77
+ end
78
+ end
79
+ end
80
+ end
@@ -3,9 +3,129 @@ class TableTest < Test::Unit::TestCase
3
3
  @table = Datasets::Iris.new.to_table
4
4
  end
5
5
 
6
- test("#[]") do
7
- assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
8
- @table[:petal_length].first(5))
6
+ test("#n_columns") do
7
+ assert_equal(5, @table.n_columns)
8
+ end
9
+
10
+ test("#n_rows") do
11
+ assert_equal(150, @table.n_rows)
12
+ end
13
+
14
+ test("#column_names") do
15
+ assert_equal([
16
+ :sepal_length,
17
+ :sepal_width,
18
+ :petal_length,
19
+ :petal_width,
20
+ :label,
21
+ ],
22
+ @table.column_names)
23
+ end
24
+
25
+ test("#each") do
26
+ shorten_hash = {}
27
+ @table.each do |name, values|
28
+ shorten_hash[name] = values.first(5)
29
+ end
30
+ assert_equal({
31
+ :label => ["Iris-setosa"] * 5,
32
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
33
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
34
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
35
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
36
+ },
37
+ shorten_hash)
38
+ end
39
+
40
+ test("#each_column") do
41
+ shorten_hash = {}
42
+ @table.each_column do |name, values|
43
+ shorten_hash[name] = values.first(5)
44
+ end
45
+ assert_equal({
46
+ :label => ["Iris-setosa"] * 5,
47
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
48
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
49
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
50
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
51
+ },
52
+ shorten_hash)
53
+ end
54
+
55
+ test("#each_record") do
56
+ records = []
57
+ @table.each_record do |record|
58
+ records << record
59
+ break if records.size == 3
60
+ end
61
+ assert_equal([
62
+ {
63
+ label: "Iris-setosa",
64
+ petal_length: 1.4,
65
+ petal_width: 0.2,
66
+ sepal_length: 5.1,
67
+ sepal_width: 3.5,
68
+ },
69
+ {
70
+ label: "Iris-setosa",
71
+ petal_length: 1.4,
72
+ petal_width: 0.2,
73
+ sepal_length: 4.9,
74
+ sepal_width: 3.0,
75
+ },
76
+ {
77
+ label: "Iris-setosa",
78
+ petal_length: 1.3,
79
+ petal_width: 0.2,
80
+ sepal_length: 4.7,
81
+ sepal_width: 3.2,
82
+ },
83
+ ],
84
+ records.collect(&:to_h))
85
+ end
86
+
87
+ sub_test_case("#find_record") do
88
+ test("positive") do
89
+ assert_equal({
90
+ label: "Iris-setosa",
91
+ petal_length: 1.4,
92
+ petal_width: 0.2,
93
+ sepal_length: 4.9,
94
+ sepal_width: 3.0,
95
+ },
96
+ @table.find_record(1).to_h)
97
+ end
98
+
99
+ test("positive - over") do
100
+ assert_nil(@table.find_record(151))
101
+ end
102
+
103
+ test("negative") do
104
+ assert_equal({
105
+ label: "Iris-virginica",
106
+ petal_length: 5.1,
107
+ petal_width: 1.8,
108
+ sepal_length: 5.9,
109
+ sepal_width: 3.0,
110
+ },
111
+ @table.find_record(-1).to_h)
112
+ end
113
+
114
+ test("negative - over") do
115
+ assert_nil(@table.find_record(-151))
116
+ end
117
+ end
118
+
119
+ sub_test_case("#[]") do
120
+ test("index") do
121
+ assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
122
+ @table[2].first(5))
123
+ end
124
+
125
+ test("name") do
126
+ assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
127
+ @table[:petal_length].first(5))
128
+ end
9
129
  end
10
130
 
11
131
  test("#dictionary_encode") do
@@ -58,21 +178,6 @@ class TableTest < Test::Unit::TestCase
58
178
  end
59
179
  end
60
180
 
61
- test("#each") do
62
- shorten_hash = {}
63
- @table.each do |name, values|
64
- shorten_hash[name] = values.first(5)
65
- end
66
- assert_equal({
67
- :label => ["Iris-setosa"] * 5,
68
- :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
69
- :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
70
- :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
71
- :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
72
- },
73
- shorten_hash)
74
- end
75
-
76
181
  test("#to_h") do
77
182
  shorten_hash = {}
78
183
  @table.to_h.each do |name, values|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-03-24 00:00:00.000000000 Z
12
+ date: 2019-09-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -138,6 +138,7 @@ files:
138
138
  - lib/datasets/libsvm.rb
139
139
  - lib/datasets/metadata.rb
140
140
  - lib/datasets/mnist.rb
141
+ - lib/datasets/mushroom.rb
141
142
  - lib/datasets/penn-treebank.rb
142
143
  - lib/datasets/postal-code-japan.rb
143
144
  - lib/datasets/table.rb
@@ -155,6 +156,7 @@ files:
155
156
  - test/test-libsvm-dataset-list.rb
156
157
  - test/test-libsvm.rb
157
158
  - test/test-mnist.rb
159
+ - test/test-mushroom.rb
158
160
  - test/test-penn-treebank.rb
159
161
  - test/test-postal-code-japan.rb
160
162
  - test/test-table.rb
@@ -180,23 +182,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
180
182
  version: '0'
181
183
  requirements: []
182
184
  rubyforge_project:
183
- rubygems_version: 2.7.6
185
+ rubygems_version: 2.7.6.2
184
186
  signing_key:
185
187
  specification_version: 4
186
188
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
187
189
  test_files:
188
- - test/test-iris.rb
189
- - test/test-wikipedia.rb
190
- - test/test-fashion-mnist.rb
191
- - test/test-wine.rb
192
- - test/test-postal-code-japan.rb
193
- - test/test-mnist.rb
194
- - test/helper.rb
195
190
  - test/test-adult.rb
196
191
  - test/test-libsvm.rb
197
- - test/run-test.rb
198
- - test/test-table.rb
199
- - test/test-cifar.rb
192
+ - test/test-wikipedia.rb
200
193
  - test/test-libsvm-dataset-list.rb
194
+ - test/helper.rb
195
+ - test/test-iris.rb
196
+ - test/test-table.rb
197
+ - test/run-test.rb
198
+ - test/test-wine.rb
201
199
  - test/test-penn-treebank.rb
200
+ - test/test-postal-code-japan.rb
201
+ - test/test-cifar.rb
202
+ - test/test-mnist.rb
203
+ - test/test-mushroom.rb
202
204
  - test/test-dictionary.rb
205
+ - test/test-fashion-mnist.rb