red-datasets 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c7a9199546e7a001c97e45c6fa28db15c0d96b748e527d9705dfee4e4b1db6fd
4
- data.tar.gz: c659f6ae1e658ad91210e4427be063463124d89ef90388d34ebfb73ceb49068a
3
+ metadata.gz: 81ed53e83d75d517052aaf07c66fe177f12f986584141c951ac1dcfa2fc88646
4
+ data.tar.gz: 94b9f3b8042eaad65304bf7c3d2fc35519f8328b0ca4e9f8a7ad9be13781a91e
5
5
  SHA512:
6
- metadata.gz: d8a23c4a165a596df22ce5bbe1f8f0cd5c0f002deecafbb26cd5e5f75abb3c0224c1013898162a67787159258d1b801395fc4d949c17939d95940664cffd5600
7
- data.tar.gz: f2fd4eb733e6205f138c4005627e815e3787040a8a4b6cce7eca9fd5d4adaa12263e17e8f5bd9394a851e5210f28736ee3c682c81e110da304ae17fb3f0bedba
6
+ metadata.gz: c73561ed005e4b58f27fc6de969605a22d57adf4bc5b5184e5cdb65739f1ac6b86f6ed67794bfe61164859fc4a1b0f80430bc819b2ea37ac455a560a6f008b13
7
+ data.tar.gz: 07560b09d68272dc7a959c16ec03975d1fa752f9d6930f0fd746c46e9236995606694f5899bad5bf770812c5a2d81e6f013353f680fc8adf65ad42bae514f57c
@@ -1,5 +1,38 @@
1
1
  # News
2
2
 
3
+ ## 0.0.9 - 2019-09-09
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::LIBSVMDatasetList`: Improved performance.
8
+
9
+ * `Datasets::Mushroom`: Added.
10
+ [GitHub#33][Patch by Yasuo Honda]
11
+
12
+ * `Datasets::Table#n_columns`: Added.
13
+
14
+ * `Datasets::Table#n_rows`: Added.
15
+
16
+ * `Datasets::Table#[]`: Added support for index access.
17
+
18
+ * `Datasets::Table#column_names`: Added.
19
+
20
+ * `Datasets::Table#size`: Added.
21
+
22
+ * `Datasets::Table#length`: Added.
23
+
24
+ * `Datasets::Table#each_column`: Added.
25
+
26
+ * `Datasets::Table#each_record`: Added.
27
+
28
+ * `Datasets::Table#find_record`: Added.
29
+
30
+ ### Thanks
31
+
32
+ * Yasuo Honda
33
+
34
+ ### Improvements
35
+
3
36
  ## 0.0.8 - 2019-03-24
4
37
 
5
38
  ### Improvements
@@ -7,6 +7,7 @@ require_relative "datasets/iris"
7
7
  require_relative "datasets/libsvm"
8
8
  require_relative "datasets/libsvm-dataset-list"
9
9
  require_relative "datasets/mnist"
10
+ require_relative "datasets/mushroom"
10
11
  require_relative "datasets/penn-treebank"
11
12
  require_relative "datasets/postal-code-japan"
12
13
  require_relative "datasets/wikipedia"
@@ -34,5 +34,17 @@ module Datasets
34
34
  downloader = Downloader.new(url)
35
35
  downloader.download(output_path)
36
36
  end
37
+
38
+ def extract_bz2(path)
39
+ input, output = IO.pipe
40
+ pid = spawn("bzcat", path.to_s, {:out => output})
41
+ begin
42
+ output.close
43
+ yield(input)
44
+ ensure
45
+ input.close
46
+ Process.waitpid(pid)
47
+ end
48
+ end
37
49
  end
38
50
  end
@@ -1,5 +1,6 @@
1
- require "English"
2
- require "rexml/document"
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
3
4
 
4
5
  require_relative "dataset"
5
6
 
@@ -32,26 +33,17 @@ module Datasets
32
33
  end
33
34
  end
34
35
 
35
- def each
36
+ def each(&block)
36
37
  return to_enum(__method__) unless block_given?
37
38
 
38
39
  open_data do |input|
39
- # TODO: Improve performance
40
- document = REXML::Document.new(input)
41
- is_header = true
42
- document.each_element("//tr") do |tr|
43
- if is_header
44
- is_header = false
45
- next
40
+ catch do |abort_tag|
41
+ listener = IndexListener.new(abort_tag) do |href, record|
42
+ parse_detail(href, record)
43
+ yield(record)
46
44
  end
47
- name = tr.elements.first
48
- a = name.elements.first
49
- href = a.attributes["href"]
50
- record = Record.new
51
- record.name = a.text
52
- record.files = []
53
- parse_detail(href, record)
54
- yield(record)
45
+ parser = REXML::Parsers::StreamParser.new(input, listener)
46
+ parser.parse
55
47
  end
56
48
  end
57
49
  end
@@ -69,17 +61,11 @@ module Datasets
69
61
 
70
62
  def extract_description
71
63
  open_data do |input|
72
- document = REXML::Document.new(input)
73
64
  description = []
74
- in_content = false
75
- document.each_element("//body/*") do |element|
76
- unless in_content
77
- in_content = (element.name == "h1")
78
- next
79
- end
80
- break if element.name == "hr"
81
- content = extract_text(element)
82
- description << content unless content.empty?
65
+ catch do |abort_tag|
66
+ listener = DescriptionListener.new(abort_tag, description)
67
+ parser = REXML::Parsers::StreamParser.new(input, listener)
68
+ parser.parse
83
69
  end
84
70
  description.join("\n\n")
85
71
  end
@@ -102,36 +88,190 @@ module Datasets
102
88
 
103
89
  def parse_detail(href, record)
104
90
  path, id = href.split("#")
105
- open_detail(path) do |detail|
106
- detail_document = REXML::Document.new(detail)
107
- anchor = REXML::XPath.match(detail_document, "//*[@name='#{id}']")[0]
108
- ul = anchor.next_sibling
109
- ul.each_element do |li|
110
- text = extract_text(li)
111
- case text
112
- when /\ASource: /
113
- record.source = $POSTMATCH
114
- when /\APreprocessing: /
115
- record.preprocessing = $POSTMATCH
116
- when /\A\# of classes: (\d+)/
117
- record.n_classes = Integer($1, 10)
118
- when /\A\# of data: ([\d,]+)/
119
- record.n_data = Integer($1.gsub(/,/, ""), 10)
120
- when /\A\# of features: ([\d,]+)/
121
- record.n_features = Integer($1.gsub(/,/, ""), 10)
122
- when /\AFiles:/
123
- li.elements.first.each_element do |file_li|
124
- file_a = file_li.elements.first
125
- file = File.new
126
- file.name = file_a.text
127
- file.url = @metadata.url + file_a.attributes["href"]
128
- file_note = file_li.text
129
- file.note = file_note.strip.gsub(/[()]/, "") if file_note
130
- record.files << file
91
+ open_detail(path) do |input|
92
+ catch do |abort_tag|
93
+ listener = DetailListener.new(abort_tag, id, @metadata.url, record)
94
+ parser = REXML::Parsers::StreamParser.new(input, listener)
95
+ parser.parse
96
+ end
97
+ end
98
+ end
99
+
100
+ class IndexListener
101
+ include REXML::StreamListener
102
+
103
+ def initialize(abort_tag, &block)
104
+ @abort_tag = abort_tag
105
+ @block = block
106
+ @row = nil
107
+ @in_td = false
108
+ end
109
+
110
+ def tag_start(name, attributes)
111
+ case name
112
+ when "tr"
113
+ @row = []
114
+ when "td"
115
+ @in_td = true
116
+ @row << {:text => ""}
117
+ when "a"
118
+ @row.last[:href] = attributes["href"] if @in_td
119
+ end
120
+ end
121
+
122
+ def tag_end(name)
123
+ case name
124
+ when "table"
125
+ throw(@abort_tag)
126
+ when "tr"
127
+ name_column = @row[0]
128
+ return unless name_column
129
+ record = Record.new
130
+ record.name = name_column[:text]
131
+ record.files = []
132
+ @block.call(name_column[:href], record)
133
+ when "td"
134
+ @in_td = false
135
+ end
136
+ end
137
+
138
+ def text(data)
139
+ @row.last[:text] << data if @in_td
140
+ end
141
+ end
142
+
143
+ class DetailListener
144
+ include REXML::StreamListener
145
+
146
+ def initialize(abort_tag, id, base_url, record)
147
+ @abort_tag = abort_tag
148
+ @id = id
149
+ @base_url = base_url
150
+ @record = record
151
+ @in_target = false
152
+ @target_li_level = nil
153
+ @key = nil
154
+ @data = nil
155
+ @file = nil
156
+ end
157
+
158
+ def tag_start(name, attributes)
159
+ if @in_target
160
+ case name
161
+ when "li"
162
+ @target_li_level += 1
163
+ case @target_li_level
164
+ when 0
165
+ @key = nil
166
+ @data = nil
167
+ @file = nil
168
+ when 1
169
+ @file = File.new
131
170
  end
171
+ when "a"
172
+ @file.url = @base_url + attributes["href"] if @file
173
+ end
174
+ else
175
+ if attributes["name"] == @id
176
+ @in_target = true
177
+ @target_li_level = -1
178
+ end
179
+ end
180
+ end
181
+
182
+ def tag_end(name)
183
+ if @in_target
184
+ case name
185
+ when "ul"
186
+ throw(@abort_tag) if @target_li_level == -1
187
+ when "li"
188
+ case @target_li_level
189
+ when 0
190
+ if @key
191
+ data = @data
192
+ data = data.gsub(/[ \t\n]+/, " ").strip if data.is_a?(String)
193
+ @record[@key] = data
194
+ end
195
+ when 1
196
+ @data << @file if @data and @file
197
+ end
198
+ @target_li_level -= 1
199
+ end
200
+ end
201
+ end
202
+
203
+ def text(data)
204
+ case @target_li_level
205
+ when 0
206
+ if @key
207
+ @data << data
208
+ else
209
+ case data.gsub(/[ \t\n]+/, " ")
210
+ when /\ASource: /
211
+ @key = :source
212
+ @data = $POSTMATCH
213
+ when /\APreprocessing: /
214
+ @key = :preprocessing
215
+ @data = $POSTMATCH
216
+ when /\A\# of classes: (\d+)/
217
+ @key = :n_classes
218
+ @data = Integer($1, 10)
219
+ when /\A\# of data: ([\d,]+)/
220
+ @key = :n_data
221
+ @data = Integer($1.gsub(/,/, ""), 10)
222
+ when /\A\# of features: ([\d,]+)/
223
+ @key = :n_features
224
+ @data = Integer($1.gsub(/,/, ""), 10)
225
+ when /\AFiles:/
226
+ @key = :files
227
+ @data = []
228
+ end
229
+ end
230
+ when 1
231
+ if @file.name.nil?
232
+ @file.name = data
233
+ else
234
+ @file.note = data.strip.gsub(/[()]/, "")
132
235
  end
133
236
  end
134
237
  end
135
238
  end
239
+
240
+ class DescriptionListener
241
+ include REXML::StreamListener
242
+
243
+ def initialize(abort_tag, description)
244
+ @abort_tag = abort_tag
245
+ @description = description
246
+ @in_content = false
247
+ @p = nil
248
+ end
249
+
250
+ def tag_start(name, attributes)
251
+ case name
252
+ when "p"
253
+ @in_content = true
254
+ @p = []
255
+ when "br"
256
+ @description << @p.join(" ")
257
+ @p = []
258
+ when "hr"
259
+ throw(@abort_tag)
260
+ end
261
+ end
262
+
263
+ def tag_end(name)
264
+ case name
265
+ when "p"
266
+ @description << @p.join(" ")
267
+ end
268
+ end
269
+
270
+ def text(data)
271
+ return unless @in_content
272
+ content = data.gsub(/[ \t\n]+/, " ").strip
273
+ @p << content unless content.empty?
274
+ end
275
+ end
136
276
  end
137
277
  end
@@ -103,15 +103,7 @@ module Datasets
103
103
  download(data_path, @file.url)
104
104
  end
105
105
  if data_path.extname == ".bz2"
106
- input, output = IO.pipe
107
- pid = spawn("bzcat", data_path.to_s, {:out => output})
108
- begin
109
- output.close
110
- yield(input)
111
- ensure
112
- input.close
113
- Process.waitpid(pid)
114
- end
106
+ extract_bz2(data_path, &block)
115
107
  else
116
108
  File.open(data_path, &block)
117
109
  end
@@ -0,0 +1,256 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class Mushroom < Dataset
7
+ Record = Struct.new(
8
+ :label,
9
+ :cap_shape,
10
+ :cap_surface,
11
+ :cap_color,
12
+ :bruises,
13
+ :odor,
14
+ :gill_attachment,
15
+ :gill_spacing,
16
+ :gill_size,
17
+ :gill_color,
18
+ :stalk_shape,
19
+ :stalk_root,
20
+ :stalk_surface_above_ring,
21
+ :stalk_surface_below_ring,
22
+ :stalk_color_above_ring,
23
+ :stalk_color_below_ring,
24
+ :veil_type,
25
+ :veil_color,
26
+ :n_rings,
27
+ :ring_type,
28
+ :spore_print_color,
29
+ :population,
30
+ :habitat,
31
+ )
32
+
33
+ def initialize
34
+ super()
35
+ @metadata.id = "mushroom"
36
+ @metadata.name = "Mushroom"
37
+ @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
38
+ @metadata.description = lambda do
39
+ read_names
40
+ end
41
+ end
42
+
43
+ def each
44
+ return to_enum(__method__) unless block_given?
45
+
46
+ open_data do |csv|
47
+ csv.each do |row|
48
+ next if row[0].nil?
49
+ record = Record.new(*row)
50
+ record.members.each do |member|
51
+ record[member] = CONVERTERS[member][record[member]]
52
+ end
53
+ yield(record)
54
+ end
55
+ end
56
+ end
57
+
58
+ private
59
+ def open_data
60
+ data_path = cache_dir_path + "agaricus-lepiota.data"
61
+ unless data_path.exist?
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
+ download(data_path, data_url)
64
+ end
65
+ CSV.open(data_path) do |csv|
66
+ yield(csv)
67
+ end
68
+ end
69
+
70
+ def read_names
71
+ names_path = cache_dir_path + "agaricus-lepiota.names"
72
+ unless names_path.exist?
73
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
74
+ download(names_path, names_url)
75
+ end
76
+ names_path.read
77
+ end
78
+
79
+ CONVERTERS = {
80
+ label: {
81
+ "p" => "poisonous",
82
+ "e" => "edible",
83
+ },
84
+ cap_shape: {
85
+ "b" => "bell",
86
+ "c" => "conical",
87
+ "x" => "convex",
88
+ "f" => "flat",
89
+ "k" => "knobbed",
90
+ "s" => "sunken",
91
+ },
92
+ cap_surface: {
93
+ "f" => "fibrous",
94
+ "g" => "grooves",
95
+ "y" => "scaly",
96
+ "s" => "smooth",
97
+ },
98
+ cap_color: {
99
+ "n" => "brown",
100
+ "b" => "buff",
101
+ "c" => "cinnamon",
102
+ "g" => "gray",
103
+ "r" => "green",
104
+ "p" => "pink",
105
+ "u" => "purple",
106
+ "e" => "red",
107
+ "w" => "white",
108
+ "y" => "yellow",
109
+ },
110
+ bruises: {
111
+ "t" => "bruises",
112
+ "f" => "no",
113
+ },
114
+ odor: {
115
+ "a" => "almond",
116
+ "l" => "anise",
117
+ "c" => "creosote",
118
+ "y" => "fishy",
119
+ "f" => "foul",
120
+ "m" => "musty",
121
+ "n" => "none",
122
+ "p" => "pungent",
123
+ "s" => "spicy",
124
+ },
125
+ gill_attachment: {
126
+ "a" => "attached",
127
+ "d" => "descending",
128
+ "f" => "free",
129
+ "n" => "notched",
130
+ },
131
+ gill_spacing: {
132
+ "c" => "close",
133
+ "w" => "crowded",
134
+ "d" => "distant",
135
+ },
136
+ gill_size: {
137
+ "b" => "broad",
138
+ "n" => "narrow",
139
+ },
140
+ gill_color: {
141
+ "k" => "black",
142
+ "n" => "brown",
143
+ "b" => "buff",
144
+ "h" => "chocolate",
145
+ "g" => "gray",
146
+ "r" => "green",
147
+ "o" => "orange",
148
+ "p" => "pink",
149
+ "u" => "purple",
150
+ "e" => "red",
151
+ "w" => "white",
152
+ "y" => "yellow",
153
+ },
154
+ stalk_shape: {
155
+ "e" => "enlarging",
156
+ "t" => "tapering",
157
+ },
158
+ stalk_root: {
159
+ "b" => "bulbous",
160
+ "c" => "club",
161
+ "u" => "cup",
162
+ "e" => "equal",
163
+ "z" => "rhizomorphs",
164
+ "r" => "rooted",
165
+ "?" => "missing",
166
+ },
167
+ stalk_surface_above_ring: {
168
+ "f" => "fibrous",
169
+ "y" => "scaly",
170
+ "k" => "silky",
171
+ "s" => "smooth",
172
+ },
173
+ stalk_surface_below_ring: {
174
+ "f" => "fibrous",
175
+ "y" => "scaly",
176
+ "k" => "silky",
177
+ "s" => "smooth",
178
+ },
179
+ stalk_color_above_ring: {
180
+ "n" => "brown",
181
+ "b" => "buff",
182
+ "c" => "cinnamon",
183
+ "g" => "gray",
184
+ "o" => "orange",
185
+ "p" => "pink",
186
+ "e" => "red",
187
+ "w" => "white",
188
+ "y" => "yellow",
189
+ },
190
+ stalk_color_below_ring: {
191
+ "n" => "brown",
192
+ "b" => "buff",
193
+ "c" => "cinnamon",
194
+ "g" => "gray",
195
+ "o" => "orange",
196
+ "p" => "pink",
197
+ "e" => "red",
198
+ "w" => "white",
199
+ "y" => "yellow",
200
+ },
201
+ veil_type: {
202
+ "p" => "partial",
203
+ "u" => "universal",
204
+ },
205
+ veil_color: {
206
+ "n" => "brown",
207
+ "o" => "orange",
208
+ "w" => "white",
209
+ "y" => "yellow",
210
+ },
211
+ n_rings: {
212
+ "n" => 0,
213
+ "o" => 1,
214
+ "t" => 2,
215
+ },
216
+ ring_type: {
217
+ "c" => "cobwebby",
218
+ "e" => "evanescent",
219
+ "f" => "flaring",
220
+ "l" => "large",
221
+ "n" => "none",
222
+ "p" => "pendant",
223
+ "s" => "sheathing",
224
+ "z" => "zone",
225
+ },
226
+ spore_print_color: {
227
+ "k" => "black",
228
+ "n" => "brown",
229
+ "b" => "buff",
230
+ "h" => "chocolate",
231
+ "r" => "green",
232
+ "o" => "orange",
233
+ "u" => "purple",
234
+ "w" => "white",
235
+ "y" => "yellow",
236
+ },
237
+ population: {
238
+ "a" => "abundant",
239
+ "c" => "clustered",
240
+ "n" => "numerous",
241
+ "s" => "scattered",
242
+ "v" => "several",
243
+ "y" => "solitary",
244
+ },
245
+ habitat: {
246
+ "g" => "grasses",
247
+ "l" => "leaves",
248
+ "m" => "meadows",
249
+ "p" => "paths",
250
+ "u" => "urban",
251
+ "w" => "waste",
252
+ "d" => "woods",
253
+ }
254
+ }
255
+ end
256
+ end
@@ -2,19 +2,99 @@ require "datasets/dictionary"
2
2
 
3
3
  module Datasets
4
4
  class Table
5
+ class Record
6
+ include Enumerable
7
+
8
+ def initialize(table, index)
9
+ @table = table
10
+ @index = index
11
+ end
12
+
13
+ def [](column_name_or_column_index)
14
+ @table[column_name_or_column_index][@index]
15
+ end
16
+
17
+ def each
18
+ return to_enum(__method__) unless block_given?
19
+ @table.each_column.each do |column_name, column_values|
20
+ yield(column_name, column_values[@index])
21
+ end
22
+ end
23
+
24
+ def values
25
+ @table.each_column.collect do |_column_name, column_values|
26
+ column_values[@index]
27
+ end
28
+ end
29
+
30
+ def to_h
31
+ hash = {}
32
+ each do |column_name, column_value|
33
+ hash[column_name] = column_value
34
+ end
35
+ hash
36
+ end
37
+
38
+ def inspect
39
+ "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
40
+ end
41
+ end
42
+
5
43
  include Enumerable
6
44
 
45
+ attr_reader :dataset
7
46
  def initialize(dataset)
8
47
  @dataset = dataset
9
48
  @dictionaries = {}
10
49
  end
11
50
 
12
- def each(&block)
51
+ def n_columns
52
+ columner_data.size
53
+ end
54
+ alias_method :size, :n_columns
55
+ alias_method :length, :n_columns
56
+
57
+ def n_rows
58
+ first_column = columner_data.first
59
+ return 0 if first_column.nil?
60
+ first_column[1].size
61
+ end
62
+
63
+ def column_names
64
+ columner_data.keys
65
+ end
66
+
67
+ def each_column(&block)
13
68
  columner_data.each(&block)
14
69
  end
70
+ alias_method :each, :each_column
15
71
 
16
- def [](name)
17
- columner_data[normalize_name(name)]
72
+ def each_record
73
+ return to_enum(__method__) unless block_given?
74
+ n_rows.times do |i|
75
+ yield(Record.new(self, i))
76
+ end
77
+ end
78
+
79
+ def find_record(row)
80
+ row += n_rows if row < 0
81
+ return nil if row < 0
82
+ return nil if row >= n_rows
83
+ Record.new(self, row)
84
+ end
85
+
86
+ def [](name_or_index)
87
+ case name_or_index
88
+ when Integer
89
+ index = name_or_index
90
+ columner_data.each_with_index do |(_name, values), i|
91
+ return values if i == index
92
+ end
93
+ nil
94
+ else
95
+ name = name_or_index
96
+ columner_data[normalize_name(name)]
97
+ end
18
98
  end
19
99
 
20
100
  def dictionary_encode(name)
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path
@@ -0,0 +1,80 @@
1
+ class MushroomTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Mushroom.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Mushroom::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 8124,
14
+ {
15
+ :label => "poisonous",
16
+ :cap_shape => "convex",
17
+ :cap_surface => "smooth",
18
+ :cap_color => "brown",
19
+ :bruises => "bruises",
20
+ :odor => "pungent",
21
+ :gill_attachment => "free",
22
+ :gill_spacing => "close",
23
+ :gill_size => "narrow",
24
+ :gill_color => "black",
25
+ :stalk_shape => "enlarging",
26
+ :stalk_root => "equal",
27
+ :stalk_surface_above_ring => "smooth",
28
+ :stalk_surface_below_ring => "smooth",
29
+ :stalk_color_above_ring => "white",
30
+ :stalk_color_below_ring => "white",
31
+ :veil_type => "partial",
32
+ :veil_color => "white",
33
+ :n_rings => 1,
34
+ :ring_type => "pendant",
35
+ :spore_print_color => "black",
36
+ :population => "scattered",
37
+ :habitat => "urban"
38
+ },
39
+ {
40
+ :label => "edible",
41
+ :cap_shape => "convex",
42
+ :cap_surface => "smooth",
43
+ :cap_color => "brown",
44
+ :bruises => "no",
45
+ :odor => "none",
46
+ :gill_attachment => "attached",
47
+ :gill_spacing => "close",
48
+ :gill_size => "broad",
49
+ :gill_color => "yellow",
50
+ :stalk_shape => "enlarging",
51
+ :stalk_root => "missing",
52
+ :stalk_surface_above_ring => "smooth",
53
+ :stalk_surface_below_ring => "smooth",
54
+ :stalk_color_above_ring => "orange",
55
+ :stalk_color_below_ring => "orange",
56
+ :veil_type => "partial",
57
+ :veil_color => "orange",
58
+ :n_rings => 1,
59
+ :ring_type => "pendant",
60
+ :spore_print_color => "orange",
61
+ :population => "clustered",
62
+ :habitat => "leaves"
63
+ }
64
+ ],
65
+ [
66
+ records.size,
67
+ records[0].to_h,
68
+ records[-1].to_h
69
+ ])
70
+ end
71
+
72
+ sub_test_case("#metadata") do
73
+ test("#description") do
74
+ description = @dataset.metadata.description
75
+ assert do
76
+ description.start_with?("1. Title: Mushroom Database")
77
+ end
78
+ end
79
+ end
80
+ end
@@ -3,9 +3,129 @@ class TableTest < Test::Unit::TestCase
3
3
  @table = Datasets::Iris.new.to_table
4
4
  end
5
5
 
6
- test("#[]") do
7
- assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
8
- @table[:petal_length].first(5))
6
+ test("#n_columns") do
7
+ assert_equal(5, @table.n_columns)
8
+ end
9
+
10
+ test("#n_rows") do
11
+ assert_equal(150, @table.n_rows)
12
+ end
13
+
14
+ test("#column_names") do
15
+ assert_equal([
16
+ :sepal_length,
17
+ :sepal_width,
18
+ :petal_length,
19
+ :petal_width,
20
+ :label,
21
+ ],
22
+ @table.column_names)
23
+ end
24
+
25
+ test("#each") do
26
+ shorten_hash = {}
27
+ @table.each do |name, values|
28
+ shorten_hash[name] = values.first(5)
29
+ end
30
+ assert_equal({
31
+ :label => ["Iris-setosa"] * 5,
32
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
33
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
34
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
35
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
36
+ },
37
+ shorten_hash)
38
+ end
39
+
40
+ test("#each_column") do
41
+ shorten_hash = {}
42
+ @table.each_column do |name, values|
43
+ shorten_hash[name] = values.first(5)
44
+ end
45
+ assert_equal({
46
+ :label => ["Iris-setosa"] * 5,
47
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
48
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
49
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
50
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
51
+ },
52
+ shorten_hash)
53
+ end
54
+
55
+ test("#each_record") do
56
+ records = []
57
+ @table.each_record do |record|
58
+ records << record
59
+ break if records.size == 3
60
+ end
61
+ assert_equal([
62
+ {
63
+ label: "Iris-setosa",
64
+ petal_length: 1.4,
65
+ petal_width: 0.2,
66
+ sepal_length: 5.1,
67
+ sepal_width: 3.5,
68
+ },
69
+ {
70
+ label: "Iris-setosa",
71
+ petal_length: 1.4,
72
+ petal_width: 0.2,
73
+ sepal_length: 4.9,
74
+ sepal_width: 3.0,
75
+ },
76
+ {
77
+ label: "Iris-setosa",
78
+ petal_length: 1.3,
79
+ petal_width: 0.2,
80
+ sepal_length: 4.7,
81
+ sepal_width: 3.2,
82
+ },
83
+ ],
84
+ records.collect(&:to_h))
85
+ end
86
+
87
+ sub_test_case("#find_record") do
88
+ test("positive") do
89
+ assert_equal({
90
+ label: "Iris-setosa",
91
+ petal_length: 1.4,
92
+ petal_width: 0.2,
93
+ sepal_length: 4.9,
94
+ sepal_width: 3.0,
95
+ },
96
+ @table.find_record(1).to_h)
97
+ end
98
+
99
+ test("positive - over") do
100
+ assert_nil(@table.find_record(151))
101
+ end
102
+
103
+ test("negative") do
104
+ assert_equal({
105
+ label: "Iris-virginica",
106
+ petal_length: 5.1,
107
+ petal_width: 1.8,
108
+ sepal_length: 5.9,
109
+ sepal_width: 3.0,
110
+ },
111
+ @table.find_record(-1).to_h)
112
+ end
113
+
114
+ test("negative - over") do
115
+ assert_nil(@table.find_record(-151))
116
+ end
117
+ end
118
+
119
+ sub_test_case("#[]") do
120
+ test("index") do
121
+ assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
122
+ @table[2].first(5))
123
+ end
124
+
125
+ test("name") do
126
+ assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
127
+ @table[:petal_length].first(5))
128
+ end
9
129
  end
10
130
 
11
131
  test("#dictionary_encode") do
@@ -58,21 +178,6 @@ class TableTest < Test::Unit::TestCase
58
178
  end
59
179
  end
60
180
 
61
- test("#each") do
62
- shorten_hash = {}
63
- @table.each do |name, values|
64
- shorten_hash[name] = values.first(5)
65
- end
66
- assert_equal({
67
- :label => ["Iris-setosa"] * 5,
68
- :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
69
- :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
70
- :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
71
- :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
72
- },
73
- shorten_hash)
74
- end
75
-
76
181
  test("#to_h") do
77
182
  shorten_hash = {}
78
183
  @table.to_h.each do |name, values|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-03-24 00:00:00.000000000 Z
12
+ date: 2019-09-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -138,6 +138,7 @@ files:
138
138
  - lib/datasets/libsvm.rb
139
139
  - lib/datasets/metadata.rb
140
140
  - lib/datasets/mnist.rb
141
+ - lib/datasets/mushroom.rb
141
142
  - lib/datasets/penn-treebank.rb
142
143
  - lib/datasets/postal-code-japan.rb
143
144
  - lib/datasets/table.rb
@@ -155,6 +156,7 @@ files:
155
156
  - test/test-libsvm-dataset-list.rb
156
157
  - test/test-libsvm.rb
157
158
  - test/test-mnist.rb
159
+ - test/test-mushroom.rb
158
160
  - test/test-penn-treebank.rb
159
161
  - test/test-postal-code-japan.rb
160
162
  - test/test-table.rb
@@ -180,23 +182,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
180
182
  version: '0'
181
183
  requirements: []
182
184
  rubyforge_project:
183
- rubygems_version: 2.7.6
185
+ rubygems_version: 2.7.6.2
184
186
  signing_key:
185
187
  specification_version: 4
186
188
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
187
189
  test_files:
188
- - test/test-iris.rb
189
- - test/test-wikipedia.rb
190
- - test/test-fashion-mnist.rb
191
- - test/test-wine.rb
192
- - test/test-postal-code-japan.rb
193
- - test/test-mnist.rb
194
- - test/helper.rb
195
190
  - test/test-adult.rb
196
191
  - test/test-libsvm.rb
197
- - test/run-test.rb
198
- - test/test-table.rb
199
- - test/test-cifar.rb
192
+ - test/test-wikipedia.rb
200
193
  - test/test-libsvm-dataset-list.rb
194
+ - test/helper.rb
195
+ - test/test-iris.rb
196
+ - test/test-table.rb
197
+ - test/run-test.rb
198
+ - test/test-wine.rb
201
199
  - test/test-penn-treebank.rb
200
+ - test/test-postal-code-japan.rb
201
+ - test/test-cifar.rb
202
+ - test/test-mnist.rb
203
+ - test/test-mushroom.rb
202
204
  - test/test-dictionary.rb
205
+ - test/test-fashion-mnist.rb