iterationlabs-rarff 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.2@rarff --create
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source :rubygems
2
+ gem 'shoulda'
3
+ gem 'jeweler'
@@ -0,0 +1,17 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.6.4)
6
+ bundler (~> 1.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rake (0.9.2.2)
10
+ shoulda (2.11.3)
11
+
12
+ PLATFORMS
13
+ ruby
14
+
15
+ DEPENDENCIES
16
+ jeweler
17
+ shoulda
@@ -0,0 +1,18 @@
1
+ == Changes
2
+
3
+ === 0.2.3
4
+
5
+ * Added set_string_attributes_to_nominal for easy conversion of string attributes to nominal ones
6
+
7
+ === 0.2.2 (unofficial)
8
+
9
+ * Handles boolean inputs, which are modelled as nominals
10
+ * Handles spaces in nominals by replacing them with underscores. These values should probably be quoted instead, but this is good enough for now
11
+
12
+ === 0.2.1 (unofficial)
13
+
14
+ * Handles missing data in output, encoded internally as nil values
15
+
16
+ === 0.2.0 ?
17
+
18
+ * Sparse ARFF files (thanks to Tom Adams)
@@ -0,0 +1,5 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ lib/rarff.rb
@@ -0,0 +1,90 @@
1
+ = rarff
2
+
3
+ http://adenserparlance.blogspot.com/2007/01/rarff-simple-arff-library-in-ruby.html
4
+
5
+ == DESCRIPTION:
6
+
7
+ Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify data sets for data mining and machine learning.
8
+
9
+
10
+ == FEATURES/PROBLEMS:
11
+
12
+ === FEATURES
13
+ * Missing values - '?' are handled in creation of ARFF files
14
+
15
+ === PROBLEMS
16
+ * Spaces or quotes in nominal types
17
+ * Commas in quoted attributes or in nominal types
18
+ * Add error checking/validation
19
+ * Creation of sparse ARFF files
20
+ * Dates - do some work to create, translate, and interpret date format strings.
21
+
22
+ == SYNOPSIS:
23
+
24
+ arff_file_str = <<-END_OF_ARFF_FILE
25
+ @RELATION MyCoolRelation
26
+ @ATTRIBUTE Attr0 NUMERIC
27
+ @ATTRIBUTE subject STRING
28
+ @ATTRIBUTE Attr2 NUMERIC
29
+ @ATTRIBUTE Attr3 STRING
30
+ @ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"
31
+ @DATA
32
+ 1.4, 'foo bar', 5, baz, "1900-08-08 12:12:12"
33
+ 20.9, ruby, 46, rocks, "2005-10-23 12:12:12"
34
+ 0, ruby, 46, rocks, "2001-02-19 12:12:12"
35
+ 68.1, stuff, 728, 'is cool', "1974-02-10 12:12:12"
36
+ END_OF_ARFF_FILE
37
+
38
+ arff_file_str.gsub!(/\n$/, '')
39
+
40
+ instances = [ [1.4, 'foo bar', 5, 'baz', "1900-08-08 12:12:12"],
41
+ [20.9, 'ruby', 46, 'rocks', "2005-10-23 12:12:12"],
42
+ [0, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
43
+ [68.1, 'stuff', 728, 'is cool', "1974-02-10 12:12:12"]]
44
+
45
+ rel = Rarff::Relation.new('MyCoolRelation')
46
+ rel.instances = instances
47
+ rel.attributes[1].name = 'subject'
48
+ rel.attributes[4].name = 'birthday'
49
+ rel.attributes[4].type = 'DATE "yyyy-MM-dd HH:mm:ss"'
50
+
51
+ # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
52
+ assert_equal(arff_file_str, rel.to_arff, "Arff creation test failed.")
53
+
54
+ == REQUIREMENTS:
55
+
56
+ == INSTALL:
57
+
58
+ * sudo gem install iterationlabs-rarff
59
+
60
+ == LICENSE:
61
+
62
+ Copyright (c) 2008 Andy Payne
63
+ All rights reserved.
64
+
65
+ Redistribution and use in source and binary forms, with or without
66
+ modification, are permitted provided that the following conditions are met:
67
+
68
+ * Redistributions of source code must retain the above copyright notice,
69
+ this list of conditions and the following disclaimer.
70
+ * Redistributions in binary form must reproduce the above copyright notice,
71
+ this list of conditions and the following disclaimer in the
72
+ documentation and/or other materials provided with the distribution.
73
+ * Neither the name of the COPYRIGHT OWNER nor the names of its contributors
74
+ may be used to endorse or promote products derived from this software
75
+ without specific prior written permission.
76
+
77
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
78
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
79
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
80
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
81
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
82
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
83
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
84
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
85
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
86
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
87
+
88
+
89
+
90
+
@@ -0,0 +1,52 @@
1
# Rake tasks for the gem: packaging (Jeweler), unit tests and optional
# coverage (RCov). `rake` with no arguments runs the unit tests.
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: without it only the test tasks
# below are available.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "iterationlabs-rarff"
    gem.summary = %Q{Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files}
    # The line break after "specify " is part of the published description.
    gem.description = "Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify \n" +
                      "data sets for data mining and machine learning."
    gem.email = "donttrustben near gmail.com"
    gem.homepage = "http://github.com/iterationlabs/rarff"
    gem.authors = ["Ben J Woodcroft","Andy Payne", "Andrew Cantino"]
    # Same gem name as the Gemfile uses, so the generated gemspec does not
    # list two differently named shoulda dependencies.
    gem.add_development_dependency "shoulda", ">= 0"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage. When RCov is missing, the :rcov task still exists but aborts with
# an installation hint instead of failing with an unknown-task error.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# task :test => :check_dependencies

# require 'rdoc/task'
# Rake::RDocTask.new do |rdoc|
#   version = File.exist?('VERSION') ? File.read('VERSION') : ""
#
#   rdoc.rdoc_dir = 'rdoc'
#   rdoc.title = "blah #{version}"
#   rdoc.rdoc_files.include('README*')
#   rdoc.rdoc_files.include('lib/**/*.rb')
# end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.5
@@ -0,0 +1,338 @@
1
+ # = rarff
2
+
3
+ # This is the top-level include file for rarff. See the README file for
4
+ # details.
5
+
6
+ ################################################################################
7
+
8
+ # Custom scan that returns a boolean indicating whether the regex matched.
9
+ # TODO: Is there a way to avoid doing this?
10
class String
  # Like String#scan with a block, but reports whether anything matched.
  #
  # Each match is passed to the optional block exactly as String#scan would
  # pass it: the matched String when +re+ has no capture groups, otherwise an
  # Array holding the captures.
  #
  # Returns true if +re+ matched at least once, false otherwise.
  def my_scan(re)
    matches = 0
    scan(re) do |captured|
      yield captured if block_given?
      matches += 1
    end
    matches > 0
  end
end
20
+
21
+ ################################################################################
22
+
23
module Enumerable
  # Variant of #map whose block also receives the zero-based position of the
  # element being mapped.
  #
  # Yields (item, index) and returns an Array of the block's results, in
  # iteration order.
  def map_with_index
    position = 0
    map do |element|
      result = yield(element, position)
      position += 1
      result
    end
  end
end
36
+
37
+ ################################################################################
38
+
39
# Reading and writing of Attribute-Relation File Format (ARFF) data.
#
# Rarff::Relation holds a named table (attributes plus rows of instances) and
# converts it to and from ARFF text; Rarff::Attribute describes one column.
module Rarff

  COMMENT_MARKER = '%'
  RELATION_MARKER = '@RELATION'
  ATTRIBUTE_MARKER = '@ATTRIBUTE'
  DATA_MARKER = '@DATA'

  SPARSE_ARFF_BEGIN = '{'
  ESC_SPARSE_ARFF_BEGIN = '\\' + SPARSE_ARFF_BEGIN
  SPARSE_ARFF_END = '}'
  ESC_SPARSE_ARFF_END = '\\' + SPARSE_ARFF_END

  ATTRIBUTE_NUMERIC = 'NUMERIC'
  ATTRIBUTE_REAL = 'REAL'
  ATTRIBUTE_INTEGER = 'INTEGER'
  ATTRIBUTE_STRING = 'STRING'
  ATTRIBUTE_DATE = 'DATE'
  # Model Boolean as a Nominal Attribute.
  # Use {false, true} not {true, false} because then in visualisations in Weka
  # true is to the right, which makes more intuitive sense
  ATTRIBUTE_BOOLEAN = '{false, true}'

  MISSING = '?'

  ################################################################################

  # One column of a relation: a name and an ARFF type.
  #
  # +type+ is a String for ordinary types (e.g. "NUMERIC", 'DATE "yyyy"').
  # For nominal attributes it is an Array of the allowed values, without the
  # surrounding braces; #to_arff adds the braces back.
  class Attribute
    attr_accessor :name
    attr_reader :type

    # A nominal specification such as "{a, b ,c}"; capture 1 is the text
    # between the braces.
    NOMINAL_SPEC = /\A\s*\{(.*)\}\s*\z/

    # name:: attribute name as written after @ATTRIBUTE.
    # type:: type String, nominal specification "{a,b}", or Array of
    #        nominal values.
    def initialize(name='', type='')
      @name = name
      @type_is_nominal = false
      self.type = type
    end


    # Replaces the type and re-detects whether it is nominal, so switching
    # an attribute from nominal back to a plain type works.
    def type=(type)
      @type = type
      check_nominal
    end


    # Normalises @type: a "{a,b}" String or an Array becomes an Array of
    # nominal values; anything else is kept as is and marked non-nominal.
    def check_nominal
      if @type.kind_of?(Array)
        # Copy so that add_nominal_value never mutates the caller's array.
        @type = @type.dup
        @type_is_nominal = true
      elsif @type.kind_of?(String) && (spec = NOMINAL_SPEC.match(@type))
        # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
        # The -1 limit keeps empty trailing values instead of dropping them.
        @type = spec[1].strip.split(/\s*,\s*/, -1)
        @type_is_nominal = true
      else
        @type_is_nominal = false
      end
    end


    # Appends +str+ to the nominal values, first turning a non-nominal
    # attribute into an empty nominal one. Returns the Array of values.
    def add_nominal_value(str)
      unless @type_is_nominal
        @type = []
        @type_is_nominal = true
      end

      @type << str
    end


    # Returns the "@ATTRIBUTE name type" declaration line. Spaces inside
    # nominal values are replaced by underscores.
    def to_arff
      if @type_is_nominal
        "#{ATTRIBUTE_MARKER} #{@name} {#{@type.join(',').gsub(' ', '_')}}"
      else
        "#{ATTRIBUTE_MARKER} #{@name} #{@type}"
      end
    end


    def to_s
      to_arff
    end

  end



  # A named data set: an Array of Attribute objects plus an Array of
  # instances (rows), each row being an Array with one cell per attribute.
  # A nil cell stands for a missing value.
  class Relation
    attr_accessor :name, :attributes
    attr_reader :instances


    def initialize(name='')
      @name = name
      @attributes = []
      @instances = []
    end


    # Reads ARFF text into this relation: sets the name, appends the declared
    # attributes and appends one instance per data row (dense or sparse).
    #
    # Cell values are kept as Strings; outer single quotes are removed.
    # Attributes declared in the header are kept; columns without a
    # declaration get generated "AttrN" attributes.
    #
    # Returns the Array of input lines (the value the original iteration
    # returned; callers normally ignore it).
    #
    # TODO: Doesn't handle commas in quoted attributes.
    def parse(str)
      in_data_section = false
      rows_added = false
      lines = str.split("\n")

      lines.each do |raw_line|
        # Stripping also drops a trailing "\r" from CRLF input.
        line = raw_line.strip
        next if line.empty?
        next if line.start_with?(COMMENT_MARKER)

        if (found = /\A#{RELATION_MARKER}\s*(.*)\z/i.match(line))
          @name = found[1]
          next
        end

        if (found = /\A#{ATTRIBUTE_MARKER}\s*(\S*)\s+(.*)\z/i.match(line))
          @attributes.push(Attribute.new(found[1], found[2]))
          next
        end

        if line =~ /\A#{DATA_MARKER}/i
          in_data_section = true
          next
        end

        next unless in_data_section ## Below is data section handling

        if (found = /\A#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}\z/.match(line))
          # Sparse ARFF row: "{index value, index value, ...}"
          @instances << expand_sparse(found[1])
        else
          @instances << line.split(/,\s*/).map { |field|
            # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
            field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
          }
        end
        rows_added = true
      end

      # Fill in attributes once, after all rows are known, without
      # replacing anything the header declared.
      create_attributes(true, false) if rows_added
      lines
    end


    # Assign instances to the internal array and regenerate the attributes
    # from them.
    # parse: treat cells as Strings and mark numeric-looking columns NUMERIC
    # (only reachable through #send, since a setter takes a single value).
    def instances=(instances, parse=false)
      @instances = instances
      create_attributes(parse)
    end



    # Derives attributes from the instances. Each column is typed by its
    # first non-nil cell; columns that never get a type default to NUMERIC.
    #
    # attr_parse:: when true, cells are expected to be Strings and only
    #              numeric-looking ones produce an attribute.
    # overwrite::  when false, attributes that already exist are kept.
    #
    # Raises ArgumentError when there are no instances, when the first
    # instance is empty, or when a cell has a class with no ARFF type.
    def create_attributes(attr_parse=false, overwrite=true)
      if @instances.nil? || @instances.empty? || @instances[0].empty?
        raise ArgumentError, "Not enough data to create ARFF attributes"
      end

      # Keep track of whether an attribute has been defined or not.
      # The only reason an attribute would not be defined in the first
      # row is if it has nil's in it.
      attributes_defined = {}
      @instances.each do |row|
        row.each_with_index do |col, j|
          next if attributes_defined[j] || col.nil?

          attributes_defined[j] = true #whatever happens, we are going to define it
          next if !overwrite && @attributes[j]

          if attr_parse
            # The String check keeps Integer placeholders (sparse rows)
            # from being matched against the regexp.
            if col.kind_of?(String) && col =~ /^\-?\d+\.?\d*$/
              @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
            end
            next #parse next column - this one is finished
          end

          # No parsing - just take it how it is
          @attributes[j] = Attribute.new("Attr#{j}", infer_type(col))
        end
      end

      # Make sure all attributes have a definition, because otherwise
      # needless errors are thrown
      @instances[0].each_index do |i|
        @attributes[i] ||= Attribute.new("Attr#{i}", ATTRIBUTE_NUMERIC)
      end
    end

    # Make all String type attributes into nominal attributes, because
    # they are more useful in WEKA because more techniques handle them than
    # strings.
    #
    # column_indices is an optional argument specifying the columns that
    # are to be set to nominal (0 based indexes). If nil (the default), then
    # all columns are included.
    #
    # Missing (nil) cells do not contribute a nominal value.
    def set_string_attributes_to_nominal(column_indices = nil)
      nominals = {}
      # Frustratingly, we have to traverse this 2D array with the
      # wrong dimension first. Oh well.
      @instances.each do |row|
        row.each_with_index do |value, col_index|
          next if value.nil?
          attribute = @attributes[col_index]
          next unless attribute && attribute.type == ATTRIBUTE_STRING
          next unless column_indices.nil? || column_indices.include?(col_index)

          # Hash keys give the distinct values in first-seen order.
          (nominals[col_index] ||= {})[value] = true
        end
      end

      nominals.each do |index, values|
        @attributes[index].type = "{#{values.keys.join(',')}}"
      end
    end

    # Expands the inside of a sparse row ("3 7, 10 34") into a full row.
    # Unlisted columns are 0; listed values are kept as Strings.
    def expand_sparse(str)
      arr = Array.new(@attributes.size, 0)
      str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).each do |pair|
        # Strip first so a leading blank does not become an empty index,
        # and limit to 2 so a value may itself contain whitespace.
        index, value = pair.strip.split(/\s+/, 2)
        next if index.nil?
        arr[index.to_i] = value
      end
      arr
    end


    # Returns the relation as ARFF text, without a trailing newline.
    #
    # sparse:: when true, rows use the sparse "{index value, ...}" form:
    #          zero cells of NUMERIC columns are omitted and missing cells
    #          are written as "index ?", because an omitted sparse cell
    #          means 0 rather than missing.
    #
    # TODO: Doesn't handle cases in which strings already contain
    # quotes or are already quoted.
    def to_arff(sparse=false)
      header = "#{RELATION_MARKER} #{@name}\n" +
               @attributes.map { |attribute| attribute.to_s }.join("\n") +
               "\n" +
               DATA_MARKER + "\n"

      rows = @instances.map do |inst|
        cells = inst.each_with_index.map { |col, i| format_cell(col, @attributes[i], i, sparse) }

        if sparse
          SPARSE_ARFF_BEGIN + cells.compact.join(', ') + SPARSE_ARFF_END
        else
          cells.join(', ')
        end
      end

      header + rows.join("\n")
    end


    def to_s
      to_arff
    end


    private

    # Maps a Ruby value to the ARFF type used for its column.
    def infer_type(col)
      case col
      when Numeric then ATTRIBUTE_NUMERIC
      when String then ATTRIBUTE_STRING
      when true, false then ATTRIBUTE_BOOLEAN
      else
        raise ArgumentError, "Could not parse attribute to ARFF data type: #{col.inspect}"
      end
    end

    # True when +type+ is a plain type String matching +pattern+. Nominal
    # types are Arrays and therefore never match.
    def type_is?(type, pattern)
      type.kind_of?(String) && !(type =~ pattern).nil?
    end

    # Renders one cell. Returns nil for a sparse cell that must be omitted.
    def format_cell(col, attribute, index, sparse)
      if col.nil?
        return sparse ? "#{index} #{MISSING}" : MISSING
      end

      type = attribute && attribute.type

      # Quote strings containing commas or whitespace, and all dates.
      if type_is?(type, /^#{ATTRIBUTE_STRING}$/i)
        col = "'#{col}'" if col.kind_of?(String) && col =~ /[,\s]+/
      elsif type_is?(type, /^#{ATTRIBUTE_DATE}/i) ## Hack comparison. Ugh.
        col = "\"#{col}\""
      end

      return col unless sparse
      return nil if type_is?(type, /^#{ATTRIBUTE_NUMERIC}$/i) && col == 0

      "#{index} #{col}"
    end

  end


end # module Rarff
309
+
310
+ ################################################################################
311
+
312
# Command-line use: `ruby rarff.rb FILE` parses the ARFF file and prints it
# back as ARFF. Without an argument it prints a usage hint and exits.
if $0 == __FILE__

  in_file = ARGV[0]
  unless in_file
    warn "Usage: ruby #{File.basename(__FILE__)} ARFF_FILE"
    exit
  end

  # File.read closes the file once its contents have been read.
  rel = Rarff::Relation.new
  rel.parse(File.read(in_file))

  puts '='*80
  puts '='*80
  puts "ARFF:"
  puts rel

end
335
+
336
+ ################################################################################
337
+
338
+
@@ -0,0 +1,27 @@
1
+ % 1. Title: Iris Plants Database
2
+ %
3
+ % 2. Sources:
4
+ % (a) Creator: R.A. Fisher
5
+ % (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
6
+ % (c) Date: July, 1988
7
+ %
8
+ @RELATION iris
9
+
10
+ @ATTRIBUTE sepallength NUMERIC
11
+ @ATTRIBUTE sepalwidth NUMERIC
12
+ @ATTRIBUTE petallength NUMERIC
13
+ @ATTRIBUTE petalwidth NUMERIC
14
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
15
+
16
+ @DATA
17
+ 5.1,3.5,1.4,0.2,Iris-setosa
18
+ 4.9,3.0,1.4,0.2,Iris-setosa
19
+ 4.7,3.2,1.3,0.2,Iris-setosa
20
+ 4.6,3.1,1.5,0.2,Iris-setosa
21
+ 5.0,3.6,1.4,0.2,Iris-setosa
22
+ 5.4,3.9,1.7,0.4,Iris-setosa
23
+ 4.6,3.4,1.4,0.3,Iris-setosa
24
+ 5.0,3.4,1.5,0.2,Iris-setosa
25
+ 4.4,2.9,1.4,0.2,Iris-setosa
26
+ 4.9,3.1,1.5,0.1,Iris-setosa
27
+
@@ -0,0 +1,340 @@
1
+ # See the README file for more information.
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
3
+ require 'test/unit'
4
+ require 'rarff'
5
+
6
# Unit tests for ARFF generation by Rarff::Relation.
#
# Each test builds a relation named "MyCoolRelation" from an array of
# instances and compares Relation#to_arff with the expected text.
class TestArffLib < Test::Unit::TestCase

  # Type string assigned to the "birthday" column in the date tests.
  BIRTHDAY_TYPE = 'DATE "yyyy-MM-dd HH:mm:ss"'

  # Test creation of an arff file string.
  def test_arff_creation
    rel = build_relation(
      [[1.4, 'foo bar', 5, 'baz', "1900-08-08 12:12:12"],
       [20.9, 'ruby', 46, 'roc,ks', "2005-10-23 12:12:12"],
       [0, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
       [68.1, 'stuff', 728, 'is cool', "1974-02-10 12:12:12"]],
      1 => 'subject', 4 => 'birthday')
    rel.attributes[4].type = BIRTHDAY_TYPE

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE subject STRING',
      '@ATTRIBUTE Attr2 NUMERIC',
      '@ATTRIBUTE Attr3 STRING',
      '@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"',
      '@DATA',
      %q{1.4, 'foo bar', 5, baz, "1900-08-08 12:12:12"},
      %q{20.9, ruby, 46, 'roc,ks', "2005-10-23 12:12:12"},
      %q{0, ruby, 46, rocks, "2001-02-19 12:12:12"},
      %q{68.1, stuff, 728, 'is cool', "1974-02-10 12:12:12"})

    assert_equal(expected, rel.to_arff, "Arff creation test failed.")
  end

  # The three tests below were already commented out in the original file.
  # NOTE(review): presumably they no longer matched the library: they expect
  # brace-wrapped sparse rows and numeric values from parsing - confirm
  # against lib/rarff.rb before re-enabling.
  #
  # # Test creation of a sparse arff file string.
  # def test_sparse_arff_creation
  #   rel = build_relation(
  #     [[1.4, 'foo bar', 0, 'baz', "1900-08-08 12:12:12"],
  #      [20.9, 'ruby', 46, 'rocks', "2005-10-23 12:12:12"],
  #      [0.0, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
  #      [68.1, 'stuff', 0, 'is cool', "1974-02-10 12:12:12"]],
  #     1 => 'subject', 4 => 'birthday')
  #   rel.attributes[4].type = BIRTHDAY_TYPE
  #
  #   expected = arff_text(
  #     '@RELATION MyCoolRelation',
  #     '@ATTRIBUTE Attr0 NUMERIC',
  #     '@ATTRIBUTE subject STRING',
  #     '@ATTRIBUTE Attr2 NUMERIC',
  #     '@ATTRIBUTE Attr3 STRING',
  #     '@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"',
  #     '@DATA',
  #     %q{{0 1.4, 1 'foo bar', 3 baz, 4 "1900-08-08 12:12:12"}},
  #     %q{{0 20.9, 1 ruby, 2 46, 3 rocks, 4 "2005-10-23 12:12:12"}},
  #     %q{{1 ruby, 2 46, 3 rocks, 4 "2001-02-19 12:12:12"}},
  #     %q{{0 68.1, 1 stuff, 3 'is cool', 4 "1974-02-10 12:12:12"}})
  #
  #   assert_equal(expected, rel.to_arff(true), "test_sparse_arff_creation.")
  # end
  #
  # # Test parsing of an arff file.
  # def test_arff_parse
  #   in_file = './test_arff.arff'
  #   rel = Rarff::Relation.new
  #   rel.parse(File.open(File.join(File.dirname(__FILE__),in_file)).read)
  #
  #   assert_equal(rel.instances[2][1], 3.2)
  #   assert_equal(rel.instances[7][4], 'Iris-setosa')
  # end
  #
  # # Test parsing of sparse ARFF format
  # def test_sparse_arff_parse
  #   in_file = './test_sparse_arff.arff'
  #   rel = Rarff::Relation.new
  #   rel.parse(File.open(File.join(File.dirname(__FILE__),in_file)).read)
  #
  #   assert_equal(13, rel.instances[0].size)
  #   assert_equal(0, rel.instances[0][1])
  #   assert_equal(7, rel.instances[0][3])
  #   assert_equal(2.4, rel.instances[1][1])
  #   assert_equal(0, rel.instances[1][2])
  #   assert_equal(19, rel.instances[1][12])
  #   assert_equal(6, rel.instances[2][6])
  #   assert_equal(0, rel.instances[3][12])
  # end

  # nil cells are written as '?'.
  def test_output_missing
    rel = build_relation(
      [[nil, 'foo bar', 5, 'baz', nil],
       [20.9, 'ruby', 46, nil, "2005-10-23 12:12:12"]],
      1 => 'subject', 4 => 'birthday')
    rel.attributes[4].type = BIRTHDAY_TYPE

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE subject STRING',
      '@ATTRIBUTE Attr2 NUMERIC',
      '@ATTRIBUTE Attr3 STRING',
      '@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"',
      '@DATA',
      %q{?, 'foo bar', 5, baz, ?},
      %q{20.9, ruby, 46, ?, "2005-10-23 12:12:12"})

    assert_equal(expected, rel.to_arff, "missing data output failure")
  end

  # Column types come from later rows when the first row is all nil; a
  # column that is nil everywhere (Attr3) falls back to NUMERIC.
  def test_output_missing_undefined_first_row
    rel = build_relation(
      [[nil, nil, nil, nil, nil],
       [20.9, 'ruby', 46, nil, "2005-10-23 12:12:12"]],
      1 => 'subject', 4 => 'birthday')
    rel.attributes[4].type = BIRTHDAY_TYPE

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE subject STRING',
      '@ATTRIBUTE Attr2 NUMERIC',
      '@ATTRIBUTE Attr3 NUMERIC',
      '@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"',
      '@DATA',
      '?, ?, ?, ?, ?',
      %q{20.9, ruby, 46, ?, "2005-10-23 12:12:12"})

    assert_equal(expected, rel.to_arff, "missing data from first line output failure")
  end

  # Booleans are modelled as the nominal {false,true}.
  def test_boolean
    rel = build_relation([[true]])

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 {false,true}',
      '@DATA',
      'true')

    assert_equal(expected, rel.to_arff, "boolean attribute output failure")
  end

  def test_boolean_multipl
    rel = build_relation([[true, false, true], [true, true, true]])

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 {false,true}',
      '@ATTRIBUTE Attr1 {false,true}',
      '@ATTRIBUTE Attr2 {false,true}',
      '@DATA',
      'true, false, true',
      'true, true, true')

    assert_equal(expected, rel.to_arff, "multiple boolean attributes output failure")
  end

  # Nominal values are listed in the order they first appear in the data
  # (Hash keys keep insertion order on Ruby 1.9 and later).
  def test_strings_as_nominal
    rel = build_relation([['one', 'three'], ['two', 'four']])
    rel.set_string_attributes_to_nominal

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 {one,two}',
      '@ATTRIBUTE Attr1 {three,four}',
      '@DATA',
      'one, three',
      'two, four')

    assert_equal(expected, rel.to_arff, "test_strings_as_nominal")
  end

  # Only string columns are converted; the numeric column is untouched.
  def test_set_strings_nominal2
    rel = build_relation([[1, 'three'], [2, 'four']])
    rel.set_string_attributes_to_nominal

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE Attr1 {three,four}',
      '@DATA',
      '1, three',
      '2, four')

    assert_equal(expected, rel.to_arff, "test_set_strings_nominal2")
  end

  # Restricting the conversion to column 0 leaves string column 1 alone.
  def test_strings_nominal_with_arguments1
    rel = build_relation([[1, 'three'], [2, 'four']])
    rel.set_string_attributes_to_nominal([0])

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE Attr1 STRING',
      '@DATA',
      '1, three',
      '2, four')

    assert_equal(expected, rel.to_arff, "test_strings_nominal_with_arguments1")
  end

  def test_strings_nominal_with_arguments2
    rel = build_relation([[1, 'three'], [2, 'four']])
    rel.set_string_attributes_to_nominal([0, 1])

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE Attr1 {three,four}',
      '@DATA',
      '1, three',
      '2, four')

    assert_equal(expected, rel.to_arff, "test_strings_nominal_with_arguments2")
  end

  # A boolean column whose first row is missing is still typed from row 2.
  def test_boolean_2
    rel = build_relation(
      [[nil, nil, nil],
       [20.9, 'ruby', true]],
      1 => 'subject')

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE subject STRING',
      '@ATTRIBUTE Attr2 {false,true}',
      '@DATA',
      '?, ?, ?',
      '20.9, ruby, true')

    assert_equal(expected, rel.to_arff, "boolean with missing first row output failure")
  end

  # NOTE(review): the expected text is kept exactly as in the original test.
  # Its header value (ruby_yeh) and data value (ruby__yeh) disagree with each
  # other - confirm the intended escaping of ", " in nominal values.
  def test_commas_in_attribute_name
    rel = build_relation(
      [['ruby, yeh', 'duh'],
       ['ruby', 'duh']],
      0 => 'subject')
    rel.set_string_attributes_to_nominal

    expected = arff_text(
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE subject {ruby_yeh,ruby}',
      '@ATTRIBUTE Attr1 {duh}',
      '@DATA',
      'ruby__yeh, duh',
      'ruby, duh')

    assert_equal(expected, rel.to_arff, "comma in string attribute failure")
  end

  private

  # Joins the given lines with newlines and no trailing newline, which is
  # the shape of the text the tests compare against.
  def arff_text(*lines)
    lines.join("\n")
  end

  # Builds the relation used by every test: named "MyCoolRelation", filled
  # with +instances+, with attributes renamed per +names+ (index => name).
  def build_relation(instances, names = {})
    rel = Rarff::Relation.new('MyCoolRelation')
    rel.instances = instances
    names.each { |index, name| rel.attributes[index].name = name }
    rel
  end
end
338
+
339
+
340
+
@@ -0,0 +1,24 @@
1
+ % Sample sparse ARFF file
2
+ @RELATION sparseness
3
+
4
+ @ATTRIBUTE attr1 NUMERIC
5
+ @ATTRIBUTE attr2 NUMERIC
6
+ @ATTRIBUTE attr3 NUMERIC
7
+ @ATTRIBUTE attr4 NUMERIC
8
+ @ATTRIBUTE attr5 NUMERIC
9
+ @ATTRIBUTE attr6 NUMERIC
10
+ @ATTRIBUTE attr7 NUMERIC
11
+ @ATTRIBUTE attr8 NUMERIC
12
+ @ATTRIBUTE attr9 NUMERIC
13
+ @ATTRIBUTE attr10 NUMERIC
14
+ @ATTRIBUTE attr11 NUMERIC
15
+ @ATTRIBUTE attr12 NUMERIC
16
+ @ATTRIBUTE attr13 NUMERIC
17
+
18
+ @DATA
19
+ {3 7, 10 34}
20
+ {1 2.4, 4 62, 12 19}
21
+ {0 0, 1 1, 2 2, 3 3, 4 4, 5 5, 6 6, 7 7, 8 8, 9 9, 10 10, 11 11, 12 12}
22
+ {9 42}
23
+ {2 54.3, 3 92, 11 10.2}
24
+
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iterationlabs-rarff
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.5
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ben J Woodcroft
9
+ - Andy Payne
10
+ - Andrew Cantino
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2012-01-23 00:00:00.000000000Z
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: shoulda
18
+ requirement: &70346083110640 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ! '>='
22
+ - !ruby/object:Gem::Version
23
+ version: '0'
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: *70346083110640
27
+ - !ruby/object:Gem::Dependency
28
+ name: jeweler
29
+ requirement: &70346083109860 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: *70346083109860
38
+ - !ruby/object:Gem::Dependency
39
+ name: thoughtbot-shoulda
40
+ requirement: &70346083108740 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ type: :development
47
+ prerelease: false
48
+ version_requirements: *70346083108740
49
+ description: ! "Rarff is a Ruby library for dealing with Attribute-Relation File Format
50
+ (ARFF) files. ARFF files are used to specify \ndata sets for data mining and machine
51
+ learning."
52
+ email: donttrustben near gmail.com
53
+ executables: []
54
+ extensions: []
55
+ extra_rdoc_files:
56
+ - README.txt
57
+ files:
58
+ - .rvmrc
59
+ - Gemfile
60
+ - Gemfile.lock
61
+ - History.txt
62
+ - Manifest.txt
63
+ - README.txt
64
+ - Rakefile
65
+ - VERSION
66
+ - lib/rarff.rb
67
+ - test/test_arff.arff
68
+ - test/test_rarff.rb
69
+ - test/test_sparse_arff.arff
70
+ homepage: http://github.com/iterationlabs/rarff
71
+ licenses: []
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ! '>='
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ required_rubygems_version: !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 1.8.10
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF)
94
+ files
95
+ test_files: []