iterationlabs-rarff 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 1.9.2@rarff --create
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Development-time dependencies for building and testing the gem.
# The explicit HTTPS URL replaces the deprecated `source :rubygems` shorthand,
# which resolves to an insecure HTTP endpoint.
source 'https://rubygems.org'

gem 'shoulda'
gem 'jeweler'
@@ -0,0 +1,17 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.6.4)
6
+ bundler (~> 1.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rake (0.9.2.2)
10
+ shoulda (2.11.3)
11
+
12
+ PLATFORMS
13
+ ruby
14
+
15
+ DEPENDENCIES
16
+ jeweler
17
+ shoulda
@@ -0,0 +1,18 @@
1
+ == Changes
2
+
3
+ === 0.2.3
4
+
5
+ * Added set_string_attributes_to_nominal for easy conversion of string attributes to nominal ones
6
+
7
+ === 0.2.2 (unofficial)
8
+
9
+ * Handles boolean inputs, which are modelled as nominals
10
+ * Handles spaces in nominals by replacing them with underscores. These should probably be quoted instead, but this is good enough for now
11
+
12
+ === 0.2.1 (unofficial)
13
+
14
+ * Handles missing data in output, encoded internally as nil values
15
+
16
+ === 0.2.0 ?
17
+
18
+ * Sparse ARFF files (thanks to Tom Adams)
@@ -0,0 +1,5 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ lib/rarff.rb
@@ -0,0 +1,90 @@
1
+ = rarff
2
+
3
+ http://adenserparlance.blogspot.com/2007/01/rarff-simple-arff-library-in-ruby.html
4
+
5
+ == DESCRIPTION:
6
+
7
+ Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify data sets for data mining and machine learning.
8
+
9
+
10
+ == FEATURES/PROBLEMS:
11
+
12
+ === FEATURES
13
+ * Missing values - '?' are handled in creation of ARFF files
14
+
15
+ === PROBLEMS
16
+ * Spaces or quotes in nominal types
17
+ * Commas in quoted attributes or in nominal types
18
+ * Add error checking/validation
19
+ * Creation of sparse ARFF files
20
+ * Dates - do some work to create, translate, and interpret date format strings.
21
+
22
+ == SYNOPSIS:
23
+
24
+ arff_file_str = <<-END_OF_ARFF_FILE
25
+ @RELATION MyCoolRelation
26
+ @ATTRIBUTE Attr0 NUMERIC
27
+ @ATTRIBUTE subject STRING
28
+ @ATTRIBUTE Attr2 NUMERIC
29
+ @ATTRIBUTE Attr3 STRING
30
+ @ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"
31
+ @DATA
32
+ 1.4, 'foo bar', 5, baz, "1900-08-08 12:12:12"
33
+ 20.9, ruby, 46, rocks, "2005-10-23 12:12:12"
34
+ 0, ruby, 46, rocks, "2001-02-19 12:12:12"
35
+ 68.1, stuff, 728, 'is cool', "1974-02-10 12:12:12"
36
+ END_OF_ARFF_FILE
37
+
38
+ arff_file_str.gsub!(/\n$/, '')
39
+
40
+ instances = [ [1.4, 'foo bar', 5, 'baz', "1900-08-08 12:12:12"],
41
+ [20.9, 'ruby', 46, 'rocks', "2005-10-23 12:12:12"],
42
+ [0, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
43
+ [68.1, 'stuff', 728, 'is cool', "1974-02-10 12:12:12"]]
44
+
45
+ rel = Rarff::Relation.new('MyCoolRelation')
46
+ rel.instances = instances
47
+ rel.attributes[1].name = 'subject'
48
+ rel.attributes[4].name = 'birthday'
49
+ rel.attributes[4].type = 'DATE "yyyy-MM-dd HH:mm:ss"'
50
+
51
+ # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
52
+ assert_equal(arff_file_str, rel.to_arff, "Arff creation test failed.")
53
+
54
+ == REQUIREMENTS:
55
+
56
+ == INSTALL:
57
+
58
+ * sudo gem install iterationlabs-rarff
59
+
60
+ == LICENSE:
61
+
62
+ Copyright (c) 2008 Andy Payne
63
+ All rights reserved.
64
+
65
+ Redistribution and use in source and binary forms, with or without
66
+ modification, are permitted provided that the following conditions are met:
67
+
68
+ * Redistributions of source code must retain the above copyright notice,
69
+ this list of conditions and the following disclaimer.
70
+ * Redistributions in binary form must reproduce the above copyright notice,
71
+ this list of conditions and the following disclaimer in the
72
+ documentation and/or other materials provided with the distribution.
73
+ * Neither the name of the COPYRIGHT OWNER nor the names of its contributors
74
+ may be used to endorse or promote products derived from this software
75
+ without specific prior written permission.
76
+
77
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
78
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
79
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
80
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
81
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
82
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
83
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
84
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
85
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
86
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
87
+
88
+
89
+
90
+
@@ -0,0 +1,52 @@
1
# Rake tasks for the rarff gem: packaging (via Jeweler, when installed),
# the unit-test suite, and an optional RCov coverage run.
require 'rubygems'
require 'rake'

# Packaging tasks are optional: if Jeweler cannot be loaded we only print a
# hint, so that the test tasks below remain usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "iterationlabs-rarff"
    gem.summary = %Q{Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files}
    gem.description = %Q{Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify
data sets for data mining and machine learning.}
    gem.email = "donttrustben near gmail.com"
    gem.homepage = "http://github.com/iterationlabs/rarff"
    gem.authors = ["Ben J Woodcroft","Andy Payne", "Andrew Cantino"]
    # Same gem name as the Gemfile ('shoulda'); 'thoughtbot-shoulda' was the
    # legacy GitHub-hosted name of that gem.
    gem.add_development_dependency "shoulda", ">= 0"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the
# load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov` measures coverage when RCov is installed; otherwise the task is
# still defined but aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# task :test => :check_dependencies

# require 'rdoc/task'
# Rake::RDocTask.new do |rdoc|
#   version = File.exist?('VERSION') ? File.read('VERSION') : ""
#
#   rdoc.rdoc_dir = 'rdoc'
#   rdoc.title = "blah #{version}"
#   rdoc.rdoc_files.include('README*')
#   rdoc.rdoc_files.include('lib/**/*.rb')
# end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.5
@@ -0,0 +1,338 @@
1
+ # = rarff
2
+
3
+ # This is the top-level include file for rarff. See the README file for
4
+ # details.
5
+
6
+ ################################################################################
7
+
8
# Variant of String#scan that reports whether the pattern matched at all.
# Each match is handed to the optional block exactly as String#scan would
# yield it; the return value is true when there was at least one match and
# false otherwise.
# TODO: Is there a way to avoid doing this?
class String
  def my_scan(re)
    match_count = 0
    scan(re) do |captures|
      yield captures if block_given?
      match_count += 1
    end
    match_count > 0
  end
end
20
+
21
+ ################################################################################
22
+
23
module Enumerable
  # Maps over the collection while also passing the block each element's
  # zero-based position. The block receives (item, index) and its result
  # becomes the corresponding element of the returned array.
  # TODO: Is there a better way?
  def map_with_index
    next_index = 0
    map do |element|
      position = next_index
      next_index += 1
      # The yield stays last so that its value is what map collects.
      yield element, position
    end
  end
end
36
+
37
+ ################################################################################
38
+
39
# Reading and writing of Attribute-Relation File Format (ARFF) data.
#
# Rarff::Relation holds a named table of instances (rows) plus one
# Rarff::Attribute per column, and can be built either from Ruby arrays or by
# parsing ARFF text. This module does not rely on any core-class extensions.
module Rarff

  COMMENT_MARKER = '%'
  RELATION_MARKER = '@RELATION'
  ATTRIBUTE_MARKER = '@ATTRIBUTE'
  DATA_MARKER = '@DATA'

  SPARSE_ARFF_BEGIN = '{'
  ESC_SPARSE_ARFF_BEGIN = '\\' + SPARSE_ARFF_BEGIN
  SPARSE_ARFF_END = '}'
  ESC_SPARSE_ARFF_END = '\\' + SPARSE_ARFF_END

  ATTRIBUTE_NUMERIC = 'NUMERIC'
  ATTRIBUTE_REAL = 'REAL'
  ATTRIBUTE_INTEGER = 'INTEGER'
  ATTRIBUTE_STRING = 'STRING'
  ATTRIBUTE_DATE = 'DATE'
  # Model Boolean as a Nominal Attribute.
  # Use {false, true} not {true, false} because then in visualisations in Weka
  # true is to the right, which makes more intuitive sense
  ATTRIBUTE_BOOLEAN = '{false, true}'

  MISSING = '?'

  ################################################################################

  # One column definition: a name plus an ARFF type.
  #
  # +type+ is a String for ordinary types ('NUMERIC', 'STRING', 'DATE "..."').
  # A nominal specification such as "{a, b}" is converted to an Array of its
  # comma-separated pieces in which the first piece keeps the opening brace and
  # the last piece keeps the closing brace (["{a", "b}"]); this representation
  # is retained for backward compatibility with existing readers of +type+.
  class Attribute
    # A whole type string wrapped in braces is a nominal specification. A
    # single value ("{duh}") counts as well.
    NOMINAL_PATTERN = /\A\s*\{.*\}\s*\z/

    attr_accessor :name
    attr_reader :type

    # name:: attribute (column) name
    # type:: ARFF type string; a "{...}" string makes the attribute nominal
    def initialize(name = '', type = '')
      @name = name
      @type_is_nominal = false
      @type = type
      check_nominal
    end

    # Replaces the type. The nominal flag is recomputed from scratch so that a
    # formerly nominal attribute can become an ordinary one again.
    def type=(type)
      @type = type
      @type_is_nominal = false
      check_nominal
    end

    # Convert string representation of nominal type to array, if necessary.
    # Only Strings are inspected, so an already converted (or otherwise
    # non-String) type is left untouched.
    # TODO: This might falsely trigger on wacky date formats.
    def check_nominal
      return unless @type.is_a?(String) && @type =~ NOMINAL_PATTERN

      @type_is_nominal = true
      # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
      # Drop the padding around the braces first, otherwise it would be turned
      # into underscores when the attribute is written out.
      normalized = @type.strip.sub(/\A\{\s*/, '{').sub(/\s*\}\z/, '}')
      @type = normalized.split(/\s*,\s*/)
    end

    # Appends +str+ to the list of nominal values. A non-nominal attribute
    # becomes a nominal one whose only value is +str+. Returns the updated
    # type array.
    def add_nominal_value(str)
      values = @type_is_nominal ? nominal_values : []
      values << str
      @type = brace_wrap(values)
      @type_is_nominal = true
      @type
    end

    # Renders the "@ATTRIBUTE name type" header line. Spaces inside nominal
    # values are replaced by underscores.
    def to_arff
      rendered_type = @type_is_nominal ? @type.join(',').gsub(' ', '_') : @type
      "#{ATTRIBUTE_MARKER} #{@name} #{rendered_type}"
    end

    def to_s
      to_arff
    end

    private

    # The nominal values without the surrounding braces.
    def nominal_values
      @type.join(',').sub(/\A\s*\{\s*/, '').sub(/\s*\}\s*\z/, '').split(/\s*,\s*/)
    end

    # Rebuilds the stored representation: the opening brace is glued to the
    # first value and the closing brace to the last one.
    def brace_wrap(values)
      wrapped = values.map { |value| value.to_s }
      wrapped[0] = SPARSE_ARFF_BEGIN + wrapped[0]
      wrapped[-1] = wrapped[-1] + SPARSE_ARFF_END
      wrapped
    end
  end

  ################################################################################

  # A named data set: an ordered list of Attribute objects and an array of
  # instances, where each instance is an array with one entry per attribute
  # and nil stands for a missing value.
  class Relation
    # Text that looks like an integer or decimal number.
    NUMERIC_PATTERN = /\A-?\d+\.?\d*\z/

    attr_accessor :name, :attributes
    attr_reader :instances

    def initialize(name = '')
      @name = name
      @attributes = []
      @instances = []
    end

    # Parses ARFF text, appending to this relation's attributes and instances.
    # Blank lines and '%' comment lines are ignored. Field values are kept as
    # Strings. Attributes declared in the header are preserved; columns without
    # a declaration get an inferred "AttrN" attribute. Returns self.
    #
    # TODO: Doesn't handle commas in quoted attributes.
    def parse(str)
      in_data_section = false
      parsed_rows = false

      str.split("\n").each do |raw_line|
        line = raw_line.strip
        next if line.empty?
        next if line[0, 1] == COMMENT_MARKER

        if line =~ /\A#{RELATION_MARKER}\s*(.*)\z/i
          @name = $1.strip
          next
        end
        if line =~ /\A#{ATTRIBUTE_MARKER}\s*(\S*)\s+(.*)\z/i
          @attributes.push(Attribute.new($1, $2.strip))
          next
        end
        if line =~ /\A#{DATA_MARKER}/i
          in_data_section = true
          next
        end
        next unless in_data_section ## Below is data section handling

        if line =~ /\A#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}\z/
          # Sparse ARFF
          @instances << expand_sparse($1)
        else
          @instances << line.split(/,\s*/).map do |field|
            # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
            field.gsub(/\A\s*'(.*)'\s*\z/, '\1')
          end
        end
        parsed_rows = true
      end

      # Infer missing attributes once, after all rows are known, instead of
      # re-scanning every instance for each parsed row.
      create_attributes(true) if parsed_rows
      self
    end

    # Assign instances to the internal array
    # parse: choose to parse strings into numerics
    def instances=(instances, parse = false)
      @instances = instances
      create_attributes(parse)
    end

    # Derives attribute definitions from the instances. A column's type comes
    # from its first non-nil value; columns that are nil in every row default
    # to NUMERIC.
    #
    # attr_parse:: false - values are Ruby objects (Numeric, String, true or
    #              false) and every column's attribute is (re)defined.
    #              true  - values are text; a column is NUMERIC when its text
    #              looks like a number and STRING otherwise, and attributes
    #              that already exist (e.g. from a parsed header) are kept.
    #
    # Raises ArgumentError when there is no data to inspect or a value has no
    # ARFF equivalent.
    def create_attributes(attr_parse = false)
      if @instances.nil? || @instances.empty? || @instances[0].empty?
        raise ArgumentError, "Not enough data to create ARFF attributes"
      end

      # Keep track of whether an attribute has been defined or not.
      # The only reason an attribute would not be defined in the first
      # row is if it has nil's in it.
      attributes_defined = {}
      @instances.each do |row|
        row.each_with_index do |col, j|
          next if attributes_defined[j] || col.nil?

          attributes_defined[j] = true
          if attr_parse
            inferred = col.to_s =~ NUMERIC_PATTERN ? ATTRIBUTE_NUMERIC : ATTRIBUTE_STRING
            @attributes[j] ||= Attribute.new("Attr#{j}", inferred)
          else
            # No parsing - just take it how it is
            @attributes[j] = Attribute.new("Attr#{j}", infer_type(col))
          end
        end
      end

      # Make sure all attributes have a definition, because otherwise
      # needless errors are thrown
      @instances[0].each_index do |i|
        @attributes[i] ||= Attribute.new("Attr#{i}", ATTRIBUTE_NUMERIC)
      end
    end

    # Make all String type attributes into nominal attributes, because
    # they are more useful in WEKA because more techniques handle them than
    # strings.
    #
    # column_indices is an optional argument specifying the columns that
    # are to be set to nominal (0 based indexes). If nil (the default), then
    # all columns are included. Missing (nil) values are not turned into
    # categories.
    def set_string_attributes_to_nominal(column_indices = nil)
      nominals = {}
      # Frustratingly, we have to traverse this 2D array with the
      # wrong dimension first. Oh well.
      @instances.each do |row|
        row.each_with_index do |value, col_index|
          attribute = @attributes[col_index]
          next if attribute.nil? || attribute.type != ATTRIBUTE_STRING
          next unless column_indices.nil? || column_indices.include?(col_index)
          next if value.nil?

          # A hash keeps the values unique while preserving first-seen order.
          (nominals[col_index] ||= {})[value] = true
        end
      end

      nominals.each do |index, values|
        @attributes[index].type = "{#{values.keys.join(',')}}"
      end
    end

    # Expands the body of a sparse ARFF row ("3 7, 10 34", with or without the
    # surrounding braces) into a full row with one slot per known attribute.
    # Unlisted slots are 0; listed values are kept as Strings.
    def expand_sparse(str)
      arr = Array.new(@attributes.size, 0)
      str.gsub(/\A\s*\{(.*)\}\s*\z/, '\1').split(/\s*,\s*/).each do |pair|
        # Split on the first run of whitespace only, so that a value which
        # itself contains spaces is not truncated.
        index, value = pair.strip.split(/\s+/, 2)
        next if index.nil?

        arr[index.to_i] = value
      end
      arr
    end

    # Renders the relation as ARFF text (no trailing newline).
    #
    # sparse:: when true, each row is written in sparse format
    #          ("{index value, ...}"), leaving out missing values and zeros in
    #          NUMERIC columns; otherwise every value is written and missing
    #          values appear as '?'.
    def to_arff(sparse = false)
      header = "#{RELATION_MARKER} #{@name}\n" + @attributes.join("\n") + "\n" + DATA_MARKER + "\n"

      rows = @instances.map do |inst|
        cells = inst.each_with_index.map { |col, i| format_cell(col, @attributes[i], i, sparse) }
        if sparse
          SPARSE_ARFF_BEGIN + cells.compact.join(', ') + SPARSE_ARFF_END
        else
          cells.join(', ')
        end
      end

      header + rows.join("\n")
    end

    def to_s
      to_arff
    end

    private

    # Maps a Ruby value onto the ARFF type used for its column.
    def infer_type(col)
      case col
      when Numeric then ATTRIBUTE_NUMERIC
      when String then ATTRIBUTE_STRING
      when true, false then ATTRIBUTE_BOOLEAN
      else
        raise ArgumentError, "Could not parse attribute to ARFF data type: #{col.inspect}"
      end
    end

    # Formats one value for output. Returns nil when a sparse row should leave
    # the value out.
    # TODO: Doesn't handle cases in which strings already contain
    # quotes or are already quoted.
    def format_cell(col, attribute, index, sparse)
      # Nominal attributes carry an Array type; only String types can be
      # matched against the STRING / DATE / NUMERIC patterns.
      type = attribute.nil? ? nil : attribute.type
      type = nil unless type.is_a?(String)

      if !col.nil? && type
        if type =~ /\A#{ATTRIBUTE_STRING}\z/i
          # Quote strings containing commas or whitespace
          col = "'#{col}'" if col.to_s =~ /[,\s]/
        elsif type =~ /\A#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
          col = "\"#{col}\""
        end
      end

      if sparse
        numeric_zero = type && type =~ /\A#{ATTRIBUTE_NUMERIC}\z/i && col == 0
        (col.nil? || numeric_zero) ? nil : "#{index} #{col}"
      else
        col.nil? ? MISSING : col
      end
    end
  end

end # module Rarff
309
+
310
+ ################################################################################
311
+
312
# Command-line entry point: when this file is run directly with the path of an
# ARFF file as its first argument, parse that file and print it back out as
# ARFF. Without an argument the script exits silently.
if $0 == __FILE__
  in_file = ARGV[0]
  exit unless in_file

  rel = Rarff::Relation.new
  # File.read closes the file itself, unlike File.open(...).read.
  rel.parse(File.read(in_file))

  puts '='*80
  puts '='*80
  puts "ARFF:"
  puts rel
end
335
+
336
+ ################################################################################
337
+
338
+
@@ -0,0 +1,27 @@
1
+ % 1. Title: Iris Plants Database
2
+ %
3
+ % 2. Sources:
4
+ % (a) Creator: R.A. Fisher
5
+ % (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
6
+ % (c) Date: July, 1988
7
+ %
8
+ @RELATION iris
9
+
10
+ @ATTRIBUTE sepallength NUMERIC
11
+ @ATTRIBUTE sepalwidth NUMERIC
12
+ @ATTRIBUTE petallength NUMERIC
13
+ @ATTRIBUTE petalwidth NUMERIC
14
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
15
+
16
+ @DATA
17
+ 5.1,3.5,1.4,0.2,Iris-setosa
18
+ 4.9,3.0,1.4,0.2,Iris-setosa
19
+ 4.7,3.2,1.3,0.2,Iris-setosa
20
+ 4.6,3.1,1.5,0.2,Iris-setosa
21
+ 5.0,3.6,1.4,0.2,Iris-setosa
22
+ 5.4,3.9,1.7,0.4,Iris-setosa
23
+ 4.6,3.4,1.4,0.3,Iris-setosa
24
+ 5.0,3.4,1.5,0.2,Iris-setosa
25
+ 4.4,2.9,1.4,0.2,Iris-setosa
26
+ 4.9,3.1,1.5,0.1,Iris-setosa
27
+
@@ -0,0 +1,340 @@
1
+ # See the README file for more information.
2
+ $:.unshift File.join(File.dirname(__FILE__),'..','lib')
3
+ require 'test/unit'
4
+ require 'rarff'
5
+
6
# Unit tests for ARFF generation by Rarff::Relation. Each test builds a
# relation from Ruby arrays and compares the generated ARFF text with the
# expected document.
class TestArffLib < Test::Unit::TestCase

  # Test creation of an arff file string.
  def test_arff_creation
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE subject STRING
@ATTRIBUTE Attr2 NUMERIC
@ATTRIBUTE Attr3 STRING
@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"
@DATA
1.4, 'foo bar', 5, baz, "1900-08-08 12:12:12"
20.9, ruby, 46, 'roc,ks', "2005-10-23 12:12:12"
0, ruby, 46, rocks, "2001-02-19 12:12:12"
68.1, stuff, 728, 'is cool', "1974-02-10 12:12:12"
    END_OF_ARFF_FILE

    rel = relation_with([[1.4, 'foo bar', 5, 'baz', "1900-08-08 12:12:12"],
                         [20.9, 'ruby', 46, 'roc,ks', "2005-10-23 12:12:12"],
                         [0, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
                         [68.1, 'stuff', 728, 'is cool', "1974-02-10 12:12:12"]])
    name_subject_and_birthday(rel)

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "Arff creation test failed.")
  end

  # The tests below are disabled.
  #
  #  # Test creation of a sparse arff file string.
  #  def test_sparse_arff_creation
  #
  #    arff_file_str = <<-END_OF_ARFF_FILE
  #@RELATION MyCoolRelation
  #@ATTRIBUTE Attr0 NUMERIC
  #@ATTRIBUTE subject STRING
  #@ATTRIBUTE Attr2 NUMERIC
  #@ATTRIBUTE Attr3 STRING
  #@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"
  #@DATA
  #{0 1.4, 1 'foo bar', 3 baz, 4 "1900-08-08 12:12:12"}
  #{0 20.9, 1 ruby, 2 46, 3 rocks, 4 "2005-10-23 12:12:12"}
  #{1 ruby, 2 46, 3 rocks, 4 "2001-02-19 12:12:12"}
  #{0 68.1, 1 stuff, 3 'is cool', 4 "1974-02-10 12:12:12"}
  #    END_OF_ARFF_FILE
  #
  #    arff_file_str.gsub!(/\n$/, '')
  #
  #    instances = [ [1.4, 'foo bar', 0, 'baz', "1900-08-08 12:12:12"],
  #      [20.9, 'ruby', 46, 'rocks', "2005-10-23 12:12:12"],
  #      [0.0, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
  #      [68.1, 'stuff', 0, 'is cool', "1974-02-10 12:12:12"]]
  #
  #    rel = Rarff::Relation.new('MyCoolRelation')
  #    rel.instances = instances
  #    rel.attributes[1].name = 'subject'
  #    rel.attributes[4].name = 'birthday'
  #    rel.attributes[4].type = 'DATE "yyyy-MM-dd HH:mm:ss"'
  #
  #    # puts "rel.to_arff(true):\n(\n#{rel.to_arff(true)}\n)\n"
  #    assert_equal( arff_file_str, rel.to_arff(true), "test_sparse_arff_creation.")
  #  end
  #
  #
  #  # Test parsing of an arff file.
  #  def test_arff_parse
  #    in_file = './test_arff.arff'
  #    rel = Rarff::Relation.new
  #    rel.parse(File.open(File.join(File.dirname(__FILE__),in_file)).read)
  #
  #    assert_equal(rel.instances[2][1], 3.2)
  #    assert_equal(rel.instances[7][4], 'Iris-setosa')
  #  end
  #
  #
  #  # Test parsing of sparse ARFF format
  #  def test_sparse_arff_parse
  #    in_file = './test_sparse_arff.arff'
  #    rel = Rarff::Relation.new
  #    rel.parse(File.open(File.join(File.dirname(__FILE__),in_file)).read)
  #
  #    assert_equal(13, rel.instances[0].size)
  #    assert_equal(0, rel.instances[0][1])
  #    assert_equal(7, rel.instances[0][3])
  #    assert_equal(2.4, rel.instances[1][1])
  #    assert_equal(0, rel.instances[1][2])
  #    assert_equal(19, rel.instances[1][12])
  #    assert_equal(6, rel.instances[2][6])
  #    assert_equal(0, rel.instances[3][12])
  #    # puts "\n\nARFF: (\n#{rel.to_arff}\n)"
  #  end

  # nil values are written as '?'.
  def test_output_missing
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE subject STRING
@ATTRIBUTE Attr2 NUMERIC
@ATTRIBUTE Attr3 STRING
@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"
@DATA
?, 'foo bar', 5, baz, ?
20.9, ruby, 46, ?, "2005-10-23 12:12:12"
    END_OF_ARFF_FILE

    rel = relation_with([[nil, 'foo bar', 5, 'baz', nil],
                         [20.9, 'ruby', 46, nil, "2005-10-23 12:12:12"]])
    name_subject_and_birthday(rel)

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "missing data output failure")
  end

  # A column that is nil in the first row takes its type from a later row; a
  # column that is nil everywhere (Attr3) falls back to NUMERIC.
  def test_output_missing_undefined_first_row
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE subject STRING
@ATTRIBUTE Attr2 NUMERIC
@ATTRIBUTE Attr3 NUMERIC
@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"
@DATA
?, ?, ?, ?, ?
20.9, ruby, 46, ?, "2005-10-23 12:12:12"
    END_OF_ARFF_FILE

    rel = relation_with([[nil, nil, nil, nil, nil],
                         [20.9, 'ruby', 46, nil, "2005-10-23 12:12:12"]])
    name_subject_and_birthday(rel)

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "missing data from first line output failure")
  end

  # Booleans are modelled as the nominal type {false,true}.
  def test_boolean
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 {false,true}
@DATA
true
    END_OF_ARFF_FILE

    rel = relation_with([[true]])

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "boolean attribute output failure")
  end

  def test_boolean_multiple
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 {false,true}
@ATTRIBUTE Attr1 {false,true}
@ATTRIBUTE Attr2 {false,true}
@DATA
true, false, true
true, true, true
    END_OF_ARFF_FILE

    rel = relation_with([[true, false, true], [true, true, true]])

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "multiple boolean attributes output failure")
  end

  # Nominal values are listed in first-seen order (hashes are insertion
  # ordered on Ruby 1.9+, the version this project pins in .rvmrc).
  def test_strings_as_nominal
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 {one,two}
@ATTRIBUTE Attr1 {three,four}
@DATA
one, three
two, four
    END_OF_ARFF_FILE

    rel = relation_with([['one', 'three'], ['two', 'four']])
    rel.set_string_attributes_to_nominal

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "test_strings_as_nominal")
  end

  # Only STRING columns are converted; the numeric column stays NUMERIC.
  def test_set_strings_nominal2
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE Attr1 {three,four}
@DATA
1, three
2, four
    END_OF_ARFF_FILE

    rel = relation_with([[1, 'three'], [2, 'four']])
    rel.set_string_attributes_to_nominal

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "test_set_strings_nominal2")
  end

  # Restricting the conversion to column 0 (numeric) leaves column 1 a STRING.
  def test_strings_nominal_with_arguments1
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE Attr1 STRING
@DATA
1, three
2, four
    END_OF_ARFF_FILE

    rel = relation_with([[1, 'three'], [2, 'four']])
    rel.set_string_attributes_to_nominal([0])

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "test_strings_nominal_with_arguments1")
  end

  def test_strings_nominal_with_arguments2
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE Attr1 {three,four}
@DATA
1, three
2, four
    END_OF_ARFF_FILE

    rel = relation_with([[1, 'three'], [2, 'four']])
    rel.set_string_attributes_to_nominal([0, 1])

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "test_strings_nominal_with_arguments2")
  end

  # A boolean column whose first row is missing is still typed as boolean.
  def test_boolean_2
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE subject STRING
@ATTRIBUTE Attr2 {false,true}
@DATA
?, ?, ?
20.9, ruby, true
    END_OF_ARFF_FILE

    rel = relation_with([[nil, nil, nil],
                         [20.9, 'ruby', true]])
    rel.attributes[1].name = 'subject'

    assert_equal(arff_file_str, rel.to_arff, "boolean with missing first row output failure")
  end

  # NOTE(review): the header below expects 'ruby_yeh' while the data row
  # expects 'ruby__yeh', and the README lists commas in nominal types as a
  # known problem - this expectation looks aspirational; confirm the intended
  # output before relying on this test.
  def test_commas_in_attribute_name
    arff_file_str = <<-END_OF_ARFF_FILE.chomp
@RELATION MyCoolRelation
@ATTRIBUTE subject {ruby_yeh,ruby}
@ATTRIBUTE Attr1 {duh}
@DATA
ruby__yeh, duh
ruby, duh
    END_OF_ARFF_FILE

    rel = relation_with([['ruby, yeh', 'duh'],
                         ['ruby', 'duh']])
    rel.attributes[0].name = 'subject'
    rel.set_string_attributes_to_nominal

    assert_equal(arff_file_str, rel.to_arff, "comma in string attribute failure")
  end

  private

  # Builds the relation every test starts from.
  def relation_with(instances)
    rel = Rarff::Relation.new('MyCoolRelation')
    rel.instances = instances
    rel
  end

  # Applies the column names and date format shared by the five-column tests.
  def name_subject_and_birthday(rel)
    rel.attributes[1].name = 'subject'
    rel.attributes[4].name = 'birthday'
    rel.attributes[4].type = 'DATE "yyyy-MM-dd HH:mm:ss"'
  end
end
338
+
339
+
340
+
@@ -0,0 +1,24 @@
1
+ % Sample sparse ARFF file
2
+ @RELATION sparseness
3
+
4
+ @ATTRIBUTE attr1 NUMERIC
5
+ @ATTRIBUTE attr2 NUMERIC
6
+ @ATTRIBUTE attr3 NUMERIC
7
+ @ATTRIBUTE attr4 NUMERIC
8
+ @ATTRIBUTE attr5 NUMERIC
9
+ @ATTRIBUTE attr6 NUMERIC
10
+ @ATTRIBUTE attr7 NUMERIC
11
+ @ATTRIBUTE attr8 NUMERIC
12
+ @ATTRIBUTE attr9 NUMERIC
13
+ @ATTRIBUTE attr10 NUMERIC
14
+ @ATTRIBUTE attr11 NUMERIC
15
+ @ATTRIBUTE attr12 NUMERIC
16
+ @ATTRIBUTE attr13 NUMERIC
17
+
18
+ @DATA
19
+ {3 7, 10 34}
20
+ {1 2.4, 4 62, 12 19}
21
+ {0 0, 1 1, 2 2, 3 3, 4 4, 5 5, 6 6, 7 7, 8 8, 9 9, 10 10, 11 11, 12 12}
22
+ {9 42}
23
+ {2 54.3, 3 92, 11 10.2}
24
+
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iterationlabs-rarff
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.5
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ben J Woodcroft
9
+ - Andy Payne
10
+ - Andrew Cantino
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2012-01-23 00:00:00.000000000Z
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: shoulda
18
+ requirement: &70346083110640 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ! '>='
22
+ - !ruby/object:Gem::Version
23
+ version: '0'
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: *70346083110640
27
+ - !ruby/object:Gem::Dependency
28
+ name: jeweler
29
+ requirement: &70346083109860 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: *70346083109860
38
+ - !ruby/object:Gem::Dependency
39
+ name: thoughtbot-shoulda
40
+ requirement: &70346083108740 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ type: :development
47
+ prerelease: false
48
+ version_requirements: *70346083108740
49
+ description: ! "Rarff is a Ruby library for dealing with Attribute-Relation File Format
50
+ (ARFF) files. ARFF files are used to specify \ndata sets for data mining and machine
51
+ learning."
52
+ email: donttrustben near gmail.com
53
+ executables: []
54
+ extensions: []
55
+ extra_rdoc_files:
56
+ - README.txt
57
+ files:
58
+ - .rvmrc
59
+ - Gemfile
60
+ - Gemfile.lock
61
+ - History.txt
62
+ - Manifest.txt
63
+ - README.txt
64
+ - Rakefile
65
+ - VERSION
66
+ - lib/rarff.rb
67
+ - test/test_arff.arff
68
+ - test/test_rarff.rb
69
+ - test/test_sparse_arff.arff
70
+ homepage: http://github.com/iterationlabs/rarff
71
+ licenses: []
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ! '>='
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ required_rubygems_version: !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 1.8.10
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF)
94
+ files
95
+ test_files: []