rarff 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,68 @@
1
+ = rarff
2
+
3
+ Rarff - Ruby ARFF Library
4
+
5
+ Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify data sets for data mining and machine learning.
6
+
7
+
8
+ == License
9
+
10
+ Copyright (c) 2006, Andy Payne
11
+ All rights reserved.
12
+
13
+ Redistribution and use in source and binary forms, with or without
14
+ modification, are permitted provided that the following conditions are met:
15
+
16
+ * Redistributions of source code must retain the above copyright notice,
17
+ this list of conditions and the following disclaimer.
18
+ * Redistributions in binary form must reproduce the above copyright notice,
19
+ this list of conditions and the following disclaimer in the
20
+ documentation and/or other materials provided with the distribution.
21
+ * Neither the name of the COPYRIGHT OWNER nor the names of its contributors
22
+ may be used to endorse or promote products derived from this software
23
+ without specific prior written permission.
24
+
25
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
26
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
27
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
29
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
30
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
32
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
+
36
+
37
+ == Todo
38
+
39
+ * Spaces or quotes in nominal types
40
+ * Commas in quoted attributes or in nominal types
41
+ * Add error checking/validation
42
+ * Creation of sparse ARFF files
43
+ * Missing values - '?'
44
+ * Dates - do some work to create, translate, and interpret date format strings.
45
+
46
+
47
+ == Weka
48
+
49
+ Weka is "a collection of machine learning algorithms for data mining tasks."
50
+ (http://www.cs.waikato.ac.nz/ml/weka/). Weka accompanies the following book:
51
+
52
+ Ian H. Witten and Eibe Frank (2005) "Data Mining: Practical machine learning
53
+ tools and techniques", 2nd Edition, Morgan Kaufmann, San Francisco, 2005.
54
+
55
+
56
+ == ARFF Information
57
+
58
+ ARFF files are similar to CSV files, but are strongly-typed, have a pre-defined
59
+ set of data types, and include a sparse representation.
60
+
61
+ Links to documentation:
62
+
63
+ * http://www.cs.waikato.ac.nz/~ml/weka/arff.html
64
+ * http://weka.sourceforge.net/wekadoc/index.php/en:ARFF_%283.4.6%29
65
+
66
+
67
+
68
+
@@ -0,0 +1,262 @@
1
+ # = rarff
2
+
3
+ # This is the top-level include file for rarff. See the README file for
4
+ # details.
5
+
6
+ ################################################################################
7
+
8
+ # Custom scan that returns a boolean indicating whether the regex matched.
9
+ # TODO: Is there a way to avoid doing this?
10
class String
  # Runs +scan+ with +re+ and reports whether anything matched.
  #
  # Every match is handed to the optional block exactly as +scan+ produces it
  # (a String when +re+ has no groups, an Array of captures otherwise).
  #
  # Returns +true+ when +re+ matched at least once, +false+ otherwise.
  def my_scan(re)
    match_count = 0
    scan(re) do |captured|
      yield captured if block_given?
      match_count += 1
    end
    match_count > 0
  end
end
20
+
21
+ ################################################################################
22
+
23
module Enumerable
  # Like +map+, but the block also receives the zero-based position of each
  # item as its second argument.
  #
  # Returns the Array of block results, in iteration order.
  def map_with_index
    position = 0
    map do |element|
      mapped = yield(element, position)
      position += 1
      mapped
    end
  end
end
36
+
37
+ ################################################################################
38
+
39
module Rarff

  COMMENT_MARKER = '%'
  RELATION_MARKER = '@RELATION'
  ATTRIBUTE_MARKER = '@ATTRIBUTE'
  DATA_MARKER = '@DATA'

  SPARSE_ARFF_BEGIN = '{'
  ESC_SPARSE_ARFF_BEGIN = '\\' + SPARSE_ARFF_BEGIN
  SPARSE_ARFF_END = '}'
  ESC_SPARSE_ARFF_END = '\\' + SPARSE_ARFF_END

  ATTRIBUTE_NUMERIC = 'NUMERIC'
  ATTRIBUTE_REAL = 'REAL'
  ATTRIBUTE_INTEGER = 'INTEGER'
  ATTRIBUTE_STRING = 'STRING'
  ATTRIBUTE_DATE = 'DATE'


  ##############################################################################

  # One @ATTRIBUTE declaration: a name plus a type.
  #
  # +type+ is either a String (e.g. "NUMERIC", 'DATE "yyyy-MM-dd"') or, for a
  # nominal attribute, an Array holding the allowed values.
  class Attribute
    attr_accessor :name
    attr_reader :type

    # name:: attribute name as it appears after @ATTRIBUTE
    # type:: type String, a "{a,b,c}" nominal specification, or an Array of
    #        nominal values
    def initialize(name='', type='')
      @name = name
      self.type = type
    end


    # Replaces the type and re-evaluates whether the attribute is nominal.
    def type=(type)
      @type = type
      check_nominal
    end


    # Convert string representation of nominal type to array, if necessary.
    # The nominal flag is recomputed on every call so that assigning a plain
    # type after a nominal one does not leave the attribute marked nominal.
    # TODO: This might falsely trigger on wacky date formats.
    def check_nominal
      if @type.is_a?(Array)
        @type_is_nominal = true
      elsif @type.is_a?(String) && @type =~ /^\s*\{.*,.*\}\s*$/
        @type_is_nominal = true
        # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
        # Drop the surrounding braces, then split on ','
        @type = @type.gsub(/^\s*\{\s*/, '').gsub(/\s*\}\s*$/, '').split(/\s*\,\s*/)
      else
        @type_is_nominal = false
      end
    end


    # Appends +str+ to the list of nominal values. If the attribute is not yet
    # nominal its current type is discarded and it becomes nominal.
    #
    # Returns the Array of nominal values.
    def add_nominal_value(str)
      unless @type_is_nominal
        @type = []
        @type_is_nominal = true
      end

      @type << str
    end


    # Returns the "@ATTRIBUTE name type" line. Nominal values are written as
    # "{a,b,c}", the form check_nominal recognises when reading it back.
    def to_arff
      spec = @type_is_nominal ? "{#{@type.join(',')}}" : @type.to_s
      "#{ATTRIBUTE_MARKER} #{@name} #{spec}"
    end


    def to_s
      to_arff
    end

  end



  # A relation: a name, its Attribute list and the data rows (instances).
  # Each instance is an Array with one entry per column.
  class Relation
    attr_accessor :name, :attributes
    attr_reader :instances

    # Strings of this shape are converted to Float when rows are stored.
    NUMERIC_FIELD = /\A-?\d+\.?\d*\z/


    def initialize(name='')
      @name = name
      @attributes = Array.new
      @instances = Array.new
    end


    # Reads ARFF text from +str+, filling in name, attributes and instances.
    #
    # Blank lines and lines starting with COMMENT_MARKER are skipped. Data
    # rows are only read after the @DATA marker; both dense ("a, b, c") and
    # sparse ("{index value, ...}") rows are accepted. Attributes declared
    # with @ATTRIBUTE are kept; generic ones are only invented for columns of
    # the first row that have no declaration.
    #
    # TODO: Doesn't handle commas in quoted attributes.
    def parse(str)
      comment_re   = /^#{COMMENT_MARKER}/
      relation_re  = /^#{RELATION_MARKER}\s*(.*)$/i
      attribute_re = /^#{ATTRIBUTE_MARKER}\s*(\S*)\s+(.*)$/i
      data_re      = /^#{DATA_MARKER}/i
      sparse_re    = /^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/

      in_data_section = false

      str.split("\n").each do |raw_line|
        # Stripping up front also removes a trailing "\r" from CRLF input.
        line = raw_line.strip
        next if line.empty? || line =~ comment_re

        if (m = relation_re.match(line))
          @name = m[1]
        elsif (m = attribute_re.match(line))
          @attributes.push(Attribute.new(m[1], m[2]))
        elsif line =~ data_re
          in_data_section = true
        elsif in_data_section
          sparse = sparse_re.match(line)
          add_instance(sparse ? expand_sparse(sparse[1]) : split_fields(line))
        end
      end
    end


    # Replaces all instances and derives a fresh attribute list from them
    # (see create_attributes).
    def instances=(instances)
      @instances = instances
      create_attributes
    end


    # Converts numeric-looking String fields of every instance to Float (in
    # place) and derives attributes from the first instance: non-String
    # values become NUMERIC attributes, everything else STRING, named
    # "Attr<column>".
    #
    # keep_declared:: when true, attributes that already exist for a column
    #                 are left alone; by default they are overwritten.
    def create_attributes(keep_declared = false)
      @instances.each_with_index do |row, i|
        coerce_numeric_fields(row)
        infer_attributes(row, keep_declared) if i == 0
      end
    end


    # Expands the body of a sparse row ("index value, index value, ...",
    # with or without the surrounding braces) into a full-width Array.
    # Columns that are not mentioned are 0; mentioned values are returned as
    # the Strings found in +str+. The width is the current attribute count.
    def expand_sparse(str)
      arr = Array.new(@attributes.size, 0)
      body = str.strip.gsub(/^\{(.*)\}$/, "\\1").strip
      body.split(/\s*\,\s*/).each do |pair|
        next if pair.empty?
        # Limit of 2 keeps padding between index and value from shifting
        # the value into a later slot.
        index, value = pair.split(/\s+/, 2)
        arr[index.to_i] = value
      end
      arr
    end


    # Returns the relation as ARFF text: the @RELATION line, one @ATTRIBUTE
    # line per attribute, @DATA, then one comma-separated line per instance.
    # There is no trailing newline.
    def to_arff
      RELATION_MARKER + " #{@name}\n" +
        @attributes.map { |attr| attr.to_arff }.join("\n") +
        "\n" +
        DATA_MARKER + "\n" +
        @instances.map { |inst| format_instance(inst) }.join("\n")
    end


    def to_s
      to_arff
    end


    private

    # Stores one parsed row, converting numeric fields first. Attributes for
    # undeclared columns are inferred from the first row only.
    def add_instance(row)
      coerce_numeric_fields(row)
      infer_attributes(row, true) if @instances.empty?
      @instances << row
    end


    # Splits a dense data line into fields.
    def split_fields(line)
      line.split(/\s*,\s*/).map do |field|
        # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
        field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
      end
    end


    # Replaces, in place, every String in +row+ that looks like a plain
    # decimal number with its Float value.
    # TODO: Should I have a separate to_i conversion, or is to_f sufficient?
    def coerce_numeric_fields(row)
      row.each_index do |j|
        value = row[j]
        row[j] = value.to_f if value.is_a?(String) && value =~ NUMERIC_FIELD
      end
      row
    end


    # Creates "Attr<column>" attributes from the values of +row+.
    def infer_attributes(row, keep_declared)
      row.each_with_index do |value, j|
        next if keep_declared && @attributes[j]
        type = value.is_a?(String) ? ATTRIBUTE_STRING : ATTRIBUTE_NUMERIC
        @attributes[j] = Attribute.new("Attr#{j}", type)
      end
    end


    # Formats one instance as a data line.
    def format_instance(inst)
      inst.zip(@attributes).map { |col, attr| format_value(col, attr) }.join(', ')
    end


    # Quotes a single value according to its attribute type.
    # TODO: Doesn't handle cases in which strings already contain quotes.
    def format_value(col, attr)
      type = attr ? attr.type : nil
      # Nominal (Array) types and columns without an attribute are written
      # verbatim.
      return col unless type.is_a?(String)

      if type =~ /^#{ATTRIBUTE_STRING}$/i
        # Quote strings with spaces.
        (col.is_a?(String) && col =~ /\s+/) ? "'" + col + "'" : col
      elsif type =~ /^#{ATTRIBUTE_DATE}/i  ## Hack comparison. Ugh.
        # parse keeps double quotes on fields, so do not wrap a second time.
        text = col.to_s
        text =~ /\A".*"\z/ ? text : '"' + text + '"'
      else
        col
      end
    end

  end


end # module Rarff
233
+
234
+ ################################################################################
235
+
236
# Command-line entry point: parse the ARFF file named by the first argument
# and print it back out in ARFF form.
if $0 == __FILE__
  in_file = ARGV[0]

  unless in_file
    # Nothing to do without an input file; say so instead of exiting silently.
    warn "Usage: ruby #{File.basename(__FILE__)} FILE.arff"
    exit
  end

  rel = Rarff::Relation.new
  # File.read closes the file once the contents have been read.
  rel.parse(File.read(in_file))

  puts '=' * 80
  puts '=' * 80
  puts "ARFF:"
  puts rel
end
259
+
260
+ ################################################################################
261
+
262
+
@@ -0,0 +1,27 @@
1
+ % 1. Title: Iris Plants Database
2
+ %
3
+ % 2. Sources:
4
+ % (a) Creator: R.A. Fisher
5
+ % (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
6
+ % (c) Date: July, 1988
7
+ %
8
+ @RELATION iris
9
+
10
+ @ATTRIBUTE sepallength NUMERIC
11
+ @ATTRIBUTE sepalwidth NUMERIC
12
+ @ATTRIBUTE petallength NUMERIC
13
+ @ATTRIBUTE petalwidth NUMERIC
14
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
15
+
16
+ @DATA
17
+ 5.1,3.5,1.4,0.2,Iris-setosa
18
+ 4.9,3.0,1.4,0.2,Iris-setosa
19
+ 4.7,3.2,1.3,0.2,Iris-setosa
20
+ 4.6,3.1,1.5,0.2,Iris-setosa
21
+ 5.0,3.6,1.4,0.2,Iris-setosa
22
+ 5.4,3.9,1.7,0.4,Iris-setosa
23
+ 4.6,3.4,1.4,0.3,Iris-setosa
24
+ 5.0,3.4,1.5,0.2,Iris-setosa
25
+ 4.4,2.9,1.4,0.2,Iris-setosa
26
+ 4.9,3.1,1.5,0.1,Iris-setosa
27
+
@@ -0,0 +1,24 @@
1
+ % Sample sparse ARFF file
2
+ @RELATION sparseness
3
+
4
+ @ATTRIBUTE attr1 NUMERIC
5
+ @ATTRIBUTE attr2 NUMERIC
6
+ @ATTRIBUTE attr3 NUMERIC
7
+ @ATTRIBUTE attr4 NUMERIC
8
+ @ATTRIBUTE attr5 NUMERIC
9
+ @ATTRIBUTE attr6 NUMERIC
10
+ @ATTRIBUTE attr7 NUMERIC
11
+ @ATTRIBUTE attr8 NUMERIC
12
+ @ATTRIBUTE attr9 NUMERIC
13
+ @ATTRIBUTE attr10 NUMERIC
14
+ @ATTRIBUTE attr11 NUMERIC
15
+ @ATTRIBUTE attr12 NUMERIC
16
+ @ATTRIBUTE attr13 NUMERIC
17
+
18
+ @DATA
19
+ {3 7, 10 34}
20
+ {1 2.4, 4 62, 12 19}
21
+ {0 0, 1 1, 2 2, 3 3, 4 4, 5 5, 6 6, 7 7, 8 8, 9 9, 10 10, 11 11, 12 12}
22
+ {9 42}
23
+ {2 54.3, 3 92, 11 10.2}
24
+
@@ -0,0 +1,74 @@
1
+ # See the README file for more information.
2
+
3
+ require 'test/unit'
4
+ require 'rarff'
5
+
6
# Unit tests for Rarff::Relation: building ARFF text from in-memory instances
# and parsing dense and sparse ARFF files.
class TestArffLib < Test::Unit::TestCase

  # Fixtures are looked up next to this test file rather than in the current
  # working directory.
  # NOTE(review): assumes the .arff fixtures sit in the same directory as this
  # file -- confirm against the gem layout.
  FIXTURE_DIR = File.dirname(__FILE__)

  # Test creation of an arff file string.
  def test_arff_creation
    expected = [
      '@RELATION MyCoolRelation',
      '@ATTRIBUTE Attr0 NUMERIC',
      '@ATTRIBUTE subject STRING',
      '@ATTRIBUTE Attr2 NUMERIC',
      '@ATTRIBUTE Attr3 STRING',
      '@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"',
      '@DATA',
      %q{1.4, 'foo bar', 5, baz, "1900-08-08 12:12:12"},
      %q{20.9, ruby, 46, rocks, "2005-10-23 12:12:12"},
      %q{20.9, ruby, 46, rocks, "2001-02-19 12:12:12"},
      %q{68.1, stuff, 728, 'is cool', "1974-02-10 12:12:12"}
    ].join("\n")

    instances = [ [1.4, 'foo bar', 5, 'baz', "1900-08-08 12:12:12"],
                  [20.9, 'ruby', 46, 'rocks', "2005-10-23 12:12:12"],
                  [20.9, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
                  [68.1, 'stuff', 728, 'is cool', "1974-02-10 12:12:12"]]

    rel = Rarff::Relation.new('MyCoolRelation')
    rel.instances = instances
    rel.attributes[1].name = 'subject'
    rel.attributes[4].name = 'birthday'
    rel.attributes[4].type = 'DATE "yyyy-MM-dd HH:mm:ss"'

    # assert_equal takes (expected, actual); keeping that order makes the
    # failure message label the two values correctly.
    assert_equal(expected, rel.to_arff, "Arff creation test failed.")
  end


  # Test parsing of an arff file.
  def test_arff_parse
    rel = parse_fixture('test_arff.arff')

    assert_equal(3.2, rel.instances[2][1])
    assert_equal('Iris-setosa', rel.instances[7][4])
  end


  # Test parsing of sparse ARFF format
  def test_sparse_arff_parse
    rel = parse_fixture('test_sparse_arff.arff')

    assert_equal(13, rel.instances[0].size)
    assert_equal(0, rel.instances[0][1])
    assert_equal(7, rel.instances[0][3])
    assert_equal(2.4, rel.instances[1][1])
    assert_equal(0, rel.instances[1][2])
    assert_equal(19, rel.instances[1][12])
    assert_equal(6, rel.instances[2][6])
    assert_equal(0, rel.instances[3][12])
  end


  private

  # Parses the named fixture into a fresh Relation. File.read closes the file
  # after reading it.
  def parse_fixture(file_name)
    rel = Rarff::Relation.new
    rel.parse(File.read(File.join(FIXTURE_DIR, file_name)))
    rel
  end
end
71
+
72
+
73
+
74
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: rarff
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-09-18 00:00:00 -05:00
8
+ summary: Library for handling Weka ARFF files
9
+ require_paths:
10
+ - lib
11
+ email: apayne@gmail.com
12
+ homepage: TODO
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: rarff
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ signing_key:
28
+ cert_chain:
29
+ authors:
30
+ - Andy Payne
31
+ files:
32
+ - lib/rarff.rb
33
+ - tests/test_arff.arff
34
+ - tests/test_sparse_arff.arff
35
+ - tests/ts_rarff.rb
36
+ - README
37
+ test_files:
38
+ - tests/ts_rarff.rb
39
+ rdoc_options: []
40
+ extra_rdoc_files:
41
+ - README
42
+ executables: []
43
+ extensions: []
44
+ requirements: []
45
+ dependencies: []