rarff 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,68 @@
1
+ = rarff
2
+
3
+ Rarff - Ruby ARFF Library
4
+
5
+ Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify data sets for data mining and machine learning.
6
+
7
+
8
+ == License
9
+
10
+ Copyright (c) 2006, Andy Payne
11
+ All rights reserved.
12
+
13
+ Redistribution and use in source and binary forms, with or without
14
+ modification, are permitted provided that the following conditions are met:
15
+
16
+ * Redistributions of source code must retain the above copyright notice,
17
+ this list of conditions and the following disclaimer.
18
+ * Redistributions in binary form must reproduce the above copyright notice,
19
+ this list of conditions and the following disclaimer in the
20
+ documentation and/or other materials provided with the distribution.
21
+ * Neither the name of the COPYRIGHT OWNER nor the names of its contributors
22
+ may be used to endorse or promote products derived from this software
23
+ without specific prior written permission.
24
+
25
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
26
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
27
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
29
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
30
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
32
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
+
36
+
37
+ == Todo
38
+
39
+ * Handle spaces or quotes in nominal types
40
+ * Handle commas in quoted attributes or in nominal types
41
+ * Add error checking/validation
42
+ * Support creation of sparse ARFF files
43
+ * Support missing values ('?')
44
+ * Dates - create, translate, and interpret date format strings.
45
+
46
+
47
+ == Weka
48
+
49
+ Weka is "a collection of machine learning algorithms for data mining tasks."
50
+ (http://www.cs.waikato.ac.nz/ml/weka/) Weka accompanies the following book:
51
+
52
+ Ian H. Witten and Eibe Frank (2005) "Data Mining: Practical machine learning
53
+ tools and techniques", 2nd Edition, Morgan Kaufmann, San Francisco, 2005.
54
+
55
+
56
+ == ARFF Information
57
+
58
+ ARFF files are similar to CSV files, but they are strongly typed, have a
59
+ predefined set of data types, and support a sparse representation.
60
+
61
+ Links to documentation:
62
+
63
+ * http://www.cs.waikato.ac.nz/~ml/weka/arff.html
64
+ * http://weka.sourceforge.net/wekadoc/index.php/en:ARFF_%283.4.6%29
65
+
66
+
67
+
68
+
@@ -0,0 +1,262 @@
1
+ # = rarff
2
+
3
+ # This is the top-level include file for rarff. See the README file for
4
+ # details.
5
+
6
+ ################################################################################
7
+
8
# Variant of String#scan that reports whether the pattern matched at all.
#
# Each match is passed to the optional block exactly as String#scan would
# yield it. Returns true when there was at least one match, false otherwise.
# TODO: Is there a way to avoid doing this?
class String
  def my_scan(re)
    matched = false
    scan(re) do |captures|
      matched = true
      yield captures if block_given?
    end
    matched
  end
end
20
+
21
+ ################################################################################
22
+
23
module Enumerable
  # Like #map, but the block also receives the zero-based position of each
  # item. Returns the array of block results.
  # TODO: Is there a better way?
  def map_with_index
    counter = 0
    map do |element|
      position = counter
      counter += 1
      # The yield stays last so that its value becomes the mapped element.
      yield element, position
    end
  end
end
36
+
37
+ ################################################################################
38
+
39
module Rarff

  COMMENT_MARKER = '%'
  RELATION_MARKER = '@RELATION'
  ATTRIBUTE_MARKER = '@ATTRIBUTE'
  DATA_MARKER = '@DATA'

  SPARSE_ARFF_BEGIN = '{'
  ESC_SPARSE_ARFF_BEGIN = '\\' + SPARSE_ARFF_BEGIN
  SPARSE_ARFF_END = '}'
  ESC_SPARSE_ARFF_END = '\\' + SPARSE_ARFF_END

  ATTRIBUTE_NUMERIC = 'NUMERIC'
  ATTRIBUTE_REAL = 'REAL'
  ATTRIBUTE_INTEGER = 'INTEGER'
  ATTRIBUTE_STRING = 'STRING'
  ATTRIBUTE_DATE = 'DATE'


  ################################################################################

  # A single ARFF attribute (column) declaration.
  #
  # +type+ is either a String such as "NUMERIC" or 'DATE "yyyy-MM-dd"', or,
  # for nominal attributes, an Array holding the permitted values.
  class Attribute
    attr_accessor :name
    attr_reader :type

    # name:: attribute name
    # type:: type string, a "{a,b,c}" nominal specification, or an Array of
    #        nominal values
    def initialize(name='', type='')
      @name = name

      @type_is_nominal = false
      @type = type

      check_nominal
    end


    # Replaces the type and re-evaluates whether it is nominal.
    def type=(type)
      @type = type
      check_nominal
    end


    # Converts a "{nom1,nom2, nom3 }" string into an Array of values and
    # keeps the nominal flag in sync with the current type. The flag is
    # cleared again when a nominal attribute is given a plain type.
    def check_nominal
      if @type.is_a?(Array)
        @type_is_nominal = true
      elsif @type.is_a?(String) && @type =~ /^\s*\{.*\}\s*$/
        # Anything wrapped in braces is a nominal specification, including
        # a single-value set such as "{yes}". Date types start with DATE, so
        # they cannot match here.
        @type_is_nominal = true
        # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
        @type = @type.gsub(/^\s*\{\s*/, '').gsub(/\s*\}\s*$/, '').split(/\s*\,\s*/)
      else
        @type_is_nominal = false
      end
    end


    # Appends a value to the nominal value list. A non-nominal attribute is
    # turned into a nominal one first. Returns the value list.
    def add_nominal_value(str)
      unless @type_is_nominal
        @type = []
        # Without this the list would be reset on every call.
        @type_is_nominal = true
      end

      @type << str
    end


    # Returns the "@ATTRIBUTE name type" declaration line. Nominal value
    # lists are written inside braces, as ARFF requires.
    def to_arff
      if @type_is_nominal
        ATTRIBUTE_MARKER + " #{@name} {#{@type.join(',')}}"
      else
        ATTRIBUTE_MARKER + " #{@name} #{@type}"
      end
    end


    def to_s
      to_arff
    end

  end



  # An ARFF relation: a name, its attribute declarations and its data rows.
  class Relation
    attr_accessor :name, :attributes
    attr_reader :instances

    # Strings of this shape are converted to Float when rows are stored.
    NUMERIC_FIELD = /\A\-?\d+\.?\d*\z/


    def initialize(name='')
      @name = name
      @attributes = []
      @instances = []
    end


    # Parses ARFF text into this relation.
    #
    # Declared @ATTRIBUTE lines are kept as parsed. Attributes are only
    # inferred from the first data row for columns that have no declaration.
    # Numeric-looking fields become Floats, and outer single or double
    # quotes around a field are removed.
    #
    # TODO: Doesn't handle commas in quoted attributes.
    def parse(str)
      in_data_section = false

      str.split("\n").each { |raw_line|
        line = raw_line.strip
        next if line.empty?
        next if line =~ /^#{COMMENT_MARKER}/

        if line =~ /^#{RELATION_MARKER}\s*(.*)$/i
          @name = $1
        elsif line =~ /^#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)$/i
          @attributes.push(Attribute.new($1, $2))
        elsif line =~ /^#{DATA_MARKER}/i
          in_data_section = true
        elsif in_data_section
          if line =~ /^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/
            # Sparse ARFF row
            add_parsed_instance(expand_sparse($1))
          else
            add_parsed_instance(line.split(/,\s*/).map { |field| unquote(field) })
          end
        end
      }
    end


    # Replaces the data rows and (re)creates the attributes from the first
    # row.
    def instances=(instances)
      @instances = instances
      create_attributes
    end


    # Converts numeric-looking String fields of every row to Float and
    # derives attributes ("Attr0", "Attr1", ...) from the first row: NUMERIC
    # for non-String values, STRING otherwise.
    #
    # overwrite:: when true (the default) existing attributes in the
    #             affected columns are replaced; when false only missing
    #             columns are filled in.
    def create_attributes(overwrite = true)
      @instances.each_with_index { |inst, i|
        coerce_numeric_fields(inst)
        infer_attributes(inst, overwrite) if i == 0
      }
    end


    # Expands the body of a sparse row ("3 7, 10 34", with or without the
    # surrounding braces) into a full-width Array. The width is the current
    # number of attributes; positions that are not listed hold 0.
    def expand_sparse(str)
      arr = Array.new(@attributes.size, 0)
      str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).each { |pr|
        # Limit 2 keeps a quoted value containing spaces in one piece.
        index, value = pr.strip.split(/\s+/, 2)
        next if index.nil?
        arr[index.to_i] = value && unquote(value)
      }
      arr
    end


    # Returns the relation as ARFF text: header, attribute declarations and
    # one comma-separated line per instance.
    def to_arff
      header = RELATION_MARKER + " #{@name}\n" +
               @attributes.map { |attribute| attribute.to_arff }.join("\n") +
               "\n" +
               DATA_MARKER + "\n"

      rows = @instances.map { |inst|
        fields = []
        inst.each_with_index { |col, i| fields << format_field(col, @attributes[i]) }
        fields.join(', ')
      }

      header + rows.join("\n")
    end


    def to_s
      to_arff
    end


    private

    # Stores one parsed row. The first row also supplies attributes for any
    # column that was not declared.
    def add_parsed_instance(inst)
      coerce_numeric_fields(inst)
      @instances << inst
      infer_attributes(inst, false) if @instances.size == 1
    end

    # Replaces, in place, every String field that looks numeric by a Float.
    # TODO: Should there be a separate to_i conversion, or is to_f sufficient?
    def coerce_numeric_fields(inst)
      inst.each_index { |j|
        value = inst[j]
        inst[j] = value.to_f if value.is_a?(String) && value =~ NUMERIC_FIELD
      }
    end

    # Creates one attribute per column of +inst+, typed by the value found.
    def infer_attributes(inst, overwrite)
      inst.each_with_index { |value, j|
        next unless overwrite || @attributes[j].nil?
        type = value.is_a?(String) ? ATTRIBUTE_STRING : ATTRIBUTE_NUMERIC
        @attributes[j] = Attribute.new("Attr#{j}", type)
      }
    end

    # Strips surrounding whitespace and one pair of matching outer quotes
    # ('foo bar' --> foo bar, "foo bar" --> foo bar).
    def unquote(field)
      field.strip.sub(/\A(['"])(.*)\1\z/m, "\\2")
    end

    # Quotes a single value for output according to its attribute type.
    # TODO: Doesn't handle values that already contain quotes.
    def format_field(col, attribute)
      type = attribute ? attribute.type : nil
      # Nominal types (Array) and columns without an attribute are written
      # unchanged.
      return col unless type.is_a?(String)

      if type =~ /^#{ATTRIBUTE_STRING}$/i
        # Quote strings with spaces.
        col = "'" + col + "'" if col.is_a?(String) && col =~ /\s/
      elsif type =~ /^#{ATTRIBUTE_DATE}/i ## Prefix match: the date format follows.
        col = "\"#{col}\""
      end
      col
    end

  end


end # module Rarff
233
+
234
+ ################################################################################
235
+
236
# Command-line entry point: parse the ARFF file named by the first argument
# and print it back out as ARFF.
if $0 == __FILE__

  in_file = ARGV[0]

  unless in_file
    # Exit status is left unchanged from the previous silent exit.
    warn "Usage: ruby #{$0} FILE.arff"
    exit
  end

  rel = Rarff::Relation.new
  # File.read closes the file itself; File.open(...).read left it open.
  rel.parse(File.read(in_file))

  puts '='*80
  puts '='*80
  puts "ARFF:"
  puts rel

end
259
+
260
+ ################################################################################
261
+
262
+
@@ -0,0 +1,27 @@
1
+ % 1. Title: Iris Plants Database
2
+ %
3
+ % 2. Sources:
4
+ % (a) Creator: R.A. Fisher
5
+ % (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
6
+ % (c) Date: July, 1988
7
+ %
8
+ @RELATION iris
9
+
10
+ @ATTRIBUTE sepallength NUMERIC
11
+ @ATTRIBUTE sepalwidth NUMERIC
12
+ @ATTRIBUTE petallength NUMERIC
13
+ @ATTRIBUTE petalwidth NUMERIC
14
+ @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
15
+
16
+ @DATA
17
+ 5.1,3.5,1.4,0.2,Iris-setosa
18
+ 4.9,3.0,1.4,0.2,Iris-setosa
19
+ 4.7,3.2,1.3,0.2,Iris-setosa
20
+ 4.6,3.1,1.5,0.2,Iris-setosa
21
+ 5.0,3.6,1.4,0.2,Iris-setosa
22
+ 5.4,3.9,1.7,0.4,Iris-setosa
23
+ 4.6,3.4,1.4,0.3,Iris-setosa
24
+ 5.0,3.4,1.5,0.2,Iris-setosa
25
+ 4.4,2.9,1.4,0.2,Iris-setosa
26
+ 4.9,3.1,1.5,0.1,Iris-setosa
27
+
@@ -0,0 +1,24 @@
1
+ % Sample sparse ARFF file
2
+ @RELATION sparseness
3
+
4
+ @ATTRIBUTE attr1 NUMERIC
5
+ @ATTRIBUTE attr2 NUMERIC
6
+ @ATTRIBUTE attr3 NUMERIC
7
+ @ATTRIBUTE attr4 NUMERIC
8
+ @ATTRIBUTE attr5 NUMERIC
9
+ @ATTRIBUTE attr6 NUMERIC
10
+ @ATTRIBUTE attr7 NUMERIC
11
+ @ATTRIBUTE attr8 NUMERIC
12
+ @ATTRIBUTE attr9 NUMERIC
13
+ @ATTRIBUTE attr10 NUMERIC
14
+ @ATTRIBUTE attr11 NUMERIC
15
+ @ATTRIBUTE attr12 NUMERIC
16
+ @ATTRIBUTE attr13 NUMERIC
17
+
18
+ @DATA
19
+ {3 7, 10 34}
20
+ {1 2.4, 4 62, 12 19}
21
+ {0 0, 1 1, 2 2, 3 3, 4 4, 5 5, 6 6, 7 7, 8 8, 9 9, 10 10, 11 11, 12 12}
22
+ {9 42}
23
+ {2 54.3, 3 92, 11 10.2}
24
+
@@ -0,0 +1,74 @@
1
+ # See the README file for more information.
2
+
3
+ require 'test/unit'
4
+ require 'rarff'
5
+
6
# Unit tests for Rarff::Relation: ARFF generation, dense parsing and sparse
# parsing.
class TestArffLib < Test::Unit::TestCase

  # The fixtures sit next to this file. Resolving them from here lets the
  # suite run from any working directory.
  FIXTURE_DIR = File.dirname(__FILE__)

  # Test creation of an arff file string.
  def test_arff_creation

    # The heredoc body starts at column 0 because <<- keeps leading whitespace.
    arff_file_str = <<-END_OF_ARFF_FILE
@RELATION MyCoolRelation
@ATTRIBUTE Attr0 NUMERIC
@ATTRIBUTE subject STRING
@ATTRIBUTE Attr2 NUMERIC
@ATTRIBUTE Attr3 STRING
@ATTRIBUTE birthday DATE "yyyy-MM-dd HH:mm:ss"
@DATA
1.4, 'foo bar', 5, baz, "1900-08-08 12:12:12"
20.9, ruby, 46, rocks, "2005-10-23 12:12:12"
20.9, ruby, 46, rocks, "2001-02-19 12:12:12"
68.1, stuff, 728, 'is cool', "1974-02-10 12:12:12"
    END_OF_ARFF_FILE

    # to_arff does not end with a newline.
    arff_file_str = arff_file_str.chomp

    instances = [ [1.4, 'foo bar', 5, 'baz', "1900-08-08 12:12:12"],
                  [20.9, 'ruby', 46, 'rocks', "2005-10-23 12:12:12"],
                  [20.9, 'ruby', 46, 'rocks', "2001-02-19 12:12:12"],
                  [68.1, 'stuff', 728, 'is cool', "1974-02-10 12:12:12"]]

    rel = Rarff::Relation.new('MyCoolRelation')
    rel.instances = instances
    rel.attributes[1].name = 'subject'
    rel.attributes[4].name = 'birthday'
    rel.attributes[4].type = 'DATE "yyyy-MM-dd HH:mm:ss"'

    # puts "rel.to_arff:\n(\n#{rel.to_arff}\n)\n"
    assert_equal(arff_file_str, rel.to_arff, "Arff creation test failed.")
  end


  # Test parsing of an arff file.
  def test_arff_parse
    in_file = File.join(FIXTURE_DIR, 'test_arff.arff')
    rel = Rarff::Relation.new
    rel.parse(File.read(in_file))

    assert_equal(3.2, rel.instances[2][1])
    assert_equal('Iris-setosa', rel.instances[7][4])
  end


  # Test parsing of sparse ARFF format
  def test_sparse_arff_parse
    in_file = File.join(FIXTURE_DIR, 'test_sparse_arff.arff')
    rel = Rarff::Relation.new
    rel.parse(File.read(in_file))

    assert_equal(13, rel.instances[0].size)
    assert_equal(0, rel.instances[0][1])
    assert_equal(7, rel.instances[0][3])
    assert_equal(2.4, rel.instances[1][1])
    assert_equal(0, rel.instances[1][2])
    assert_equal(19, rel.instances[1][12])
    assert_equal(6, rel.instances[2][6])
    assert_equal(0, rel.instances[3][12])
    # puts "\n\nARFF: (\n#{rel.to_arff}\n)"
  end
end
71
+
72
+
73
+
74
+
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: rarff
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-09-18 00:00:00 -05:00
8
+ summary: Library for handling Weka ARFF files
9
+ require_paths:
10
+ - lib
11
+ email: apayne@gmail.com
12
+ homepage: TODO
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: rarff
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
25
+ version:
26
+ platform: ruby
27
+ signing_key:
28
+ cert_chain:
29
+ authors:
30
+ - Andy Payne
31
+ files:
32
+ - lib/rarff.rb
33
+ - tests/test_arff.arff
34
+ - tests/test_sparse_arff.arff
35
+ - tests/ts_rarff.rb
36
+ - README
37
+ test_files:
38
+ - tests/ts_rarff.rb
39
+ rdoc_options: []
40
+ extra_rdoc_files:
41
+ - README
42
+ executables: []
43
+ extensions: []
44
+ requirements: []
45
+ dependencies: []