rarff 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,24 +1,24 @@
1
- = rarff
1
+ # rarff
2
2
 
3
3
  Rarff - Ruby ARFF Library
4
4
 
5
5
  Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify data sets for data mining and machine learning.
6
6
 
7
7
 
8
- == License
8
+ ## License
9
9
 
10
- Copyright (c) 2008 Andy Payne
10
+ Copyright (c) 2006-2012 Andy Payne
11
11
  All rights reserved.
12
12
 
13
13
  Redistribution and use in source and binary forms, with or without
14
14
  modification, are permitted provided that the following conditions are met:
15
15
 
16
- * Redistributions of source code must retain the above copyright notice,
16
+ * Redistributions of source code must retain the above copyright notice,
17
17
  this list of conditions and the following disclaimer.
18
- * Redistributions in binary form must reproduce the above copyright notice,
18
+ * Redistributions in binary form must reproduce the above copyright notice,
19
19
  this list of conditions and the following disclaimer in the
20
20
  documentation and/or other materials provided with the distribution.
21
- * Neither the name of the COPYRIGHT OWNER nor the names of its contributors
21
+ * Neither the name of the COPYRIGHT OWNER nor the names of its contributors
22
22
  may be used to endorse or promote products derived from this software
23
23
  without specific prior written permission.
24
24
 
@@ -34,12 +34,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34
34
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
35
 
36
36
 
37
- == Changes
38
-
39
- * Sparse ARFF files (thanks to Tom Adams)
40
-
41
-
42
- == Todo
37
+ ## Todo
43
38
 
44
39
  * Spaces or quotes in nominal types
45
40
  * Commas in quoted attributes or in nominal types
@@ -49,7 +44,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49
44
  * Dates - do some work to create, translate, and interpret date format strings.
50
45
 
51
46
 
52
- == Weka
47
+ ## Weka
53
48
 
54
49
  Weka is "a collection of machine learning algorithms for data mining tasks."
55
50
  (http://www.cs.waikato.ac.nz/ml/weka/) Weka accompanies the following book:
@@ -58,7 +53,7 @@ Ian H. Witten and Eibe Frank (2005) "Data Mining: Practical machine learning
58
53
  tools and techniques", 2nd Edition, Morgan Kaufmann, San Francisco, 2005.
59
54
 
60
55
 
61
- == ARFF Information
56
+ ## ARFF Information
62
57
 
63
58
  ARFF files are similar to CSV files, but are strongly typed, have a pre-defined
64
59
  set of data types, and include a sparse representation.
@@ -66,15 +61,7 @@ set of data types, and include a sparse representation.
66
61
  Links to documentation:
67
62
 
68
63
  * http://www.cs.waikato.ac.nz/~ml/weka/arff.html
69
- * http://weka.sourceforge.net/wekadoc/index.php/en:ARFF_%283.4.6%29
70
-
71
- == Contact Information
72
-
73
- Andy Payne
74
- Website: http://andy-payne.com/
75
- Email: apayne .at. gmail.com
76
- Twitter: http://twitter.com/andypayne
77
- RARFF website: http://rubyforge.org/projects/rarff/
64
+ * http://sourceforge.net/projects/weka/files/
78
65
 
79
66
 
80
67
 
@@ -8,14 +8,14 @@
8
8
  # Custom scan that returns a boolean indicating whether the regex matched.
9
9
  # TODO: Is there a way to avoid doing this?
10
10
  class String
11
- def my_scan(re)
12
- hit = false
13
- scan(re) { |arr|
14
- yield arr if block_given?
15
- hit = true
16
- }
17
- hit
18
- end
11
+ def my_scan(re)
12
+ hit = false
13
+ scan(re) { |arr|
14
+ yield arr if block_given?
15
+ hit = true
16
+ }
17
+ hit
18
+ end
19
19
  end
20
20
 
21
21
  ################################################################################
@@ -24,240 +24,233 @@ module Enumerable
24
24
  # This map_with_index hack allows access to the index of each item as the map
25
25
  # iterates.
26
26
  # TODO: Is there a better way?
27
- def map_with_index
28
- # Ugly, but I need the yield to be the last statement in the map.
29
- i = -1
30
- return map { |item|
31
- i += 1
32
- yield item, i
33
- }
34
- end
27
+ def map_with_index
28
+ # Ugly, but I need the yield to be the last statement in the map.
29
+ i = -1
30
+ return map { |item|
31
+ i += 1
32
+ yield item, i
33
+ }
34
+ end
35
35
  end
36
36
 
37
37
  ################################################################################
38
38
 
39
39
  module Rarff
40
40
 
41
- COMMENT_MARKER = '%'
42
- RELATION_MARKER = '@RELATION'
43
- ATTRIBUTE_MARKER = '@ATTRIBUTE'
44
- DATA_MARKER = '@DATA'
41
+ COMMENT_MARKER = '%'
42
+ RELATION_MARKER = '@RELATION'
43
+ ATTRIBUTE_MARKER = '@ATTRIBUTE'
44
+ DATA_MARKER = '@DATA'
45
45
 
46
- SPARSE_ARFF_BEGIN = '{'
47
- ESC_SPARSE_ARFF_BEGIN = '\\' + SPARSE_ARFF_BEGIN
48
- SPARSE_ARFF_END = '}'
49
- ESC_SPARSE_ARFF_END = '\\' + SPARSE_ARFF_END
46
+ SPARSE_ARFF_BEGIN = '{'
47
+ ESC_SPARSE_ARFF_BEGIN = '\\' + SPARSE_ARFF_BEGIN
48
+ SPARSE_ARFF_END = '}'
49
+ ESC_SPARSE_ARFF_END = '\\' + SPARSE_ARFF_END
50
50
 
51
- ATTRIBUTE_NUMERIC = 'NUMERIC'
52
- ATTRIBUTE_REAL = 'REAL'
53
- ATTRIBUTE_INTEGER = 'INTEGER'
54
- ATTRIBUTE_STRING = 'STRING'
55
- ATTRIBUTE_DATE = 'DATE'
51
+ ATTRIBUTE_NUMERIC = 'NUMERIC'
52
+ ATTRIBUTE_REAL = 'REAL'
53
+ ATTRIBUTE_INTEGER = 'INTEGER'
54
+ ATTRIBUTE_STRING = 'STRING'
55
+ ATTRIBUTE_DATE = 'DATE'
56
56
 
57
57
 
58
58
  ################################################################################
59
59
 
60
- class Attribute
61
- attr_accessor :name, :type
60
+ class Attribute
61
+ attr_accessor :name, :type
62
62
 
63
- def initialize(name='', type='')
64
- @name = name
63
+ def initialize(name='', type='')
64
+ @name = name
65
65
 
66
- @type_is_nominal = false
67
- @type = type
66
+ @type_is_nominal = false
67
+ @type = type
68
68
 
69
- check_nominal()
70
- end
69
+ check_nominal()
70
+ end
71
71
 
72
72
 
73
- def type=(type)
74
- @type = type
75
- check_nominal()
76
- end
73
+ def type=(type)
74
+ @type = type
75
+ check_nominal()
76
+ end
77
77
 
78
78
 
79
79
  # Convert string representation of nominal type to array, if necessary
80
80
  # TODO: This might falsely trigger on wacky date formats.
81
- def check_nominal
82
- if @type =~ /^\s*\{.*(\,.*)+\}\s*$/
83
- @type_is_nominal = true
84
- # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
85
- # Split on '{' ',' or '}'
86
- @type = @type.gsub(/^\s*\{\s*/, '').gsub(/\s*\}\s*$/, '').split(/\s*\,\s*/)
87
- end
88
- end
81
+ def check_nominal
82
+ if @type =~ /^\s*\{.*(\,.*)+\}\s*$/
83
+ @type_is_nominal = true
84
+ # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
85
+ # Split on '{' ',' or '}'
86
+ @type = @type.gsub(/^\s*\{\s*/, '').gsub(/\s*\}\s*$/, '').split(/\s*\,\s*/)
87
+ end
88
+ end
89
89
 
90
90
 
91
- def add_nominal_value(str)
92
- if @type_is_nominal == false
93
- @type = Array.new
94
- end
91
+ def add_nominal_value(str)
92
+ if @type_is_nominal == false
93
+ @type = Array.new
94
+ end
95
95
 
96
- @type << str
97
- end
96
+ @type << str
97
+ end
98
98
 
99
99
 
100
- def to_arff
101
- if @type_is_nominal == true
102
- ATTRIBUTE_MARKER + " #{@name} #{@type.join(',')}"
103
- else
104
- ATTRIBUTE_MARKER + " #{@name} #{@type}"
105
- end
106
- end
100
+ def to_arff
101
+ if @type_is_nominal == true
102
+ ATTRIBUTE_MARKER + " #{@name} #{@type.join(',')}"
103
+ else
104
+ ATTRIBUTE_MARKER + " #{@name} #{@type}"
105
+ end
106
+ end
107
107
 
108
108
 
109
- def to_s
110
- to_arff
111
- end
109
+ def to_s
110
+ to_arff
111
+ end
112
112
 
113
- end
113
+ end
114
114
 
115
+ Comment = Struct.new(:text,:row)
115
116
 
117
+ class Relation
118
+ attr_accessor :name, :attributes, :instances, :comments
116
119
 
117
- class Relation
118
- attr_accessor :name, :attributes, :instances
119
-
120
-
121
- def initialize(name='')
122
- @name = name
123
- @attributes = Array.new
124
- @instances = Array.new
125
- end
126
-
127
-
128
- def parse(str)
129
- in_data_section = false
130
-
131
- # TODO: Doesn't handle commas in quoted attributes.
132
- str.split("\n").each { |line|
133
- next if line =~ /^\s*$/
134
- next if line =~ /^\s*#{COMMENT_MARKER}/
135
- next if line.my_scan(/^\s*#{RELATION_MARKER}\s*(.*)\s*$/i) { |name| @name = name }
136
- next if line.my_scan(/^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)\s*$/i) { |name, type|
137
- @attributes.push(Attribute.new(name, type))
138
- }
139
- next if line.my_scan(/^\s*#{DATA_MARKER}/i) { in_data_section = true }
140
- next if in_data_section == false ## Below is data section handling
141
- # next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^\s*#{SPARSE_ARFF_BEGIN}(.*)#{SPARSE_ARFF_END}\s*$/) { |data|
142
- next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/) { |data|
143
- # Sparse ARFF
144
- # TODO: Factor duplication with non-sparse data below
145
- @instances << expand_sparse(data.first)
146
- create_attributes()
147
- }
148
- next if line.my_scan(/^\s*(.*)\s*$/) { |data|
149
- @instances << data.first.split(/,\s*/).map { |field|
150
- # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
151
- field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
152
- }
153
- create_attributes()
154
- }
155
- }
156
- end
157
-
158
-
159
- def instances=(instances)
160
- @instances = instances
161
- create_attributes()
162
- end
163
-
164
-
165
- def create_attributes
166
- attr_pass = true
167
-
168
- @instances.each_index { |i|
169
- @instances[i].each_index { |j|
170
- if @instances[i][j].class != String
171
- if attr_pass == true
172
- @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
173
- end
174
- elsif @instances[i][j] =~ /^\-?\d+\.?\d*$/
175
- # TODO: Should I have a separate to_i conversion, or is to_f sufficient?
176
- @instances[i][j] = @instances[i][j].to_f
177
- if attr_pass == true
178
- @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
179
- end
180
- else
181
- if attr_pass == true
182
- @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_STRING)
183
- end
184
- end
185
- }
186
-
187
- attr_pass = false
188
- }
189
- end
190
-
191
-
192
- def expand_sparse(str)
193
- arr = Array.new(@attributes.size, 0)
194
- str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).map { |pr|
195
- pra = pr.split(/\s/)
196
- arr[pra[0].to_i] = pra[1]
197
- }
198
- arr
199
- end
200
-
201
-
202
- def to_arff(sparse=false)
203
- RELATION_MARKER + " #{@name}\n" +
204
- @attributes.map{ |attr| attr.to_arff }.join("\n") +
205
- "\n" +
206
- DATA_MARKER + "\n" +
207
- @instances.map { |inst|
208
- inst.map_with_index { |col, i|
209
- # Quote strings with spaces.
210
- # TODO: Doesn't handle cases in which strings already contain
211
- # quotes or are already quoted.
212
- if @attributes[i].type =~ /^#{ATTRIBUTE_STRING}$/i
213
- if col =~ /\s+/
214
- col = "'" + col + "'"
215
- end
216
- elsif @attributes[i].type =~ /^#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
217
- col = '"' + col + '"'
218
- end
219
- if @attributes[i].type =~ /^#{ATTRIBUTE_NUMERIC}$/i and col == 0
220
- nil
221
- else
222
- sparse ? "#{i} #{col}" : col
223
- end
224
- }.select{|c|not c.nil?}.join(', ')
225
- }.join("\n").gsub(/^/, sparse ? '{' : '').gsub(/$/, sparse ? '}' : '')
226
- end
227
-
228
-
229
- def to_s
230
- to_arff
231
- end
232
120
 
233
- end
121
+ def initialize(name='')
122
+ @name = name
123
+ @attributes = Array.new
124
+ @instances = Array.new
125
+ @comments = Array.new
126
+ end
234
127
 
235
128
 
236
- end # module Rarff
129
+ def parse(str)
130
+ in_data_section = false
237
131
 
238
- ################################################################################
132
+ # TODO: Doesn't handle commas in quoted attributes.
133
+ str.split("\n").each_with_index { |line, idx|
134
+ next if line =~ /^\s*$/
135
+ next if line.my_scan(/^\s*#{COMMENT_MARKER}/) { @comments << Comment.new(line.slice(1..-1), idx+1)}
136
+ next if line.my_scan(/^\s*#{RELATION_MARKER}\s*(.*)\s*$/i) { |name| @name = name }
137
+ next if line.my_scan(/^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)\s*$/i) { |name, type|
138
+ @attributes.push(Attribute.new(name, type))
139
+ }
140
+ next if line.my_scan(/^\s*#{DATA_MARKER}/i) { in_data_section = true }
141
+ next if in_data_section == false ## Below is data section handling
142
+ # next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^\s*#{SPARSE_ARFF_BEGIN}(.*)#{SPARSE_ARFF_END}\s*$/) { |data|
143
+ next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/) { |data|
144
+ # Sparse ARFF
145
+ # TODO: Factor duplication with non-sparse data below
146
+ @instances << expand_sparse(data.first)
147
+ create_attributes()
148
+ }
149
+ next if line.my_scan(/^\s*(.*)\s*$/) { |data|
150
+ @instances << data.first.split(/,\s*/).map { |field|
151
+ # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
152
+ field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
153
+ }
154
+ create_attributes()
155
+ }
156
+ }
157
+ end
158
+
159
+
160
+ def instances=(instances)
161
+ @instances = instances
162
+ create_attributes()
163
+ end
239
164
 
240
- if $0 == __FILE__ then
241
165
 
166
+ def create_attributes
167
+ attr_pass = true
168
+
169
+ @instances.each_index { |i|
170
+ @instances[i].each_index { |j|
171
+ if @instances[i][j].class != String
172
+ assign_or_build_attr(j, ATTRIBUTE_NUMERIC) if attr_pass
173
+ elsif @instances[i][j] =~ /^\-?\d+\.?\d*$/
174
+ # TODO: Should I have a separate to_i conversion, or is to_f sufficient?
175
+ @instances[i][j] = @instances[i][j].to_f
176
+ assign_or_build_attr(j, ATTRIBUTE_NUMERIC) if attr_pass
177
+ else
178
+ assign_or_build_attr(j, ATTRIBUTE_STRING) if attr_pass
179
+ end
180
+ }
242
181
 
243
- if ARGV[0]
244
- in_file = ARGV[0]
245
- contents = ''
182
+ attr_pass = false
183
+ }
184
+ end
246
185
 
247
- contents = File.open(in_file).read
248
186
 
249
- rel = Rarff::Relation.new
250
- rel.parse(contents)
187
+ def assign_or_build_attr(j, attr_type)
188
+ if @attributes[j].is_a?(Attribute)
189
+ @attributes[j].type = attr_type
190
+ else
191
+ @attributes[j] = Attribute.new("Attr#{j}", attr_type)
192
+ end
193
+ end
251
194
 
252
- else
253
- exit
254
- end
195
+ def expand_sparse(str)
196
+ arr = Array.new(@attributes.size, 0)
197
+ str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).map { |pr|
198
+ pra = pr.split(/\s/)
199
+ arr[pra[0].to_i] = pra[1]
200
+ }
201
+ arr
202
+ end
203
+
204
+
205
+ def to_arff
206
+ RELATION_MARKER + " #{@name}\n" +
207
+ @attributes.map { |attr| attr.to_arff }.join("\n") +
208
+ "\n" +
209
+ DATA_MARKER + "\n" +
210
+ @instances.map { |inst|
211
+ inst.map_with_index { |col, i|
212
+ # Quote strings with spaces.
213
+ # TODO: Doesn't handle cases in which strings already contain
214
+ # quotes or are already quoted.
215
+ if @attributes[i].type =~ /^#{ATTRIBUTE_STRING}$/i
216
+ if col =~ /\s+/
217
+ col = "'" + col + "'"
218
+ end
219
+ elsif @attributes[i].type =~ /^#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
220
+ col = '"' + col + '"'
221
+ end
222
+ col
223
+ }.join(', ')
224
+ }.join("\n")
225
+ end
226
+
227
+
228
+ def to_s
229
+ to_arff
230
+ end
231
+
232
+ end
233
+
234
+
235
+ end # module Rarff
236
+
237
+ ################################################################################
238
+
239
+ if $0 == __FILE__
240
+
241
+ exit unless ARGV[0]
242
+ in_file = ARGV[0]
243
+ contents = ''
244
+
245
+ contents = File.open(in_file).read
255
246
 
256
- puts '='*80
257
- puts '='*80
258
- puts "ARFF:"
259
- puts rel
247
+ rel = Rarff::Relation.new
248
+ rel.parse(contents)
260
249
 
250
+ puts '='*80
251
+ puts '='*80
252
+ puts "ARFF:"
253
+ puts rel
261
254
 
262
255
  end
263
256