rarff 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/{README → README.md} +10 -23
- data/lib/rarff.rb +191 -198
- data/tests/test_case_arff.arff +694 -0
- data/tests/test_comments_arff.arff +21 -0
- data/tests/test_comments_raw.csv +11 -0
- data/tests/ts_rarff.rb +90 -77
- metadata +35 -38
data/{README → README.md}
RENAMED
@@ -1,24 +1,24 @@
|
|
1
|
-
|
1
|
+
# rarff
|
2
2
|
|
3
3
|
Rarff - Ruby ARFF Library
|
4
4
|
|
5
5
|
Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify data sets for data mining and machine learning.
|
6
6
|
|
7
7
|
|
8
|
-
|
8
|
+
## License
|
9
9
|
|
10
|
-
Copyright (c)
|
10
|
+
Copyright (c) 2006-2012 Andy Payne
|
11
11
|
All rights reserved.
|
12
12
|
|
13
13
|
Redistribution and use in source and binary forms, with or without
|
14
14
|
modification, are permitted provided that the following conditions are met:
|
15
15
|
|
16
|
-
|
16
|
+
* Redistributions of source code must retain the above copyright notice,
|
17
17
|
this list of conditions and the following disclaimer.
|
18
|
-
|
18
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
19
19
|
this list of conditions and the following disclaimer in the
|
20
20
|
documentation and/or other materials provided with the distribution.
|
21
|
-
|
21
|
+
* Neither the name of the COPYRIGHT OWNER nor the names of its contributors
|
22
22
|
may be used to endorse or promote products derived from this software
|
23
23
|
without specific prior written permission.
|
24
24
|
|
@@ -34,12 +34,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
34
34
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
35
35
|
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
* Sparse ARFF files (thanks to Tom Adams)
|
40
|
-
|
41
|
-
|
42
|
-
== Todo
|
37
|
+
## Todo
|
43
38
|
|
44
39
|
* Spaces or quotes in nominal types
|
45
40
|
* Commas in quoted attributes or in nominal types
|
@@ -49,7 +44,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
49
44
|
* Dates - do some work to create, translate, and interpret date format strings.
|
50
45
|
|
51
46
|
|
52
|
-
|
47
|
+
## Weka
|
53
48
|
|
54
49
|
Weka is "a collection of machine learning algorithms for data mining tasks."
|
55
50
|
(http://www.cs.waikato.ac.nz/ml/weka/) Weka accompanies the following book:
|
@@ -58,7 +53,7 @@ Ian H. Witten and Eibe Frank (2005) "Data Mining: Practical machine learning
|
|
58
53
|
tools and techniques", 2nd Edition, Morgan Kaufmann, San Francisco, 2005.
|
59
54
|
|
60
55
|
|
61
|
-
|
56
|
+
## ARFF Information
|
62
57
|
|
63
58
|
ARFF files are similar to CSV files, but are strongly-typed, have a pre-defined
|
64
59
|
set of data types, and include a sparse representation.
|
@@ -66,15 +61,7 @@ set of data types, and include a sparse representation.
|
|
66
61
|
Links to documentation:
|
67
62
|
|
68
63
|
* http://www.cs.waikato.ac.nz/~ml/weka/arff.html
|
69
|
-
* http://
|
70
|
-
|
71
|
-
== Contact Information
|
72
|
-
|
73
|
-
Andy Payne
|
74
|
-
Website: http://andy-payne.com/
|
75
|
-
Email: apayne .at. gmail.com
|
76
|
-
Twitter: http://twitter.com/andypayne
|
77
|
-
RARFF website: http://rubyforge.org/projects/rarff/
|
64
|
+
* http://sourceforge.net/projects/weka/files/
|
78
65
|
|
79
66
|
|
80
67
|
|
data/lib/rarff.rb
CHANGED
@@ -8,14 +8,14 @@
|
|
8
8
|
# Custom scan that returns a boolean indicating whether the regex matched.
|
9
9
|
# TODO: Is there a way to avoid doing this?
|
10
10
|
class String
  # Runs String#scan with +re+ and reports whether anything matched.
  #
  # When a block is supplied, each scan result (the matched text, or the array
  # of captures when +re+ has groups) is handed to it.
  #
  # Returns true if there was at least one match, false otherwise.
  def my_scan(re)
    matched = false
    scan(re) do |result|
      yield result if block_given?
      matched = true
    end
    matched
  end
end
|
20
20
|
|
21
21
|
################################################################################
|
@@ -24,240 +24,233 @@ module Enumerable
|
|
24
24
|
# This map_with_index hack allows access to the index of each item as the map
|
25
25
|
# iterates.
|
26
26
|
# TODO: Is there a better way?
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
27
|
+
# Maps over the collection like +map+, additionally passing each item's
# zero-based position to the block.
#
# Yields: item, index
# Returns: an Array of the block's results, or an Enumerator when called
# without a block (a missing block used to raise LocalJumpError on the first
# element).
def map_with_index
  return to_enum(:map_with_index) unless block_given?

  # each_with_index supplies the counter, so no hand-maintained index is
  # needed. The explicit two-argument yield keeps block-arity behaviour the
  # same as before (a one-parameter block still receives just the item).
  each_with_index.map { |item, i| yield item, i }
end
|
35
35
|
end
|
36
36
|
|
37
37
|
################################################################################
|
38
38
|
|
39
39
|
module Rarff
|
40
40
|
|
41
|
-
COMMENT_MARKER = '%'
|
42
|
-
RELATION_MARKER = '@RELATION'
|
43
|
-
ATTRIBUTE_MARKER = '@ATTRIBUTE'
|
44
|
-
DATA_MARKER = '@DATA'
|
41
|
+
# Markers that introduce comments and the sections of an ARFF file.
# Frozen so the shared constant strings cannot be mutated in place.
COMMENT_MARKER = '%'.freeze
RELATION_MARKER = '@RELATION'.freeze
ATTRIBUTE_MARKER = '@ATTRIBUTE'.freeze
DATA_MARKER = '@DATA'.freeze
|
45
45
|
|
46
|
-
SPARSE_ARFF_BEGIN = '{'
|
47
|
-
ESC_SPARSE_ARFF_BEGIN = '\\' + SPARSE_ARFF_BEGIN
|
48
|
-
SPARSE_ARFF_END = '}'
|
49
|
-
ESC_SPARSE_ARFF_END = '\\' + SPARSE_ARFF_END
|
46
|
+
# Delimiters that wrap a sparse-format data row, plus backslash-escaped
# copies that can be interpolated into a regular expression.
# Frozen so the shared constant strings cannot be mutated in place.
SPARSE_ARFF_BEGIN = '{'.freeze
ESC_SPARSE_ARFF_BEGIN = ('\\' + SPARSE_ARFF_BEGIN).freeze
SPARSE_ARFF_END = '}'.freeze
ESC_SPARSE_ARFF_END = ('\\' + SPARSE_ARFF_END).freeze
|
50
50
|
|
51
|
-
ATTRIBUTE_NUMERIC = 'NUMERIC'
|
52
|
-
ATTRIBUTE_REAL = 'REAL'
|
53
|
-
ATTRIBUTE_INTEGER = 'INTEGER'
|
54
|
-
ATTRIBUTE_STRING = 'STRING'
|
55
|
-
ATTRIBUTE_DATE = 'DATE'
|
51
|
+
# Names of the built-in ARFF attribute types.
# Frozen so the shared constant strings cannot be mutated in place.
ATTRIBUTE_NUMERIC = 'NUMERIC'.freeze
ATTRIBUTE_REAL = 'REAL'.freeze
ATTRIBUTE_INTEGER = 'INTEGER'.freeze
ATTRIBUTE_STRING = 'STRING'.freeze
ATTRIBUTE_DATE = 'DATE'.freeze
|
56
56
|
|
57
57
|
|
58
58
|
################################################################################
|
59
59
|
|
60
|
-
class Attribute
|
61
|
-
|
60
|
+
# A single ARFF attribute (column): a name plus a type.
#
# +type+ is a String such as 'NUMERIC' or 'STRING' for ordinary attributes.
# For nominal attributes it is an Array of the allowed values, obtained by
# parsing a "{a, b, c}" declaration, by assigning an Array directly, or by
# calling add_nominal_value.
class Attribute
  attr_accessor :name
  attr_reader :type

  # name:: attribute name
  # type:: type String, nominal declaration String, or Array of nominal values
  def initialize(name='', type='')
    @name = name
    @type_is_nominal = false
    @type = type
    check_nominal
  end

  # Assigns a new type and re-evaluates whether the attribute is nominal.
  def type=(type)
    @type = type
    check_nominal
  end

  # Convert string representation of nominal type to array, if necessary
  # TODO: This might falsely trigger on wacky date formats.
  #
  # The nominal flag is recomputed on every call, so replacing a nominal type
  # with a plain one (e.g. 'NUMERIC') cannot leave a stale flag behind that
  # would make to_arff call join on a String.
  def check_nominal
    if @type.is_a?(Array)
      @type_is_nominal = true
    elsif @type.is_a?(String) && @type =~ /^\s*\{.*(\,.*)+\}\s*$/
      @type_is_nominal = true
      # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
      # Split on '{' ',' or '}'
      @type = @type.gsub(/^\s*\{\s*/, '').gsub(/\s*\}\s*$/, '').split(/\s*\,\s*/)
    else
      @type_is_nominal = false
    end
  end

  # Appends +str+ to the list of nominal values, converting the attribute to
  # a nominal one first if it is not already.
  #
  # Returns the Array of nominal values.
  def add_nominal_value(str)
    unless @type_is_nominal
      @type = []
      # Without setting the flag, every later call would start a new list.
      @type_is_nominal = true
    end

    @type << str
  end

  # Returns the "@ATTRIBUTE name type" declaration line. Nominal values are
  # wrapped in braces, as the ARFF format requires, so the line can be parsed
  # back by check_nominal.
  def to_arff
    if @type_is_nominal
      "#{ATTRIBUTE_MARKER} #{@name} {#{@type.join(',')}}"
    else
      "#{ATTRIBUTE_MARKER} #{@name} #{@type}"
    end
  end

  # Same as to_arff.
  def to_s
    to_arff
  end
end
|
114
114
|
|
115
|
+
# A comment captured from an ARFF source: its text and the row it came from.
Comment = Struct.new(:text, :row)
|
115
116
|
|
117
|
+
class Relation
|
118
|
+
attr_accessor :name, :attributes, :instances, :comments
|
116
119
|
|
117
|
-
class Relation
|
118
|
-
attr_accessor :name, :attributes, :instances
|
119
|
-
|
120
|
-
|
121
|
-
def initialize(name='')
|
122
|
-
@name = name
|
123
|
-
@attributes = Array.new
|
124
|
-
@instances = Array.new
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
def parse(str)
|
129
|
-
in_data_section = false
|
130
|
-
|
131
|
-
# TODO: Doesn't handle commas in quoted attributes.
|
132
|
-
str.split("\n").each { |line|
|
133
|
-
next if line =~ /^\s*$/
|
134
|
-
next if line =~ /^\s*#{COMMENT_MARKER}/
|
135
|
-
next if line.my_scan(/^\s*#{RELATION_MARKER}\s*(.*)\s*$/i) { |name| @name = name }
|
136
|
-
next if line.my_scan(/^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)\s*$/i) { |name, type|
|
137
|
-
@attributes.push(Attribute.new(name, type))
|
138
|
-
}
|
139
|
-
next if line.my_scan(/^\s*#{DATA_MARKER}/i) { in_data_section = true }
|
140
|
-
next if in_data_section == false ## Below is data section handling
|
141
|
-
# next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^\s*#{SPARSE_ARFF_BEGIN}(.*)#{SPARSE_ARFF_END}\s*$/) { |data|
|
142
|
-
next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/) { |data|
|
143
|
-
# Sparse ARFF
|
144
|
-
# TODO: Factor duplication with non-sparse data below
|
145
|
-
@instances << expand_sparse(data.first)
|
146
|
-
create_attributes()
|
147
|
-
}
|
148
|
-
next if line.my_scan(/^\s*(.*)\s*$/) { |data|
|
149
|
-
@instances << data.first.split(/,\s*/).map { |field|
|
150
|
-
# Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
|
151
|
-
field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
|
152
|
-
}
|
153
|
-
create_attributes()
|
154
|
-
}
|
155
|
-
}
|
156
|
-
end
|
157
|
-
|
158
|
-
|
159
|
-
def instances=(instances)
|
160
|
-
@instances = instances
|
161
|
-
create_attributes()
|
162
|
-
end
|
163
|
-
|
164
|
-
|
165
|
-
def create_attributes
|
166
|
-
attr_pass = true
|
167
|
-
|
168
|
-
@instances.each_index { |i|
|
169
|
-
@instances[i].each_index { |j|
|
170
|
-
if @instances[i][j].class != String
|
171
|
-
if attr_pass == true
|
172
|
-
@attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
|
173
|
-
end
|
174
|
-
elsif @instances[i][j] =~ /^\-?\d+\.?\d*$/
|
175
|
-
# TODO: Should I have a separate to_i conversion, or is to_f sufficient?
|
176
|
-
@instances[i][j] = @instances[i][j].to_f
|
177
|
-
if attr_pass == true
|
178
|
-
@attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
|
179
|
-
end
|
180
|
-
else
|
181
|
-
if attr_pass == true
|
182
|
-
@attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_STRING)
|
183
|
-
end
|
184
|
-
end
|
185
|
-
}
|
186
|
-
|
187
|
-
attr_pass = false
|
188
|
-
}
|
189
|
-
end
|
190
|
-
|
191
|
-
|
192
|
-
def expand_sparse(str)
|
193
|
-
arr = Array.new(@attributes.size, 0)
|
194
|
-
str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).map { |pr|
|
195
|
-
pra = pr.split(/\s/)
|
196
|
-
arr[pra[0].to_i] = pra[1]
|
197
|
-
}
|
198
|
-
arr
|
199
|
-
end
|
200
|
-
|
201
|
-
|
202
|
-
def to_arff(sparse=false)
|
203
|
-
RELATION_MARKER + " #{@name}\n" +
|
204
|
-
@attributes.map{ |attr| attr.to_arff }.join("\n") +
|
205
|
-
"\n" +
|
206
|
-
DATA_MARKER + "\n" +
|
207
|
-
@instances.map { |inst|
|
208
|
-
inst.map_with_index { |col, i|
|
209
|
-
# Quote strings with spaces.
|
210
|
-
# TODO: Doesn't handle cases in which strings already contain
|
211
|
-
# quotes or are already quoted.
|
212
|
-
if @attributes[i].type =~ /^#{ATTRIBUTE_STRING}$/i
|
213
|
-
if col =~ /\s+/
|
214
|
-
col = "'" + col + "'"
|
215
|
-
end
|
216
|
-
elsif @attributes[i].type =~ /^#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
|
217
|
-
col = '"' + col + '"'
|
218
|
-
end
|
219
|
-
if @attributes[i].type =~ /^#{ATTRIBUTE_NUMERIC}$/i and col == 0
|
220
|
-
nil
|
221
|
-
else
|
222
|
-
sparse ? "#{i} #{col}" : col
|
223
|
-
end
|
224
|
-
}.select{|c|not c.nil?}.join(', ')
|
225
|
-
}.join("\n").gsub(/^/, sparse ? '{' : '').gsub(/$/, sparse ? '}' : '')
|
226
|
-
end
|
227
|
-
|
228
|
-
|
229
|
-
def to_s
|
230
|
-
to_arff
|
231
|
-
end
|
232
120
|
|
233
|
-
|
121
|
+
# Sets up an empty relation called +name+: no attributes, no data rows and
# no comments yet.
def initialize(name='')
  @instances = []
  @attributes = []
  @name = name
  @comments = []
end
|
234
127
|
|
235
128
|
|
236
|
-
|
129
|
+
# Parses ARFF text into this relation.
#
# str:: the complete contents of an ARFF file
#
# Fills @name (a String), appends an Attribute per @ATTRIBUTE line, a Comment
# per comment line (with its 1-based line number) and one row to @instances
# per data line after @DATA. Sparse rows ("{index value, ...}") are expanded
# with expand_sparse. create_attributes is called after every data row.
#
# TODO: Doesn't handle commas in quoted attributes.
def parse(str)
  in_data_section = false

  # Splitting on an optional CR keeps Windows line endings out of the parsed
  # values; line numbering is unaffected.
  str.split(/\r?\n/).each_with_index do |line, idx|
    case line
    when /^\s*$/
      next
    when /^\s*#{COMMENT_MARKER}(.*)$/
      # Capture the text after the marker rather than slicing off the first
      # character, which was wrong whenever the marker was indented.
      @comments << Comment.new(Regexp.last_match(1), idx + 1)
    when /^\s*#{RELATION_MARKER}\s*(.*?)\s*$/i
      # Non-greedy capture so trailing whitespace is really trimmed, and the
      # name is stored as a String instead of a one-element capture array.
      @name = Regexp.last_match(1)
    when /^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*?)\s*$/i
      @attributes.push(Attribute.new(Regexp.last_match(1), Regexp.last_match(2)))
    when /^\s*#{DATA_MARKER}/i
      in_data_section = true
    else
      next unless in_data_section ## Below is data section handling

      row = line.strip
      if row =~ /^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/
        # Sparse ARFF
        @instances << expand_sparse(Regexp.last_match(1))
      else
        @instances << row.split(/,\s*/).map { |field|
          # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
          field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
        }
      end
      create_attributes
    end
  end
end
|
158
|
+
|
159
|
+
|
160
|
+
# Replaces the data rows with +instances+ and then runs create_attributes
# over the new rows.
def instances=(instances)
  @instances = instances
  create_attributes
end
|
239
164
|
|
240
|
-
if $0 == __FILE__ then
|
241
165
|
|
166
|
+
# Normalizes the data rows and derives attribute types from them.
#
# Every String cell that looks like a plain decimal number is converted to a
# Float in place (in all rows). The first row additionally decides each
# column's type via assign_or_build_attr: NUMERIC for numeric-looking or
# non-String cells, STRING otherwise.
#
# Returns @instances.
def create_attributes
  @instances.each_with_index do |row, i|
    first_row = i.zero?

    row.each_index do |j|
      value = row[j]

      if !value.is_a?(String)
        attr_type = ATTRIBUTE_NUMERIC
      elsif value =~ /\A-?\d+\.?\d*\Z/
        # Anchored to the whole string (\A..\Z, not ^..$) so a multi-line
        # value such as "abc\n123" is not mistaken for a number and mangled
        # by to_f.
        # TODO: Should I have a separate to_i conversion, or is to_f sufficient?
        row[j] = value.to_f
        attr_type = ATTRIBUTE_NUMERIC
      else
        attr_type = ATTRIBUTE_STRING
      end

      assign_or_build_attr(j, attr_type) if first_row
    end
  end
end
|
246
185
|
|
247
|
-
contents = File.open(in_file).read
|
248
186
|
|
249
|
-
|
250
|
-
|
187
|
+
# Gives column +j+ the type +attr_type+: an Attribute already stored at that
# position keeps its name and only has its type replaced; otherwise a new
# Attribute named "Attr<j>" is stored there.
def assign_or_build_attr(j, attr_type)
  current = @attributes[j]
  unless current.is_a?(Attribute)
    return @attributes[j] = Attribute.new("Attr#{j}", attr_type)
  end

  current.type = attr_type
end
|
251
194
|
|
252
|
-
|
253
|
-
|
254
|
-
|
195
|
+
# Expands one sparse ARFF row into a dense row.
#
# str:: the sparse entries, "index value, index value, ..." with or without
#       the surrounding braces
#
# Returns an Array with one slot per known attribute, pre-filled with 0 and
# overwritten at each listed index with that entry's value (a String).
def expand_sparse(str)
  arr = Array.new(@attributes.size, 0)

  str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).each do |pr|
    # Strip first and split on the first whitespace run only, so leading or
    # repeated spaces do not shift the fields and a value that itself
    # contains spaces stays in one piece.
    index, value = pr.strip.split(/\s+/, 2)
    next if index.nil? # blank entry; nothing to place

    arr[index.to_i] = value
  end

  arr
end
|
203
|
+
|
204
|
+
|
205
|
+
# Serializes the relation as ARFF text: the @RELATION line, one line per
# attribute, the @DATA marker, then one comma-separated line per data row
# (no trailing newline).
#
# Values in STRING columns are wrapped in single quotes when they contain
# whitespace; values in DATE columns are wrapped in double quotes. All other
# values are written as-is.
#
# TODO: Doesn't handle cases in which strings already contain quotes or are
# already quoted.
def to_arff
  rows = @instances.map do |inst|
    inst.each_with_index.map do |col, i|
      attribute = @attributes[i]
      # Only String types take part in the quoting rules. A nominal type is
      # an Array, and a row may be wider than the attribute list; in both
      # cases the value is written unchanged instead of raising.
      type = attribute ? attribute.type : nil
      type = '' unless type.is_a?(String)

      if type =~ /^#{ATTRIBUTE_STRING}$/i
        # to_s: a later row can hold a Float in a column typed from row one.
        col.to_s =~ /\s+/ ? "'#{col}'" : col
      elsif type =~ /^#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
        "\"#{col}\""
      else
        col
      end
    end.join(', ')
  end

  "#{RELATION_MARKER} #{@name}\n" +
    @attributes.map { |attr| attr.to_arff }.join("\n") +
    "\n#{DATA_MARKER}\n" +
    rows.join("\n")
end
|
226
|
+
|
227
|
+
|
228
|
+
# The relation's string form is its ARFF serialization.
def to_s
  to_arff
end
|
231
|
+
|
232
|
+
end
|
233
|
+
|
234
|
+
|
235
|
+
end # module Rarff
|
236
|
+
|
237
|
+
################################################################################
|
238
|
+
|
239
|
+
if $0 == __FILE__
  # Command-line entry point: parse the ARFF file named by the first argument
  # and print it back out as ARFF. Exits quietly when no file is given.
  exit unless ARGV[0]
  in_file = ARGV[0]

  # File.read opens, reads and closes the file in one call; the previous
  # File.open(in_file).read never closed the handle explicitly.
  contents = File.read(in_file)

  rel = Rarff::Relation.new
  rel.parse(contents)

  puts '='*80
  puts '='*80
  puts "ARFF:"
  puts rel
end
|
263
256
|
|