rarff 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{README → README.md} +10 -23
- data/lib/rarff.rb +191 -198
- data/tests/test_case_arff.arff +694 -0
- data/tests/test_comments_arff.arff +21 -0
- data/tests/test_comments_raw.csv +11 -0
- data/tests/ts_rarff.rb +90 -77
- metadata +35 -38
data/{README → README.md}
RENAMED
@@ -1,24 +1,24 @@
|
|
1
|
-
|
1
|
+
# rarff
|
2
2
|
|
3
3
|
Rarff - Ruby ARFF Library
|
4
4
|
|
5
5
|
Rarff is a Ruby library for dealing with Attribute-Relation File Format (ARFF) files. ARFF files are used to specify data sets for data mining and machine learning.
|
6
6
|
|
7
7
|
|
8
|
-
|
8
|
+
## License
|
9
9
|
|
10
|
-
Copyright (c)
|
10
|
+
Copyright (c) 2006-2012 Andy Payne
|
11
11
|
All rights reserved.
|
12
12
|
|
13
13
|
Redistribution and use in source and binary forms, with or without
|
14
14
|
modification, are permitted provided that the following conditions are met:
|
15
15
|
|
16
|
-
|
16
|
+
* Redistributions of source code must retain the above copyright notice,
|
17
17
|
this list of conditions and the following disclaimer.
|
18
|
-
|
18
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
19
19
|
this list of conditions and the following disclaimer in the
|
20
20
|
documentation and/or other materials provided with the distribution.
|
21
|
-
|
21
|
+
* Neither the name of the COPYRIGHT OWNER nor the names of its contributors
|
22
22
|
may be used to endorse or promote products derived from this software
|
23
23
|
without specific prior written permission.
|
24
24
|
|
@@ -34,12 +34,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
34
34
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
35
35
|
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
* Sparse ARFF files (thanks to Tom Adams)
|
40
|
-
|
41
|
-
|
42
|
-
== Todo
|
37
|
+
## Todo
|
43
38
|
|
44
39
|
* Spaces or quotes in nominal types
|
45
40
|
* Commas in quoted attributes or in nominal types
|
@@ -49,7 +44,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
49
44
|
* Dates - do some work to create, translate, and interpret date format strings.
|
50
45
|
|
51
46
|
|
52
|
-
|
47
|
+
## Weka
|
53
48
|
|
54
49
|
Weka is "a collection of machine learning algorithms for data mining tasks."
|
55
50
|
(http://www.cs.waikato.ac.nz/ml/weka/) Weka accompanies the following book:
|
@@ -58,7 +53,7 @@ Ian H. Witten and Eibe Frank (2005) "Data Mining: Practical machine learning
|
|
58
53
|
tools and techniques", 2nd Edition, Morgan Kaufmann, San Francisco, 2005.
|
59
54
|
|
60
55
|
|
61
|
-
|
56
|
+
## ARFF Information
|
62
57
|
|
63
58
|
ARFF files are similar to CSV files, but are strongly-typed, have a pre-defined
|
64
59
|
set of data types, and include a sparse representation.
|
@@ -66,15 +61,7 @@ set of data types, and include a sparse representation.
|
|
66
61
|
Links to documentation:
|
67
62
|
|
68
63
|
* http://www.cs.waikato.ac.nz/~ml/weka/arff.html
|
69
|
-
* http://
|
70
|
-
|
71
|
-
== Contact Information
|
72
|
-
|
73
|
-
Andy Payne
|
74
|
-
Website: http://andy-payne.com/
|
75
|
-
Email: apayne .at. gmail.com
|
76
|
-
Twitter: http://twitter.com/andypayne
|
77
|
-
RARFF website: http://rubyforge.org/projects/rarff/
|
64
|
+
* http://sourceforge.net/projects/weka/files/
|
78
65
|
|
79
66
|
|
80
67
|
|
data/lib/rarff.rb
CHANGED
@@ -8,14 +8,14 @@
|
|
8
8
|
# Custom scan that returns a boolean indicating whether the regex matched.
|
9
9
|
# TODO: Is there a way to avoid doing this?
|
10
10
|
class String
  # Scans the receiver with +re+ (see String#scan) and reports whether the
  # pattern matched at least once.
  #
  # When a block is given it is called once per match with whatever
  # String#scan yields for that match (the array of capture groups when the
  # pattern has groups, the matched text otherwise).
  #
  # Returns true if there was at least one match, false otherwise.
  def my_scan(re)
    matched = false
    scan(re) do |captures|
      matched = true
      yield captures if block_given?
    end
    matched
  end
end
|
20
20
|
|
21
21
|
################################################################################
|
@@ -24,240 +24,233 @@ module Enumerable
|
|
24
24
|
# This map_with_index hack allows access to the index of each item as the map
|
25
25
|
# iterates.
|
26
26
|
# TODO: Is there a better way?
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
27
|
+
# Like #map, but the block also receives the zero-based position of each
# element. Returns the array of block results.
def map_with_index
  position = 0
  map do |element|
    mapped = yield(element, position)
    position += 1
    mapped
  end
end
|
35
35
|
end
|
36
36
|
|
37
37
|
################################################################################
|
38
38
|
|
39
39
|
module Rarff
|
40
40
|
|
41
|
-
# Line-level markers of the ARFF format. All constants are frozen so they
# cannot be mutated by accident through a shared reference.
COMMENT_MARKER = '%'.freeze
RELATION_MARKER = '@RELATION'.freeze
ATTRIBUTE_MARKER = '@ATTRIBUTE'.freeze
DATA_MARKER = '@DATA'.freeze

# Delimiters of a sparse ARFF instance. The ESC_ variants carry a leading
# backslash so they can be interpolated into a regular expression.
SPARSE_ARFF_BEGIN = '{'.freeze
ESC_SPARSE_ARFF_BEGIN = ('\\' + SPARSE_ARFF_BEGIN).freeze
SPARSE_ARFF_END = '}'.freeze
ESC_SPARSE_ARFF_END = ('\\' + SPARSE_ARFF_END).freeze

# Attribute type names.
ATTRIBUTE_NUMERIC = 'NUMERIC'.freeze
ATTRIBUTE_REAL = 'REAL'.freeze
ATTRIBUTE_INTEGER = 'INTEGER'.freeze
ATTRIBUTE_STRING = 'STRING'.freeze
ATTRIBUTE_DATE = 'DATE'.freeze
|
56
56
|
|
57
57
|
|
58
58
|
################################################################################
|
59
59
|
|
60
|
-
# A single ARFF attribute: a name plus a type.
#
# The type is either a String (e.g. "NUMERIC", "STRING", a DATE spec) or,
# for nominal attributes, an Array of the allowed values.
class Attribute
  attr_accessor :name
  # The writer is defined explicitly below so that nominal detection runs
  # on every assignment.
  attr_reader :type

  # name - attribute name.
  # type - type String; a nominal spec such as "{a, b, c}" is converted to
  #        the Array ["a", "b", "c"].
  def initialize(name='', type='')
    @name = name
    @type_is_nominal = false
    @type = type
    check_nominal
  end

  # Assigns a new type and re-evaluates whether it is nominal.
  def type=(type)
    @type = type
    check_nominal
  end

  # Convert string representation of nominal type to array, if necessary,
  # and keep the nominal flag in sync with the current type. The flag is
  # cleared again when a nominal attribute is reassigned a plain type;
  # otherwise to_arff would call join on a String.
  # TODO: This might falsely trigger on wacky date formats.
  def check_nominal
    if @type.is_a?(Array)
      # Already a list of nominal values.
      @type_is_nominal = true
    elsif @type.is_a?(String) && @type =~ /^\s*\{.*(\,.*)+\}\s*$/
      @type_is_nominal = true
      # Example format: "{nom1,nom2, nom3, nom4,nom5 } "
      # Split on '{' ',' or '}'
      @type = @type.gsub(/^\s*\{\s*/, '').gsub(/\s*\}\s*$/, '').split(/\s*\,\s*/)
    else
      @type_is_nominal = false
    end
  end

  # Appends one value to a nominal attribute. The first call on a
  # non-nominal attribute turns it into a nominal one with an empty value
  # list; later calls accumulate.
  #
  # Returns the Array of nominal values.
  def add_nominal_value(str)
    unless @type_is_nominal
      @type = []
      # Without this, every call would start over with an empty list.
      @type_is_nominal = true
    end

    @type << str
  end

  # Renders the "@ATTRIBUTE name type" declaration line. Nominal values are
  # wrapped in braces, as the ARFF format requires.
  def to_arff
    rendered_type = @type_is_nominal ? "{#{@type.join(',')}}" : @type
    "#{ATTRIBUTE_MARKER} #{@name} #{rendered_type}"
  end

  # Same as to_arff.
  def to_s
    to_arff
  end
end
|
114
114
|
|
115
|
+
# A comment line captured by Relation#parse: +text+ is the comment's text and
# +row+ is the 1-based number of the input line it was read from.
Comment = Struct.new(:text,:row)
|
115
116
|
|
117
|
+
# An ARFF relation: a name, its attribute declarations, the data rows
# (instances) and any comment lines encountered while parsing.
class Relation
  attr_accessor :name, :attributes, :comments
  # The writer is defined explicitly below so that assigning instances also
  # coerces values and (re)builds the attribute list.
  attr_reader :instances

  # name - relation name (String).
  def initialize(name='')
    @name = name
    @attributes = []
    @instances = []
    @comments = []
  end

  # Parses ARFF text and appends what it finds to this relation.
  #
  # str - the ARFF document as one String; lines are separated by "\n".
  #       Surrounding whitespace on each line (including a trailing "\r")
  #       is ignored.
  #
  # Comment lines are stored in #comments with their 1-based line number,
  # "@RELATION" sets #name, "@ATTRIBUTE" lines append to #attributes, and
  # every non-blank line after "@DATA" becomes a row in #instances (both
  # dense and sparse "{index value, ...}" rows are accepted).
  def parse(str)
    # Built once per call rather than once per line.
    comment_re   = /^\s*#{COMMENT_MARKER}/
    relation_re  = /^\s*#{RELATION_MARKER}\s*(.*)\s*$/i
    attribute_re = /^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)\s*$/i
    data_re      = /^\s*#{DATA_MARKER}/i
    sparse_re    = /^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/

    in_data_section = false

    # TODO: Doesn't handle commas in quoted attributes.
    str.split("\n").each_with_index do |line, idx|
      stripped = line.strip
      next if stripped.empty?

      if stripped =~ comment_re
        # Drop the marker together with any indentation in front of it.
        @comments << Comment.new(line.sub(comment_re, ''), idx + 1)
        next
      end

      if (m = relation_re.match(stripped))
        @name = m[1]
        next
      end

      if (m = attribute_re.match(stripped))
        @attributes.push(Attribute.new(m[1], m[2]))
        next
      end

      if stripped =~ data_re
        in_data_section = true
        next
      end

      next unless in_data_section ## Below is data section handling

      row = if (m = sparse_re.match(stripped))
              expand_sparse(m[1])
            else
              stripped.split(/,\s*/).map do |field|
                # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
                field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
              end
            end

      @instances << row
      # Only the new row needs coercing; re-scanning every earlier row on
      # each line would make parsing quadratic in the number of rows.
      coerce_row(row, @instances.size == 1)
    end
  end

  # Replaces all instances, converts numeric-looking strings to Floats and
  # derives attribute types from the first row.
  def instances=(instances)
    @instances = instances
    create_attributes
  end

  # Walks every instance, converting numeric-looking String values to
  # Floats in place. The first row additionally determines each column's
  # attribute type (NUMERIC or STRING).
  def create_attributes
    @instances.each_with_index do |row, i|
      coerce_row(row, i.zero?)
    end
  end

  # Sets the type of attribute +j+, creating an attribute named "Attr<j>"
  # when none exists at that position yet.
  #
  # NOTE(review): this overwrites a type declared via @ATTRIBUTE (e.g. DATE
  # or a nominal list) with the inferred one - confirm that is intended.
  def assign_or_build_attr(j, attr_type)
    if @attributes[j].is_a?(Attribute)
      @attributes[j].type = attr_type
    else
      @attributes[j] = Attribute.new("Attr#{j}", attr_type)
    end
  end

  # Expands the body of a sparse ARFF row ("index value, index value, ...")
  # into a dense Array with one slot per known attribute. Slots that are
  # not mentioned are 0.
  def expand_sparse(str)
    arr = Array.new(@attributes.size, 0)
    str.gsub(/^\s*\{(.*)\}\s*$/, "\\1").split(/\s*\,\s*/).each do |pr|
      # Tolerate padding around the pair and several spaces between index
      # and value; keep a value that itself contains spaces in one piece.
      index, value = pr.strip.split(/\s+/, 2)
      next if index.nil?
      # Same outer-quote handling as for dense rows.
      value = value.gsub(/^\'(.*)\'$/, "\\1") if value
      arr[index.to_i] = value
    end
    arr
  end

  # Renders the relation as ARFF text: the @RELATION line, one @ATTRIBUTE
  # line per attribute, @DATA, then one comma-separated line per instance.
  def to_arff
    "#{RELATION_MARKER} #{@name}\n" +
      @attributes.map { |attr| attr.to_arff }.join("\n") +
      "\n" +
      "#{DATA_MARKER}\n" +
      @instances.map { |inst| format_instance(inst) }.join("\n")
  end

  # Same as to_arff.
  def to_s
    to_arff
  end

  private

  # Converts numeric-looking Strings of +row+ to Floats in place and, when
  # +assign_types+ is true, records the inferred type of every column.
  def coerce_row(row, assign_types)
    row.each_index do |j|
      value = row[j]
      if !value.is_a?(String)
        inferred = ATTRIBUTE_NUMERIC
      elsif value =~ /^\-?\d+\.?\d*$/
        # TODO: Should I have a separate to_i conversion, or is to_f sufficient?
        row[j] = value.to_f
        inferred = ATTRIBUTE_NUMERIC
      else
        inferred = ATTRIBUTE_STRING
      end
      assign_or_build_attr(j, inferred) if assign_types
    end
  end

  # Formats one instance as a comma-separated data line.
  def format_instance(inst)
    cells = inst.each_with_index.map do |col, i|
      attr = @attributes[i]
      # A row may be longer than the attribute list, and a nominal type is
      # an Array, so only String types are matched against the patterns.
      type = attr ? attr.type : nil
      if type.is_a?(String) && type =~ /^#{ATTRIBUTE_STRING}$/i
        # Quote strings with spaces.
        # TODO: Doesn't handle cases in which strings already contain
        # quotes or are already quoted.
        col.to_s =~ /\s/ ? "'#{col}'" : col
      elsif type.is_a?(String) && type =~ /^#{ATTRIBUTE_DATE}/i ## Hack comparison. Ugh.
        "\"#{col}\""
      else
        col
      end
    end
    cells.join(', ')
  end
end
|
233
|
+
|
234
|
+
|
235
|
+
end # module Rarff
|
236
|
+
|
237
|
+
################################################################################
|
238
|
+
|
239
|
+
# Command-line entry point: parse the ARFF file named by the first argument
# and print it back out as ARFF.
if $0 == __FILE__

  # Nothing to do without an input file.
  exit unless ARGV[0]
  in_file = ARGV[0]

  # File.read closes the file once its contents have been read.
  contents = File.read(in_file)

  rel = Rarff::Relation.new
  rel.parse(contents)

  puts '=' * 80
  puts '=' * 80
  puts "ARFF:"
  puts rel

end
|
263
256
|
|