davidrichards-data_frame 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +16 -0
- data/VERSION.yml +1 -1
- data/bin/plain_frame +22 -0
- data/lib/data_frame.rb +2 -1
- data/lib/data_frame/arff.rb +43 -36
- data/lib/data_frame/core/column_management.rb +102 -0
- data/lib/data_frame/core/filter.rb +48 -0
- data/lib/data_frame/core/import.rb +112 -0
- data/lib/data_frame/core/pre_process.rb +61 -0
- data/lib/data_frame/core/saving.rb +29 -0
- data/lib/data_frame/core/training.rb +36 -0
- data/lib/data_frame/data_frame.rb +37 -241
- data/lib/data_frame/id3.rb +28 -0
- data/lib/data_frame/kmeans.rb +10 -0
- data/lib/data_frame/labels_from_uci.rb +48 -0
- data/lib/data_frame/mlp.rb +18 -0
- data/lib/data_frame/sbn.rb +18 -0
- data/lib/data_frame/transposable_array.rb +1 -1
- data/lib/ext/array.rb +11 -0
- data/spec/data_frame/arff_spec.rb +1 -0
- data/spec/data_frame/core/column_management_spec.rb +97 -0
- data/spec/data_frame/core/filter_spec.rb +88 -0
- data/spec/data_frame/core/import_spec.rb +41 -0
- data/spec/data_frame/core/pre_process_spec.rb +71 -0
- data/spec/data_frame/core/saving_spec.rb +61 -0
- data/spec/data_frame/core/training_spec.rb +51 -0
- data/spec/data_frame/data_frame_spec.rb +10 -226
- data/spec/data_frame/id3_spec.rb +22 -0
- data/spec/ext/array_spec.rb +13 -0
- data/spec/fixtures/discrete_testing.csv +4 -0
- data/spec/fixtures/discrete_training.csv +21 -0
- metadata +33 -6
@@ -0,0 +1,29 @@
|
|
1
|
+
module DF #:nodoc:
  module Saving #:nodoc:

    # Writes the data frame to +filename+ as CSV, overwriting any existing file.
    #
    # Recognised options (all optional):
    #   :include_header      - passed to to_csv; defaults to true.
    #   :only / :subset      - a column name or list of names; the frame is
    #                          narrowed with subset_from_columns before writing.
    #                          Both keys are honoured, :only first.
    #   :filter / :filter_by_category
    #                        - a hash handed to filter_by_category before
    #                          writing. Both keys are honoured, :filter first.
    #
    # Examples:
    #   df.save('/tmp/some_filename.csv')
    #   df.save('/tmp/some_filename.csv', :include_header => false)
    #   df.save('/tmp/some_filename.csv', :only => [:list, :of, :columns])
    #   df.save('/tmp/some_filename.csv',
    #     :filter => {:column_name => :category_value,
    #                 :another_column_name => (range..values)})
    #
    # Returns whatever the underlying IO#write returns.
    def save(filename, opts={})
      frame = self

      # Column narrowing first, then row filtering; each step yields the frame
      # used by the next one.
      [:only, :subset].each do |key|
        frame = frame.subset_from_columns(*Array(opts[key])) if opts[key]
      end
      [:filter, :filter_by_category].each do |key|
        frame = frame.filter_by_category(opts[key]) if opts[key]
      end

      with_header = opts.fetch(:include_header, true)
      File.open(filename, "w") do |file|
        file.write(frame.to_csv(with_header))
      end
    end

  end
end

class DataFrame
  include DF::Saving
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Training #:nodoc:

  # Returns a random selection of rows from +items+ to train on.
  #
  # The selection is cached in @training_set: later calls return the same
  # array, whatever options they pass, until :reset is given.
  #
  # Options:
  #   :reset      - when truthy, discard the cached selection and draw again.
  #   :n          - number of rows to pick (Integer). Values above the number
  #                 of rows are clamped to it; negative values are treated as 0.
  #   :proportion - used only when :n is absent; the share of rows to pick,
  #                 truncated to a whole number. Defaults to 0.8.
  #
  # Rows are drawn by position, without replacement, so exactly n rows come
  # back even when several rows hold equal values. (The previous
  # draw-then-uniq! loop never finished when n exceeded the number of
  # distinct rows.)
  def training_set(opts={})
    @training_set = nil if opts[:reset]
    return @training_set if @training_set

    items_size = self.items.size
    proportion = opts.fetch(:proportion, 0.8)
    n = opts[:n] || (items_size * proportion).to_i
    n = items_size if n > items_size
    n = 0 if n < 0

    # Shuffle the row positions and keep the first n: a single pass that
    # always terminates and never picks the same position twice.
    positions = (0...items_size).to_a.shuffle.first(n)
    @training_set = positions.map { |i| self.items[i] }
  end

  protected

    # Returns one row chosen at random from the first +n+ rows of items.
    # No longer used by training_set; kept for callers that rely on it.
    def random_next(n)
      self.items[rand(n)]
    end

end

class DataFrame
  include Training
end
|
@@ -4,70 +4,6 @@
|
|
4
4
|
# is tainted.
|
5
5
|
class DataFrame
|
6
6
|
|
7
|
-
class << self
|
8
|
-
|
9
|
-
# This is the neatest part of this neat gem.
|
10
|
-
# DataFrame.from_csv can be called in a lot of ways:
|
11
|
-
# DataFrame.from_csv(csv_contents)
|
12
|
-
# DataFrame.from_csv(filename)
|
13
|
-
# DataFrame.from_csv(url)
|
14
|
-
# If you need to define converters for FasterCSV, do it before calling
|
15
|
-
# this method:
|
16
|
-
# FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
|
17
|
-
# DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
|
18
|
-
# This returns bar where 'foo' was found and 'foo' everywhere else.
|
19
|
-
def from_csv(obj, opts={})
|
20
|
-
labels, table = infer_csv_contents(obj, opts)
|
21
|
-
name = infer_name_from_contents(obj, opts)
|
22
|
-
return nil unless labels and table
|
23
|
-
df = new(*labels)
|
24
|
-
df.import(table)
|
25
|
-
df.name = name
|
26
|
-
df
|
27
|
-
end
|
28
|
-
|
29
|
-
protected
|
30
|
-
|
31
|
-
# Only works for names sources, urls and files
|
32
|
-
def infer_name_from_contents(obj, opts={})
|
33
|
-
begin
|
34
|
-
File.split(obj).last.split('.')[0..-2].join('.').titleize
|
35
|
-
rescue
|
36
|
-
nil
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
def infer_csv_contents(obj, opts={})
|
41
|
-
contents = File.read(obj) if File.exist?(obj)
|
42
|
-
begin
|
43
|
-
open(obj) {|f| contents = f.read} unless contents
|
44
|
-
rescue
|
45
|
-
nil
|
46
|
-
end
|
47
|
-
contents ||= obj if obj.is_a?(String)
|
48
|
-
return nil unless contents
|
49
|
-
table = FCSV.parse(contents, default_csv_opts.merge(opts))
|
50
|
-
labels = table.shift
|
51
|
-
while table.last.empty?
|
52
|
-
table.pop
|
53
|
-
end
|
54
|
-
[labels, table]
|
55
|
-
end
|
56
|
-
|
57
|
-
def default_csv_opts; {:converters => :all}; end
|
58
|
-
end
|
59
|
-
|
60
|
-
# Include the methods from arff.rb
|
61
|
-
include ARFF
|
62
|
-
|
63
|
-
# Loads a batch of rows. Expects an array of arrays, else you don't
|
64
|
-
# know what you have.
|
65
|
-
def import(rows)
|
66
|
-
rows.each do |row|
|
67
|
-
self.add_item(row)
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
7
|
def inspect
|
72
8
|
"DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
|
73
9
|
end
|
@@ -83,15 +19,11 @@ class DataFrame
|
|
83
19
|
attr_accessor :name
|
84
20
|
|
85
21
|
# Builds an empty frame with the given column labels.
#
# Labels may be passed as separate arguments or as a single array; a lone
# array argument is unwrapped. Each label is normalised with
# to_underscore_sym, and the row store starts as an empty TransposableArray.
def initialize(*labels)
  if labels.size == 1 && labels.first.is_a?(Array)
    labels = labels.first
  end
  @labels = labels.map { |label| label.to_underscore_sym }
  @items = TransposableArray.new
end
|
89
26
|
|
90
|
-
def add_item(item)
|
91
|
-
self.items << item
|
92
|
-
end
|
93
|
-
alias :add :add_item
|
94
|
-
|
95
27
|
# The list of row labels. Created as an empty array on first access and
# kept in @row_labels afterwards.
def row_labels
  @row_labels = [] unless @row_labels
  @row_labels
end
|
@@ -101,15 +33,22 @@ class DataFrame
|
|
101
33
|
@row_labels = ary
|
102
34
|
end
|
103
35
|
|
36
|
+
# The rows as an array of arrays, an alias for items.
|
37
|
+
alias :rows :items
|
38
|
+
|
39
|
+
# Returns the row whose entry in row_labels equals +sym+, or nil when no
# row carries that label.
def render_row(sym)
  position = self.row_labels.index(sym)
  position ? @items[position] : nil
end
|
44
|
+
|
45
|
+
# Returns the values of the column named +sym+, top to bottom, or nil when
# there is no such column. The name is normalised with to_underscore_sym
# before it is looked up in @labels.
def render_column(sym)
  position = @labels.index(sym.to_underscore_sym)
  position && @items.transpose[position]
end
|
109
51
|
|
110
|
-
# The rows as an array of arrays, an alias for items.
|
111
|
-
alias :rows :items
|
112
|
-
|
113
52
|
# The columns as a Dictionary or Hash
|
114
53
|
# This is cached, call columns(true) to reset the cache.
|
115
54
|
def columns(reset=false)
|
@@ -128,12 +67,6 @@ class DataFrame
|
|
128
67
|
alias :to_hash :columns
|
129
68
|
alias :to_dictionary :columns
|
130
69
|
|
131
|
-
def render_row(sym)
|
132
|
-
i = self.row_labels.index(sym)
|
133
|
-
return nil unless i
|
134
|
-
@items[i]
|
135
|
-
end
|
136
|
-
|
137
70
|
def method_missing(sym, *args, &block)
|
138
71
|
if self.labels.include?(sym)
|
139
72
|
render_column(sym)
|
@@ -146,174 +79,37 @@ class DataFrame
|
|
146
79
|
end
|
147
80
|
end
|
148
81
|
|
149
|
-
|
150
|
-
labels.each do |label|
|
151
|
-
drop_one!(label)
|
152
|
-
end
|
153
|
-
self
|
154
|
-
end
|
155
|
-
|
156
|
-
def drop_one!(label)
|
157
|
-
i = self.labels.index(label)
|
158
|
-
return nil unless i
|
159
|
-
self.items.each do |item|
|
160
|
-
item.delete_at(i)
|
161
|
-
end
|
162
|
-
self.labels.delete_at(i)
|
163
|
-
self
|
164
|
-
end
|
165
|
-
protected :drop_one!
|
166
|
-
|
167
|
-
def replace!(column, values=nil, &block)
|
168
|
-
column = validate_column(column)
|
169
|
-
if not values
|
170
|
-
values = self.send(column)
|
171
|
-
values.map! {|e| block.call(e)}
|
172
|
-
end
|
173
|
-
replace_column(column, values)
|
174
|
-
self
|
175
|
-
end
|
176
|
-
|
177
|
-
def replace_column(column, values)
|
178
|
-
column = validate_column(column)
|
179
|
-
index = self.labels.index(column)
|
180
|
-
list = []
|
181
|
-
self.items.each_with_index do |item, i|
|
182
|
-
consolidated = item
|
183
|
-
consolidated[index] = values[i]
|
184
|
-
list << consolidated
|
185
|
-
end
|
186
|
-
@items = list.dup
|
187
|
-
end
|
188
|
-
protected :replace_column
|
82
|
+
protected
|
189
83
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
end
|
195
|
-
protected :validate_column
|
196
|
-
|
197
|
-
# Takes a block to evaluate on each row. The row can be converted into
|
198
|
-
# an OpenStruct or a Hash for easier filter methods. Note, don't try this
|
199
|
-
# with a hash or open struct unless you have facets available.
|
200
|
-
def filter!(as=Array, &block)
|
201
|
-
as = infer_class(as)
|
202
|
-
items = []
|
203
|
-
self.items.each do |row|
|
204
|
-
value = block.call(cast_row(row, as))
|
205
|
-
items << row if value
|
84
|
+
# Converts +column+ to a symbol and returns it, provided it names one of
# the frame's labels.
#
# Raises ArgumentError when the symbol is not among the labels.
#
# NOTE(review): conversion here is plain to_sym, while labels are stored via
# to_underscore_sym - confirm whether names needing normalisation should be
# accepted here as well.
def validate_column(column)
  name = column.to_sym
  return name if self.labels.include?(name)
  raise ArgumentError, "Must provide the name of an existing column. Provided #{name.inspect}, needed to provide one of #{self.labels.inspect}"
end
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
def filter(as=Array, &block)
|
212
|
-
new_data_frame = self.clone
|
213
|
-
new_data_frame.filter!(as, &block)
|
214
|
-
end
|
215
|
-
|
216
|
-
def infer_class(obj)
|
217
|
-
obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
|
218
|
-
obj = obj.classify.constantize if obj.is_a?(String)
|
219
|
-
obj
|
220
|
-
end
|
221
|
-
protected :infer_class
|
222
|
-
|
223
|
-
def cast_row(row, as)
|
224
|
-
if as == Hash
|
225
|
-
obj = {}
|
226
|
-
self.labels.each_with_index do |label, i|
|
227
|
-
obj[label] = row[i]
|
228
|
-
end
|
229
|
-
obj
|
230
|
-
elsif as == OpenStruct
|
231
|
-
obj = OpenStruct.new
|
232
|
-
self.labels.each_with_index do |label, i|
|
233
|
-
obj.table[label] = row[i]
|
234
|
-
end
|
89
|
+
|
90
|
+
# Resolves a class given by name. A Symbol or String is turned into a
# constant with classify.constantize; any other object is returned as-is.
def infer_class(obj)
  case obj
  when Symbol, String
    obj.to_s.classify.constantize
  else
    obj
  end
end
|
241
|
-
end
|
242
|
-
protected :cast_row
|
243
|
-
|
244
|
-
# Creates a new data frame, only with the specified columns.
|
245
|
-
def subset_from_columns(*cols)
|
246
|
-
new_labels = self.labels.inject([]) do |list, label|
|
247
|
-
list << label if cols.include?(label)
|
248
|
-
list
|
249
|
-
end
|
250
|
-
new_data_frame = DataFrame.new(*self.labels)
|
251
|
-
new_data_frame.import(self.items)
|
252
|
-
self.labels.each do |label|
|
253
|
-
new_data_frame.drop!(label) unless new_labels.include?(label)
|
254
|
-
end
|
255
|
-
new_data_frame
|
256
|
-
end
|
257
|
-
|
258
|
-
# A weird name. This creates a column for every category in a column
|
259
|
-
# and marks each row by its value
|
260
|
-
def j_binary_ize!(*columns)
|
261
|
-
# Allows to mix a hash with the columns.
|
262
|
-
options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
|
263
|
-
columns.delete_if {|e| e.is_a?(Hash)}
|
264
95
|
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
if options[:allow_overlap]
|
271
|
-
category_map = values.inject([]) do |list, e|
|
272
|
-
list << values.all_categories(e)
|
273
|
-
end
|
274
|
-
self.append!(full_name, category_map.map{|e| e.include?(category)})
|
275
|
-
else
|
276
|
-
self.append!(full_name, values.category_map.map{|e| e == category})
|
96
|
+
# Converts one row into the representation requested by +as+.
#
#   Array      - the row itself is returned, untouched.
#   Hash       - a hash mapping each label to the value at the same position.
#   OpenStruct - an OpenStruct with one attribute per label.
#   otherwise  - as.new(*row), i.e. the row values become constructor
#                arguments.
#
# The OpenStruct is built through its constructor instead of writing into
# OpenStruct#table: that reader is protected in the standard library, so the
# direct write raised NoMethodError unless a third-party patch exposed it.
def cast_row(row, as)
  if as == Array
    row
  elsif as == Hash || as == OpenStruct
    pairs = {}
    self.labels.each_with_index do |label, i|
      pairs[label] = row[i]
    end
    as == Hash ? pairs : OpenStruct.new(pairs)
  else
    as.new(*row)
  end
end
|
280
|
-
end
|
281
|
-
|
282
|
-
# Adds a unique column to the table
|
283
|
-
def append!(column_name, value=nil)
|
284
|
-
raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
|
285
|
-
self.labels << column_name.to_underscore_sym
|
286
|
-
if value.is_a?(Array)
|
287
|
-
self.items.each_with_index do |item, i|
|
288
|
-
item << value[i]
|
289
|
-
end
|
290
|
-
else
|
291
|
-
self.items.each do |item|
|
292
|
-
item << value
|
293
|
-
end
|
294
|
-
end
|
295
|
-
# Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
|
296
|
-
self.items.taint
|
297
|
-
end
|
298
|
-
|
299
|
-
def filter_by_category(hash)
|
300
|
-
new_data_frame = self.dup
|
301
|
-
hash.each do |key, value|
|
302
|
-
key = key.to_underscore_sym
|
303
|
-
next unless self.labels.include?(key)
|
304
|
-
value = [value] unless value.is_a?(Array) or value.is_a?(Range)
|
305
|
-
new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
|
306
|
-
end
|
307
|
-
new_data_frame
|
308
|
-
end
|
309
|
-
|
310
|
-
def filter_by_category!(hash)
|
311
|
-
hash.each do |key, value|
|
312
|
-
key = key.to_underscore_sym
|
313
|
-
next unless self.labels.include?(key)
|
314
|
-
value = [value] unless value.is_a?(Array) or value.is_a?(Range)
|
315
|
-
self.filter!(:hash) {|row| value.include?(row[key])}
|
316
|
-
end
|
317
|
-
end
|
318
|
-
|
319
115
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module DF #:nodoc:
  # Builds ID3 decision trees with Ilya Grigorik's decisiontree gem.
  module ID3
    # The gem is optional at load time. A missing gem surfaces as LoadError,
    # which a bare `rescue` does not catch, so the former
    # `sudo gem install` fallback could never run; it has been dropped rather
    # than revived, since shelling out to sudo while a library loads is
    # unsafe. The error is reported by create_id3 instead.
    begin
      gem 'decisiontree'
      require 'decisiontree'
    rescue LoadError
      # Deliberately ignored here; see create_id3.
    end

    # Creates a DecisionTree::ID3Tree from this frame's labels and
    # training_data, stores it, and returns it.
    #
    # dependent_column - currently unused.
    #                    TODO: move the dependent column to the last position
    #                    (and any other pre-processing) before building.
    # opts[:default]   - default value handed to the tree; defaults to 1.
    #
    # Raises LoadError when the decisiontree gem is not available.
    #
    # NOTE(review): relies on a training_data method being provided by the
    # including class - confirm it exists (the Training module exposes
    # training_set).
    def create_id3(dependent_column, opts={})
      unless defined?(::DecisionTree::ID3Tree)
        raise LoadError, "ID3 support needs the decisiontree gem (gem install decisiontree)"
      end
      default = opts.fetch(:default, 1)
      @id3 = ::DecisionTree::ID3Tree.new(self.labels, self.training_data, default, :discrete)
    end

    # The tree built by the last create_id3 call, or nil if none was built.
    def id3
      @id3
    end
  end
end

class DataFrame
  include DF::ID3
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# The University of California - Irvine has a great set of machine
|
2
|
+
# learning sample data sets. Their data description pages have field
|
3
|
+
# label descriptors. This class extracts them and returns a DataFrame
|
4
|
+
# with the labels of a data set.
|
5
|
+
|
6
|
+
# Turns out, this isn't very useful. So...oh well.
|
7
|
+
# By the way, the code I'm talking about is found here: http://archive.ics.uci.edu/ml/
|
8
|
+
# And to use this class:
|
9
|
+
# require 'lib/data_frame/labels_from_uci'
|
10
|
+
# df = LabelsFromUCI.data_frame 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
|
11
|
+
# df.import('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data')
|
12
|
+
|
13
|
+
class LabelsFromUCI
  require 'open-uri'

  class << self
    # Returns the attribute names found in the description at +url+.
    def process(url)
      new(url).labels
    end

    # Returns a DataFrame whose labels are the attribute names found in the
    # description at +url+.
    def data_frame(url)
      DataFrame.new(new(url).labels)
    end
  end

  # url      - the location given to the constructor.
  # contents - the raw text read from that location.
  # labels   - the attribute names extracted from the text, in file order.
  attr_reader :url, :contents, :labels

  # Reads the description at +url+ (a URL or a local file path) and extracts
  # its labels.
  def initialize(url)
    @url = url
    @contents = read_source(url)
    process_labels
  end

  protected

    # Reads +source+ in full. Anything of the form scheme://... is fetched
    # through open-uri; everything else is opened as a local file.
    #
    # Kernel#open is avoided for arbitrary input: it no longer fetches URLs
    # on current Rubies and starts a subprocess for strings beginning
    # with "|".
    def read_source(source)
      location = source.to_s
      if location =~ %r{\A[a-z][a-z0-9+.\-]*://}i
        if URI.respond_to?(:open)
          URI.open(location) { |io| io.read }
        else
          # Older Rubies: open-uri hooks Kernel#open instead.
          open(location) { |io| io.read }
        end
      else
        File.open(location, 'r') { |io| io.read }
      end
    end

    # Collects the first attribute name on each line of the contents that
    # matches label_re.
    def process_labels
      @labels = []
      @contents.each_line do |line|
        label = line[label_re, 1]
        @labels << label if label
      end
      @labels
    end

    # Pattern for an attribute declaration; the first group captures the name.
    def label_re
      /@attribute (\w+)/
    end
end
|