data_frame 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +122 -0
- data/VERSION.yml +4 -0
- data/bin/plain_frame +22 -0
- data/lib/data_frame.rb +26 -0
- data/lib/data_frame/arff.rb +52 -0
- data/lib/data_frame/callback_array.rb +152 -0
- data/lib/data_frame/core/column_management.rb +147 -0
- data/lib/data_frame/core/filter.rb +48 -0
- data/lib/data_frame/core/import.rb +113 -0
- data/lib/data_frame/core/pre_process.rb +69 -0
- data/lib/data_frame/core/saving.rb +29 -0
- data/lib/data_frame/core/training.rb +46 -0
- data/lib/data_frame/data_frame.rb +115 -0
- data/lib/data_frame/id3.rb +28 -0
- data/lib/data_frame/kmeans.rb +10 -0
- data/lib/data_frame/labels_from_uci.rb +48 -0
- data/lib/data_frame/mlp.rb +18 -0
- data/lib/data_frame/model.rb +22 -0
- data/lib/data_frame/parameter_capture.rb +50 -0
- data/lib/data_frame/sbn.rb +18 -0
- data/lib/data_frame/transposable_array.rb +23 -0
- data/lib/ext/array.rb +11 -0
- data/lib/ext/open_struct.rb +5 -0
- data/lib/ext/string.rb +5 -0
- data/lib/ext/symbol.rb +5 -0
- data/spec/data_frame/arff_spec.rb +48 -0
- data/spec/data_frame/callback_array_spec.rb +148 -0
- data/spec/data_frame/core/column_management_spec.rb +128 -0
- data/spec/data_frame/core/filter_spec.rb +88 -0
- data/spec/data_frame/core/import_spec.rb +41 -0
- data/spec/data_frame/core/pre_process_spec.rb +103 -0
- data/spec/data_frame/core/saving_spec.rb +61 -0
- data/spec/data_frame/core/training_spec.rb +72 -0
- data/spec/data_frame/data_frame_spec.rb +141 -0
- data/spec/data_frame/id3_spec.rb +22 -0
- data/spec/data_frame/model_spec.rb +36 -0
- data/spec/data_frame/parameter_capture_spec.rb +32 -0
- data/spec/data_frame/transposable_array_spec.rb +138 -0
- data/spec/data_frame_spec.rb +29 -0
- data/spec/ext/array_spec.rb +13 -0
- data/spec/fixtures/basic.csv +3 -0
- data/spec/fixtures/discrete_testing.csv +4 -0
- data/spec/fixtures/discrete_training.csv +21 -0
- data/spec/spec_helper.rb +8 -0
- metadata +128 -0
@@ -0,0 +1,48 @@
|
|
1
|
+
module DF #:nodoc:
|
2
|
+
module Filter #:nodoc:
|
3
|
+
|
4
|
+
# Takes a block to evaluate on each row. The row can be converted into
|
5
|
+
# an OpenStruct or a Hash for easier filter methods. Note, don't try this
|
6
|
+
# with a hash or open struct unless you have facets available.
|
7
|
+
# Destructively keeps only the rows for which the block returns a truthy
# value.
#
# as    - how each row is handed to the block; resolved with infer_class
#         and applied with cast_row (defaults to Array).
# block - predicate called once per row with the cast row.
#
# Replaces @items with a new Array of the surviving (uncast) rows and
# returns self.
#
# NOTE(review): @items becomes a plain Array here and any cached @columns
# is left untouched -- confirm callers do not depend on either.
def filter!(as=Array, &block)
  row_class = infer_class(as)
  kept = []
  self.items.each do |row|
    kept << row if block.call(cast_row(row, row_class))
  end
  @items = kept.dup
  self
end
|
17
|
+
|
18
|
+
# Non-destructive counterpart of filter!: applies the same filter to a
# clone of this frame and returns whatever filter! returns for that clone.
def filter(as=Array, &block)
  self.clone.filter!(as, &block)
end
|
22
|
+
|
23
|
+
# Returns a filtered duplicate of this frame.
#
# hash - maps column names to the accepted value(s). A single value is
#        wrapped in an Array; Arrays and Ranges are used as given and
#        tested with include?.
#
# Keys that are not among this frame's labels (after to_underscore_sym)
# are ignored. Rows are presented to the filter cast as :hash.
def filter_by_category(hash)
  filtered = self.dup
  hash.each do |key, value|
    label = key.to_underscore_sym
    next unless self.labels.include?(label)
    accepted = (value.is_a?(Array) || value.is_a?(Range)) ? value : [value]
    filtered.filter!(:hash) { |row| accepted.include?(row[label]) }
  end
  filtered
end
|
33
|
+
|
34
|
+
# Destructive version of filter_by_category: narrows this frame's own rows.
#
# hash - maps column names to the accepted value(s). A single value is
#        wrapped in an Array; Arrays and Ranges are used as given and
#        tested with include?.
#
# Keys that are not among this frame's labels (after to_underscore_sym)
# are ignored.
#
# Returns self, like filter!, so calls can be chained. (The method used to
# return the argument hash, which was just the value of Hash#each.)
def filter_by_category!(hash)
  hash.each do |key, value|
    key = key.to_underscore_sym
    next unless self.labels.include?(key)
    value = [value] unless value.is_a?(Array) || value.is_a?(Range)
    self.filter!(:hash) { |row| value.include?(row[key]) }
  end
  self
end
|
42
|
+
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
# Reopen DataFrame to mix in the DF::Filter methods.
class DataFrame
  include DF::Filter
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module DF #:nodoc:
|
2
|
+
module Import #:nodoc:
|
3
|
+
|
4
|
+
module InferCSV #:nodoc:
|
5
|
+
|
6
|
+
protected
|
7
|
+
def default_csv_opts; {:converters => :all}; end
|
8
|
+
|
9
|
+
def infer_csv_contents(obj, opts={})
|
10
|
+
contents = File.read(obj) if File.exist?(obj)
|
11
|
+
begin
|
12
|
+
open(obj) {|f| contents = f.read} unless contents
|
13
|
+
rescue
|
14
|
+
nil
|
15
|
+
end
|
16
|
+
contents ||= obj if obj.is_a?(String)
|
17
|
+
return nil unless contents
|
18
|
+
table = FCSV.parse(contents, default_csv_opts.merge(opts))
|
19
|
+
labels = opts.fetch(:headers, true) ? table.shift : []
|
20
|
+
while table.last.empty?
|
21
|
+
table.pop
|
22
|
+
end
|
23
|
+
[labels, table]
|
24
|
+
end
|
25
|
+
|
26
|
+
end # InferCSV
|
27
|
+
|
28
|
+
module ClassMethods #:nodoc:
|
29
|
+
|
30
|
+
include InferCSV
|
31
|
+
|
32
|
+
# This is the neatest part of this neat gem.
|
33
|
+
# DataFrame.from_csv can be called in a lot of ways:
|
34
|
+
# DataFrame.from_csv(csv_contents)
|
35
|
+
# DataFrame.from_csv(filename)
|
36
|
+
# DataFrame.from_csv(url)
|
37
|
+
# If you need to define converters for FasterCSV, do it before calling
|
38
|
+
# this method:
|
39
|
+
# FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
|
40
|
+
# DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
|
41
|
+
# This returns bar where 'foo' was found and 'foo' everywhere else.
|
42
|
+
# Builds a new frame from CSV contents, a filename or a URL.
#
# obj  - whatever infer_csv_contents accepts.
# opts - passed through to infer_csv_contents and infer_name_from_contents.
#
# Returns the populated frame (labels from the CSV, rows imported, name
# set from infer_name_from_contents), or nil when the CSV could not be
# read.
def from_csv(obj, opts={})
  parsed = infer_csv_contents(obj, opts)
  frame_name = infer_name_from_contents(obj, opts)
  labels, table = parsed
  return nil unless labels && table
  new(*labels).tap do |frame|
    frame.import(table)
    frame.name = frame_name
  end
end
|
51
|
+
|
52
|
+
protected
|
53
|
+
|
54
|
+
# Only works for named sources: URLs and files.
|
55
|
+
# Derives a display name from the last path component of +obj+: the text
# after the final "." is dropped and the rest is titleized.
#
# Returns nil when anything in that chain raises (for example when +obj+
# is not path-like). +opts+ is accepted but not used.
def infer_name_from_contents(obj, opts={})
  _dir, base_name = File.split(obj)
  base_name.split('.')[0...-1].join('.').titleize
rescue
  nil
end
|
62
|
+
|
63
|
+
end # Class Methods
|
64
|
+
|
65
|
+
module InstanceMethods #:nodoc:
|
66
|
+
|
67
|
+
include InferCSV
|
68
|
+
|
69
|
+
# Appends one row to the frame's items and returns the result of that <<
# call. Also available as #add.
def add_item(item)
  items << item
end
alias :add :add_item
|
73
|
+
|
74
|
+
# Loads a batch of rows. Expects an array of arrays, else you don't
|
75
|
+
# know what you have.
|
76
|
+
# Loads a batch of rows.
#
# rows - an Array (handed to import_array) or a String (parsed with
#        infer_csv_contents using :headers => false, then imported).
#
# Returns true. Raises ArgumentError for any other kind of argument.
def import(rows)
  if rows.is_a?(Array)
    import_array(rows)
  elsif rows.is_a?(String)
    _labels, table = infer_csv_contents(rows, :headers => false)
    import(table)
  else
    raise ArgumentError, "Don't know how to import data from #{rows.class}"
  end
  true
end
|
88
|
+
|
89
|
+
protected
|
90
|
+
# Imports a table as an array of arrays.
|
91
|
+
# If the array is one-dimensional and there is more than one label, it
|
92
|
+
# imports only one row.
|
93
|
+
# Imports an Array of rows.
#
# When the frame has more than one label and +rows+ reports a single
# dimension, the whole array is added as one row; otherwise every element
# of +rows+ is added as its own row.
#
# Raises ArgumentError unless +rows+ is an Array.
def import_array(rows)
  raise ArgumentError, "Can only work with arrays" unless rows.is_a?(Array)
  return self.add_item(rows) if self.labels.size > 1 && rows.dimensions == 1
  rows.each { |row| self.add_item(row) }
end
|
104
|
+
|
105
|
+
end # Instance Methods
|
106
|
+
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
# Reopen DataFrame: instance-level import helpers are included, and the
# class-level helpers (DF::Import::ClassMethods) are added via extend.
class DataFrame
  include DF::Import::InstanceMethods
  extend DF::Import::ClassMethods
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module DF #:nodoc:
|
2
|
+
module PreProcess #:nodoc:
|
3
|
+
# A weird name. This creates a column for every category in a column
|
4
|
+
# and marks each row by its value
|
5
|
+
# Appends one boolean column per category of each named column.
#
# columns - column names, optionally mixed with option Hashes (all Hashes
#           are merged into one options Hash and removed from the list).
#
# Options:
#   :allow_overlap - when truthy, a row is flagged for every category
#                    reported by all_categories for its value; otherwise
#                    each row is flagged only for the entry category_map
#                    gives it.
#
# Each new column is named "<column>_<category>". Returns the list of
# column names that were processed.
def j_binary_ize!(*columns)
  option_hashes, names = columns.partition { |arg| arg.is_a?(Hash) }
  options = option_hashes.inject({}) { |merged, h| merged.merge(h) }

  names.each do |col|
    values = render_column(col.to_underscore_sym)
    values.categories.each do |category|
      new_label = "#{col}_#{category}".to_sym
      flags =
        if options[:allow_overlap]
          per_value = values.map { |e| values.all_categories(e) }
          per_value.map { |cats| cats.include?(category) }
        else
          values.category_map.map { |mapped| mapped == category }
        end
      self.append!(new_label, flags)
    end
  end
end
|
26
|
+
|
27
|
+
# Adds a column, numerical_column_name that shows the same data as a
|
28
|
+
# nominal value, but as a one-hot array of 0s and 1s (one slot per category).
|
29
|
+
# For every named column, appends a column called numerical_<column> whose
# entries are one-hot arrays: an Array of 0s with a single 1 at the index
# of the value's category. Categories are indexed in order of first
# appearance in the column.
#
# columns - names of the columns to encode.
#
# Returns the list of column names that were processed.
def numericize!(*columns)
  columns.each do |col|
    values = render_column(col.to_underscore_sym)

    # Categorise each value once and reuse the result for both the index
    # and the encoding. The unused category list, the reverse lookup and
    # the second categorisation pass of the earlier version are gone.
    value_categories = values.map { |v| values.category(v) }

    category_index = {}
    value_categories.uniq.each_with_index { |c, i| category_index[c] = i }

    new_values = value_categories.map do |c|
      encoded = Array.new(category_index.size, 0)
      encoded[category_index[c]] = 1
      encoded
    end

    new_name = "numerical #{col.to_s}".to_underscore_sym
    self.append!(new_name, new_values)
  end
end
|
55
|
+
|
56
|
+
# Replaces each named column with the category map derived from the stored
# range hashes. The range hashes are stored before the replacements and
# restored afterwards.
#
# cs - names of the columns to replace.
#
# Returns whatever restore_range_hashes returns.
def categorize!(*cs)
  store_range_hashes
  cs.each { |col| self.replace!(col, category_map_from_stored_range_hash(col)) }
  restore_range_hashes
end
|
63
|
+
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
# Reopen DataFrame to mix in the DF::PreProcess methods.
class DataFrame
  include DF::PreProcess
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module DF #:nodoc:
|
2
|
+
module Saving #:nodoc:
|
3
|
+
|
4
|
+
# Saves a data frame as CSV.
|
5
|
+
# Examples:
|
6
|
+
# df.save('/tmp/some_filename.csv')
|
7
|
+
# df.save('/tmp/some_filename.csv', :include_header => false) # No header information is saved
|
8
|
+
# df.save('/tmp/some_filename.csv', :only => [:list, :of, :columns])
|
9
|
+
# df.save('/tmp/some_filename.csv', :subset => [:list, :of, :columns])
|
10
|
+
# df.save('/tmp/some_filename.csv',
|
11
|
+
# :filter => {:column_name => :category_value,
|
12
|
+
# :another_column_name => (range..values)}) # Filter by category
|
13
|
+
# Writes the frame to +filename+ as CSV, overwriting any existing file.
#
# Options, applied in this order:
#   :only, :subset               - column list(s) passed to subset_from_columns
#   :filter, :filter_by_category - hash(es) passed to filter_by_category
#   :include_header              - passed to to_csv (defaults to true)
#
# Returns the value of the write call.
def save(filename, opts={})
  frame = self
  [:only, :subset].each do |key|
    frame = frame.subset_from_columns(*Array(opts[key])) if opts[key]
  end
  [:filter, :filter_by_category].each do |key|
    frame = frame.filter_by_category(opts[key]) if opts[key]
  end

  File.open(filename, "w") do |io|
    io.write(frame.to_csv(opts.fetch(:include_header, true)))
  end
end
|
23
|
+
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Reopen DataFrame to mix in the DF::Saving methods.
class DataFrame
  include DF::Saving
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Splits a frame's items into a cached random training set and the
# remaining test set.
module Training #:nodoc:

  # Returns the training set, drawing and caching it on first use.
  #
  # Options:
  #   :reset      - when truthy, discards the cached training AND test sets
  #                 before doing anything else
  #   :n          - number of rows to draw; clamped to 0..items.size
  #   :proportion - share of the rows to draw when :n is not given
  #                 (defaults to 0.8, truncated to a whole number of rows)
  #
  # Rows are drawn without replacement by position, so the call always
  # terminates and always yields exactly n rows. The previous
  # draw-then-uniq! loop never finished when the items held fewer distinct
  # row values than n (e.g. duplicated rows).
  def training_set(opts={})
    if opts[:reset]
      @training_set = nil
      @test_set = nil
    end
    return @training_set if @training_set

    items_size = self.items.size
    n = opts[:n] || (items_size * opts.fetch(:proportion, 0.8)).to_i
    n = items_size if n > items_size
    n = 0 if n < 0

    # ceil keeps the old "keep drawing while n > size" rounding for a
    # fractional :n.
    positions = (0...items_size).to_a.shuffle.first(n.ceil)
    @training_set = positions.map { |i| self.items[i] }
  end


  # Returns the cached test set, or computes it as the items that
  # exclusive_not reports against the training set.
  #
  # Options:
  #   :reset - when truthy, discards the cached test set first (the
  #            training set is left as it is)
  def test_set(opts={})
    @test_set = nil if opts[:reset]
    return @test_set if @test_set
    @test_set = self.items.exclusive_not(self.training_set)
  end

  protected

  # Picks one item at a random position below +n+. No longer used by
  # training_set; kept so existing callers keep working.
  def random_next(n)
    self.items[rand(n)]
  end

end
|
43
|
+
|
44
|
+
# Reopen DataFrame to mix in the Training methods.
class DataFrame
  include Training
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# This allows me to have named columns and optionally named rows in a
|
2
|
+
# data frame, to work calculations (usually on the columns), to
|
3
|
+
# transpose the matrix and store the transposed matrix until the object
|
4
|
+
# is tainted.
|
5
|
+
class DataFrame

  # One-line summary: number of rows and the column labels.
  def inspect
    "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
  end

  # The labels of the data items (one per column), also readable as
  # #variables.
  attr_reader :labels
  alias :variables :labels

  # The items (rows) stored in the frame
  attr_accessor :items

  # An optional name, useful for arff files
  attr_accessor :name

  # labels - the column labels, given either as separate arguments or as a
  #          single Array. Each one is normalised with to_underscore_sym.
  def initialize(*labels)
    labels = labels.first if labels.size == 1 && labels.first.is_a?(Array)
    @labels = labels.map { |label| label.to_underscore_sym }
    @items = TransposableArray.new
  end

  # Optional row labels; an empty Array until assigned.
  def row_labels
    @row_labels ||= []
  end

  # Raises ArgumentError unless +ary+ is an Array.
  def row_labels=(ary)
    raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
    @row_labels = ary
  end

  # The rows as an array of arrays, an alias for items.
  alias :rows :items

  # Returns the row whose row label is +sym+, or nil if there is no such
  # label.
  def render_row(sym)
    i = self.row_labels.index(sym)
    return nil unless i
    @items[i]
  end

  # Returns the column named +sym+ (normalised with to_underscore_sym), or
  # nil if there is no such label.
  def render_column(sym)
    i = @labels.index(sym.to_underscore_sym)
    return nil unless i
    @items.transpose[i]
  end

  # The columns keyed by label, in a Dictionary when that class is defined
  # and a Hash otherwise.
  # This is cached, call columns(true) to reset the cache.
  def columns(reset=false)
    @columns = nil if reset
    return @columns if @columns

    container = defined?(Dictionary) ? Dictionary.new : Hash.new
    @items.transpose.each_with_index do |col, i|
      container[@labels[i]] = col
    end
    @columns = container
  end
  alias :to_hash :columns
  alias :to_dictionary :columns

  # Dynamic lookup, in this order: a column label returns that column, a
  # row label returns that row, and anything the items collection responds
  # to is forwarded to it with the given arguments and block.
  def method_missing(sym, *args, &block)
    if self.labels.include?(sym)
      render_column(sym)
    elsif self.row_labels.include?(sym)
      render_row(sym)
    elsif @items.respond_to?(sym)
      @items.send(sym, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? (and Object#method) in step with method_missing.
  # Instance variables are checked for nil because respond_to? can be
  # asked of an instance that has not been through initialize yet.
  def respond_to_missing?(sym, include_private=false)
    return true if @labels && @labels.include?(sym)
    return true if @row_labels && @row_labels.include?(sym)
    return true if @items && @items.respond_to?(sym)
    super
  end

  protected

  # Returns +column+ as a Symbol.
  # Raises ArgumentError unless it is one of the frame's labels.
  def validate_column(column)
    column = column.to_sym
    raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
    column
  end

  # Turns a Symbol or String into the class it names, via classify and
  # constantize; any other object is returned unchanged.
  def infer_class(obj)
    obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
    obj = obj.classify.constantize if obj.is_a?(String)
    obj
  end

  # Presents +row+ as an instance of +as+:
  #   Hash       - label => value pairs
  #   OpenStruct - an OpenStruct whose table holds label => value pairs
  #   Array      - the row itself
  #   otherwise  - as.new(*row)
  def cast_row(row, as)
    if as == Hash
      obj = {}
      self.labels.each_with_index do |label, i|
        obj[label] = row[i]
      end
      obj
    elsif as == OpenStruct
      obj = OpenStruct.new
      self.labels.each_with_index do |label, i|
        obj.table[label] = row[i]
      end
      obj
    elsif as == Array
      row
    else
      as.new(*row)
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module DF #:nodoc:
  # Decision-tree support built on Ilya Grigorik's decisiontree gem.
  #
  # The gem must already be installed. Loading no longer tries to run
  # `sudo gem install` on the user's behalf: the old fallback sat behind a
  # bare `rescue`, which cannot catch the LoadError raised for a missing
  # gem, so it never ran -- and silently running sudo at require time is
  # not something a library should do.
  module ID3
    begin
      gem 'decisiontree'
      require 'decisiontree'
    rescue LoadError => e
      raise LoadError, "DF::ID3 needs the decisiontree gem (try `gem install decisiontree`): #{e.message}"
    end

    # Builds an ID3 tree from the frame and remembers it.
    #
    # dependent_column - currently unused.
    # opts[:default]   - default value handed to the tree (defaults to 1).
    #
    # Returns the new DecisionTree::ID3Tree.
    #
    # NOTE(review): training_data is not defined in the code visible here
    # (Training provides training_set) -- confirm where it comes from.
    def create_id3(dependent_column, opts={})
      # Need to put the dependent column in the last column
      # Probably have other pre processing as well.
      default = opts.fetch(:default, 1)
      @id3 = DecisionTree::ID3Tree.new(self.labels, self.training_data, default, :discrete)
    end

    # The tree built by the last create_id3 call, or nil if none was built.
    def id3
      @id3
    end
  end
end
|
25
|
+
|
26
|
+
# Reopen DataFrame to mix in the DF::ID3 methods.
class DataFrame
  include DF::ID3
end
|