data_frame 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/README.rdoc +122 -0
  2. data/VERSION.yml +4 -0
  3. data/bin/plain_frame +22 -0
  4. data/lib/data_frame.rb +26 -0
  5. data/lib/data_frame/arff.rb +52 -0
  6. data/lib/data_frame/callback_array.rb +152 -0
  7. data/lib/data_frame/core/column_management.rb +147 -0
  8. data/lib/data_frame/core/filter.rb +48 -0
  9. data/lib/data_frame/core/import.rb +113 -0
  10. data/lib/data_frame/core/pre_process.rb +69 -0
  11. data/lib/data_frame/core/saving.rb +29 -0
  12. data/lib/data_frame/core/training.rb +46 -0
  13. data/lib/data_frame/data_frame.rb +115 -0
  14. data/lib/data_frame/id3.rb +28 -0
  15. data/lib/data_frame/kmeans.rb +10 -0
  16. data/lib/data_frame/labels_from_uci.rb +48 -0
  17. data/lib/data_frame/mlp.rb +18 -0
  18. data/lib/data_frame/model.rb +22 -0
  19. data/lib/data_frame/parameter_capture.rb +50 -0
  20. data/lib/data_frame/sbn.rb +18 -0
  21. data/lib/data_frame/transposable_array.rb +23 -0
  22. data/lib/ext/array.rb +11 -0
  23. data/lib/ext/open_struct.rb +5 -0
  24. data/lib/ext/string.rb +5 -0
  25. data/lib/ext/symbol.rb +5 -0
  26. data/spec/data_frame/arff_spec.rb +48 -0
  27. data/spec/data_frame/callback_array_spec.rb +148 -0
  28. data/spec/data_frame/core/column_management_spec.rb +128 -0
  29. data/spec/data_frame/core/filter_spec.rb +88 -0
  30. data/spec/data_frame/core/import_spec.rb +41 -0
  31. data/spec/data_frame/core/pre_process_spec.rb +103 -0
  32. data/spec/data_frame/core/saving_spec.rb +61 -0
  33. data/spec/data_frame/core/training_spec.rb +72 -0
  34. data/spec/data_frame/data_frame_spec.rb +141 -0
  35. data/spec/data_frame/id3_spec.rb +22 -0
  36. data/spec/data_frame/model_spec.rb +36 -0
  37. data/spec/data_frame/parameter_capture_spec.rb +32 -0
  38. data/spec/data_frame/transposable_array_spec.rb +138 -0
  39. data/spec/data_frame_spec.rb +29 -0
  40. data/spec/ext/array_spec.rb +13 -0
  41. data/spec/fixtures/basic.csv +3 -0
  42. data/spec/fixtures/discrete_testing.csv +4 -0
  43. data/spec/fixtures/discrete_training.csv +21 -0
  44. data/spec/spec_helper.rb +8 -0
  45. metadata +128 -0
@@ -0,0 +1,10 @@
1
module DF #:nodoc:
  # Uses a KMeans classifier to cluster the data set.
  #
  # NOTE(review): the module body is empty in this file, so including it adds
  # no methods yet; it looks like a placeholder for a future implementation.
  module KMeans

  end
end
7
+
8
# Reopens DataFrame to mix in the DF::KMeans module.
class DataFrame
  include DF::KMeans
end
@@ -0,0 +1,48 @@
1
require 'open-uri'

# The University of California - Irvine has a great set of machine
# learning sample data sets. Their data description pages have field
# label descriptors. This class extracts them and returns a DataFrame
# with the labels of a data set.
#
# Turns out, this isn't very useful. So...oh well.
# By the way, the data sets I'm talking about are found here: http://archive.ics.uci.edu/ml/
# And to use this class:
#   require 'lib/data_frame/labels_from_uci'
#   df = LabelsFromUCI.data_frame 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
#   df.import('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data')
class LabelsFromUCI

  class << self
    # Fetches +url+ and returns the Array of label names found in it.
    def process(url)
      new(url).labels
    end

    # Fetches +url+ and returns a DataFrame built from its label names.
    def data_frame(url)
      DataFrame.new(new(url).labels)
    end
  end

  # url::      the location that was read
  # contents:: the raw text read from +url+
  # labels::   the label names extracted from +contents+, in file order
  attr_reader :url, :contents, :labels

  # Reads the description found at +url+ and extracts its labels.
  #
  # URI.open (from open-uri) is used explicitly: a bare Kernel#open only
  # understands URLs when open-uri has patched it, and no longer does so at
  # all on Ruby 3.0+. Anything that is not a URL is still handed on to
  # Kernel#open, so +url+ must come from a trusted source.
  def initialize(url)
    @url = url
    @contents = URI.open(url) { |f| f.read }
    process_labels
  end

  protected

  # Collects the first label on every line that matches label_re.
  def process_labels
    @labels = @contents.each_line.map { |line| line[label_re, 1] }.compact
  end

  # Pattern whose first capture group is the label name.
  def label_re
    /@attribute (\w+)/
  end
end
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Red Davis' MLP classifier.
  # Will install it if you don't have it.
  module MLP
    begin
      gem 'reddavis-mlp'
      require 'mlp'
    # A missing gem surfaces as Gem::LoadError / LoadError. Those descend from
    # ScriptError, not StandardError, so a bare `rescue` never reached the
    # install fallback below.
    rescue LoadError
      # NOTE(review): running `sudo gem install` while the library loads is
      # intrusive; it is kept because it is the documented behaviour.
      `sudo gem install reddavis-mlp`
      # Re-read the installed gems so this process can see the new one.
      Gem.refresh
      gem 'reddavis-mlp'
      require 'mlp'
    end
  end
end
15
+
16
# Reopens DataFrame to mix in the DF::MLP module.
class DataFrame
  include DF::MLP
end
@@ -0,0 +1,22 @@
1
# Adds the model methods to the data frame.
class DataFrame

  # Returns a model if defined.
  # Defines a model with a block, if given and not defined.
  # Stores the model in the models container, which gives us access like:
  #   df.models.new_model_name...
  #
  # name::  the model's name; it is converted with to_sym so that it lines up
  #         with the keys OpenStruct uses. When omitted, the model is built
  #         and returned but not stored.
  # block:: handed to ParameterCapture to describe which rows belong to the
  #         model.
  #
  # Returns the stored model, the newly built model (the result of
  # filter(Hash) with the captured parameters applied to each row), or false
  # when the model is unknown and no block was given.
  def model(name=nil, &block)
    key = name.to_sym unless name.nil?
    return models[key] if key && models.to_h.key?(key)
    return false unless block

    # Kept in a local so that defining a model leaves no state on the frame.
    capture = ParameterCapture.new(&block)
    built = filter(Hash) { |row| capture.filter(row) }

    # Go through OpenStruct's public writer rather than its internal table,
    # so the accessor (models.some_name) is set up by OpenStruct itself.
    models[key] = built if key
    built
  end

  # The container holding every model defined so far (created on first use).
  def models
    @models ||= OpenStruct.new
  end

end
@@ -0,0 +1,50 @@
1
# Captures the intent of a model definition in a block. Usage:
#   pc = ParameterCapture.new do |p|
#     p.whatever :some_value
#     p.another :one
#     p.or_list [1, 2]
#     p.or_range (1..2)
#   end
#   pc.parameters
#   # => {:whatever => :some_value, :another => :one, :or_list => [1,2], :or_range => (1..2)}
class ParameterCapture
  # Evaluates the block against this instance, so every unknown message sent
  # inside it is recorded as a parameter. Without a block nothing is captured.
  def initialize(&block)
    instance_eval(&block) if block
  end

  # The OpenStruct holding the captured parameters (created on first use).
  def parameters
    @parameters ||= OpenStruct.new
  end

  # Exposes the set keys
  def keys
    parameters.to_h.keys
  end

  # can be used in a data_frame filter.
  # @pc.filter(row) Using a Hash as a cast type for the filter.
  #
  # Returns true when +row+ satisfies every captured parameter:
  # * Array and Range parameters must include row[key];
  # * anything else is compared with ===, so classes and regexps work too.
  # With no parameters captured every row passes.
  def filter(row)
    parameters.to_h.all? do |key, expected|
      case expected
      when Array, Range then expected.include?(row[key])
      else expected === row[key]
      end
    end
  end

  # Reads a captured parameter, or records a new one:
  # * a key that is already set returns its value (arguments are ignored);
  # * one argument is stored as is;
  # * any other number of arguments is stored as an Array.
  # Values go through OpenStruct's public [] / []= rather than its internal
  # table, so no core-class patch is needed.
  def method_missing(key, *values, &block)
    if parameters.to_h.key?(key)
      parameters[key]
    elsif values.size == 1
      parameters[key] = values.first
    else
      parameters[key] = values
    end
  end

  # Lets respond_to? report the keys that have been captured so far.
  def respond_to_missing?(key, include_private = false)
    parameters.to_h.key?(key) || super
  end
end
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Carl Youngblood's Simple Bayesian classifier.
  # Will install it if you don't have it.
  module SBN
    begin
      gem 'sbn'
      require 'sbn'
    # A missing gem surfaces as Gem::LoadError / LoadError. Those descend from
    # ScriptError, not StandardError, so a bare `rescue` never reached the
    # install fallback below.
    rescue LoadError
      # NOTE(review): running `sudo gem install` while the library loads is
      # intrusive; it is kept because it is the documented behaviour.
      `sudo gem install sbn`
      # Re-read the installed gems so this process can see the new one.
      Gem.refresh
      gem 'sbn'
      require 'sbn'
    end
  end
end
15
+
16
# Reopens DataFrame to mix in the DF::SBN module.
class DataFrame
  include DF::SBN
end
@@ -0,0 +1,23 @@
1
# The only trick in this array is that its transpose is memoized until
# it is tainted. This should reduce computations elegantly.
class TransposableArray < CallbackArray

  # clear_cache is registered with the after_taint hook (declared by
  # CallbackArray, not shown here) so a tainted array drops its memo.
  after_taint :clear_cache

  # Untaints the array, then returns the memoized transpose, computing it
  # with the inherited implementation when nothing is cached.
  def transpose
    untaint
    @transpose ||= super
  end

  # For debugging and testing purposes: reading the memo directly beats
  # asking for @ta.send(:instance_variable_get, :@transpose) every time.
  def cache
    @transpose
  end

  protected

  # Forgets the memoized transpose.
  def clear_cache
    @transpose = nil
  end
end
@@ -0,0 +1,11 @@
1
class Array
  # Reports how deeply the array is nested, judged by its first elements only:
  #   [1,2,3].dimensions                                     # => 1
  #   [[1,2,3], [1,2,3]].dimensions                          # => 2
  #   [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3]]].dimensions    # => 3
  # An empty array counts as 1-dimensional. +n+ is the depth already counted
  # and is added to the result.
  def dimensions(n=0)
    depth = n + 1
    layer = first
    # Walk down the chain of first elements for as long as they are arrays.
    while layer.is_a?(Array)
      depth += 1
      layer = layer.first
    end
    depth
  end
end
@@ -0,0 +1,5 @@
1
class OpenStruct
  # Public reader for the @table instance variable, i.e. the hash OpenStruct
  # keeps its members in.
  attr_reader :table
end
@@ -0,0 +1,5 @@
1
class String # :nodoc:
  # Turns the string into an underscored Symbol: titleize it, strip every run
  # of whitespace, underscore the result, then convert with to_sym.
  # NOTE(review): titleize and underscore are not core String methods;
  # presumably they come from an inflection library loaded elsewhere.
  def to_underscore_sym
    squashed = titleize.gsub(/\s+/, '')
    squashed.underscore.to_sym
  end
end
@@ -0,0 +1,5 @@
1
class Symbol # :nodoc:
  # Turns the symbol into an underscored Symbol: take its string form,
  # titleize it, strip every run of whitespace, underscore the result, then
  # convert with to_sym.
  # NOTE(review): titleize and underscore are not core String methods;
  # presumably they come from an inflection library loaded elsewhere.
  def to_underscore_sym
    squashed = to_s.titleize.gsub(/\s+/, '')
    squashed.underscore.to_sym
  end
end
@@ -0,0 +1,48 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'data_frame/arff'
3
+
4
# Specs for the export helpers loaded by data_frame/arff: to_arff and to_csv.
describe "ARFF" do
  # Every example starts from a frame imported from the basic.csv fixture.
  before do
    @df = DataFrame.from_csv(File.expand_path(File.join(File.dirname(__FILE__), '..', 'fixtures', 'basic.csv')))
  end

  it "should allow a data frame to be expressed as an arff-formatted file" do
    # basic_arff is the helper defined at the bottom of this file.
    @df.to_arff.should eql(basic_arff)
  end

  it "should add a to_csv method" do
    # The continuation lines of the literal sit at column 0 on purpose: any
    # indentation would become part of the expected text.
    @df.to_csv.should eql(%{x,y,month,day,ffmc,dmc,dc,isi,temp,rh,wind,rain,area
7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
})
  end

  it "should allow a non-header export for to_csv" do
    # Passing false drops the header row from the output.
    @df.to_csv(false).should eql(%{7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
})
  end
end
26
+
27
# The ARFF text the specs expect for the basic.csv fixture: a relation line,
# one nominal attribute per column, then the data rows. Every line, including
# the last, ends with a newline.
def basic_arff
  attributes = [
    "@attribute x {7}",
    "@attribute y {4,5}",
    "@attribute month {mar,oct}",
    "@attribute day {fri,tue}",
    "@attribute ffmc {86.2,90.6}",
    "@attribute dmc {26.2,35.4}",
    "@attribute dc {94.3,669.1}",
    "@attribute isi {5.1,6.7}",
    "@attribute temp {8.2,18}",
    "@attribute rh {33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51}",
    "@attribute wind {0.9,6.7}",
    "@attribute rain {0}",
    "@attribute area {0}"
  ]
  rows = [
    "7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0",
    "7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0"
  ]
  # Blank lines separate the relation header, the attributes and the data.
  sections = ["@relation basic", ""] + attributes + ["", "@data"] + rows
  sections.join("\n") + "\n"
end
@@ -0,0 +1,148 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
+ # The TransposableArray spec already tests the after_taint callback
4
+ # thoroughly, so only the other callbacks are tested here.
5
# Process-wide tally of how many times each named event has been recorded.
class Register
  # Records one more occurrence of +meth+ and returns the new total.
  def self.next(meth)
    tally = (@@count ||= {})
    tally[meth] = (tally[meth] || 0) + 1
  end

  # Returns the total recorded for +meth+, or nil if it was never recorded.
  def self.for(meth)
    tally = (@@count ||= {})
    tally[meth]
  end
end
16
+
17
# Test subject: a CallbackArray subclass whose before_taint, before_untaint
# and after_untaint callbacks each bump a matching counter in Register, so
# the specs can tell how often every callback fired.
class A < CallbackArray
  # Counts invocations of the before_taint callback.
  before_taint :register_before_taint
  def register_before_taint
    Register.next(:before_taint)
  end

  # Counts invocations of the before_untaint callback.
  before_untaint :register_before_untaint
  def register_before_untaint
    Register.next(:before_untaint)
  end

  # Counts invocations of the after_untaint callback.
  after_untaint :register_after_untaint
  def register_after_untaint
    Register.next(:after_untaint)
  end
end
33
+
34
# Exercises the before_taint, before_untaint and after_untaint callbacks
# through A, the counting subclass defined above.
describe CallbackArray do
  before do
    @a = A.new [1,2,3]
  end

  context "before_taint" do
    # Remember the counter before the example mutates the array...
    before do
      @c = Register.for(:before_taint) || 0
    end

    # ...and afterwards expect exactly one more callback and a tainted array.
    after do
      Register.for(:before_taint).should eql(@c + 1)
      @a.should be_tainted
    end

    # Each entry is [label used in the example name, mutation applied to @a].
    # An array of pairs keeps the examples in this order.
    [
      ["taint",     lambda { |a| a.taint }],
      [":[]=",      lambda { |a| a[0] = 2 }],
      [":<<",       lambda { |a| a << 3 }],
      [":delete",   lambda { |a| a.delete(2) }],
      [":push",     lambda { |a| a.push(5) }],
      [":pop",      lambda { |a| a.pop }],
      [":shift",    lambda { |a| a.shift }],
      [":unshift",  lambda { |a| a.unshift(6) }],
      [":map!",     lambda { |a| a.map! {|e| e} }],
      [":sort!",    lambda { |a| a.sort! }],
      [":reverse!", lambda { |a| a.reverse! }],
      [":collect!", lambda { |a| a.collect! {|e| e} }],
      [":compact!", lambda { |a| a.compact! }],
      [":reject!",  lambda { |a| a.reject! {|e| not e} }],
      [":slice!",   lambda { |a| a.slice!(1,2) }],
      [":flatten!", lambda { |a| a.flatten! }],
      [":uniq!",    lambda { |a| a.uniq! }],
      [":clear",    lambda { |a| a.clear }]
    ].each do |label, mutate|
      it "should callback before #{label}" do
        mutate.call(@a)
      end
    end
  end

  it "should not adjust the array in other methods" do
    # Register is shared by every example, so compare against the value seen
    # before the calls instead of assuming nothing has tainted an A yet.
    count_before = Register.for(:before_taint)
    @a.at(0)
    @a.sort
    @a.uniq
    @a.find{|e| e}
    Register.for(:before_taint).should eql(count_before)
    @a.should_not be_tainted
  end

  it "should callback before untaint" do
    c = Register.for(:before_untaint) || 0
    @a.taint
    @a.untaint
    Register.for(:before_untaint).should eql(c + 1)
  end

  it "should callback after untaint" do
    c = Register.for(:after_untaint) || 0
    @a.taint
    @a.untaint
    Register.for(:after_untaint).should eql(c + 1)
  end

end
148
+
@@ -0,0 +1,128 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Specs for DataFrame's column-level operations: append!, move_to_last!,
# rename!, drop!, replace!, subset_from_columns and duplicate!.
describe "Column Management" do
  # Every example starts with a four-column frame holding two rows, so each
  # column has two values (e.g. :these => [1, 5]).
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
  end

  context "append!" do
    it "should be able to append an array of values to the data frame" do
      @df.append!(:new_column, [5,5])
      @df.new_column.should eql([5,5])
    end

    it "should be able to append a default value to the data frame" do
      # A single value is repeated for every existing row.
      @df.append!(:new_column, :value)
      @df.new_column.should eql([:value, :value])
    end

    it "should use nil as the default value" do
      @df.append!(:new_column)
      @df.new_column.should eql([nil, nil])
    end
  end

  context "move_to_last!" do
    it "should be able to move a column to the end of the data frame, useful for dependency models" do
      @df.labels.should eql(@labels)
      @df.move_to_last!(:these)
      @df.labels.should eql([:are, :the, :labels, :these])
      # The column's values travel with its label.
      @df.these.should eql([1,5])
    end
  end

  context "rename!" do
    it "should be able to rename a column" do
      # Argument order is (new name, existing name).
      @df.rename!(:new_name, :these)
      @df.labels.should eql([:new_name, :are, :the, :labels])
    end

    it "should be able to use the new column name with dot notation" do
      v = @df.these.dup
      @df.rename!(:new_name, :these)
      @df.new_name.should eql(v)
    end
  end

  context "drop!" do
    it "should be able to remove a column" do
      @df = DataFrame.new :twos, :threes, :fours
      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
      @df.drop!(:twos)
      # The expectation inside the block raises on the first row that differs.
      @df.items.all? {|i| i.should eql([3,4])}
      @df.labels.should eql([:threes, :fours])
    end

    it "should be able to remove more than one column at a time" do
      @df = DataFrame.new :twos, :threes, :fours
      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
      @df.drop!(:twos, :fours)
      @df.items.all? {|i| i.should eql([3])}
      @df.labels.should eql([:threes])
    end

  end

  context "replace!" do
    before do
      @doubler = lambda{|e| e * 2}
    end

    it "should only replace columns that actually exist" do
      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
        ArgumentError, /Must provide the name of an existing column./)
      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
    end

    it "should be able to replace a column with a block" do
      # The block is applied to each value of the column.
      @df.replace!(:these) {|e| e * 2}
      @df.these.should eql([2,10])
    end

    it "should be able to replace a column with an array" do
      @a = [5,9]
      @df.replace!(:these, @a)
      @df.these.should eql(@a)
    end
  end

  context "subset_from_columns" do

    it "should be able to create a subset of columns" do
      new_data_frame = @df.subset_from_columns(:these, :labels)
      # The subset is a new frame, not the receiver.
      new_data_frame.should_not eql(@df)
      new_data_frame.labels.should eql([:these, :labels])
      new_data_frame.items.should eql([[1,4],[5,8]])
      new_data_frame.these.should eql([1,5])
    end
  end

  context "duplicate!" do
    it "should be able to duplicate a column" do
      # The copy is named after the original with a numeric suffix.
      @df.duplicate!(:these)
      @df.these1.should eql(@df.these)
    end

    it "should use unique names for the duplicate column" do
      @df.duplicate!(:these)
      @df.duplicate!(:these)
      @df.duplicate!(:these)
      @df.these3.should eql(@df.these2)
      @df.these2.should eql(@df.these1)
      @df.these1.should eql(@df.these)
    end

    it "should reset the labels list when a column is duplicated" do
      @df.duplicate!(:these)
      @df.labels.should be_include(:these1)
    end

    it "should return true, rather than the whole data set" do
      @df.duplicate!(:these).should eql(true)
    end
  end

end