davidrichards-data_frame 0.0.18 → 0.0.19

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Red Davis' MLP classifier.
  #
  # Loading this file activates the 'reddavis-mlp' gem and requires 'mlp'.
  # If either step fails to load, the gem is installed with
  # `sudo gem install` and the load is attempted one more time; a failure
  # on that second attempt propagates to whoever required this file.
  module MLP
    begin
      gem 'reddavis-mlp'
      require 'mlp'
    rescue LoadError
      # Gem::LoadError and LoadError descend from ScriptError, not
      # StandardError, so a bare `rescue` would never reach this fallback.
      `sudo gem install reddavis-mlp`
      gem 'reddavis-mlp'
      require 'mlp'
    end
  end
end

# Mixes the MLP feeder module into DataFrame.
class DataFrame
  include DF::MLP
end
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Carl Youngblood's Simple Bayesian
  # classifier.
  #
  # Loading this file activates the 'sbn' gem and requires it. If either
  # step fails to load, the gem is installed with `sudo gem install` and
  # the load is attempted one more time; a failure on that second attempt
  # propagates to whoever required this file.
  module SBN
    begin
      gem 'sbn'
      require 'sbn'
    rescue LoadError
      # Gem::LoadError and LoadError descend from ScriptError, not
      # StandardError, so a bare `rescue` would never reach this fallback.
      `sudo gem install sbn`
      gem 'sbn'
      require 'sbn'
    end
  end
end

# Mixes the SBN feeder module into DataFrame.
class DataFrame
  include DF::SBN
end
@@ -1,5 +1,5 @@
1
1
  # The only trick in this array is that its transpose is memoized until
2
- # it is tainted. This will reduce computations elegantly.
2
+ # it is tainted. This should reduce computations elegantly.
3
3
  class TransposableArray < CallbackArray
4
4
 
5
5
  after_taint :clear_cache
data/lib/ext/array.rb ADDED
@@ -0,0 +1,11 @@
1
class Array
  # Reports how deeply this array is nested, looking only at the first
  # element of each level:
  #   [1,2,3].dimensions                              # => 1
  #   [[1,2,3], [1,2,3]].dimensions                   # => 2
  #   [[[1,2,3], [1,2,3]], [[1,2,3]]].dimensions      # => 3
  # An empty array counts as 1-dimensional. +n+ is the depth already
  # counted before this array and defaults to 0.
  def dimensions(n=0)
    depth = n + 1
    head = first
    # Walk down the chain of first elements until one is not an Array.
    while head.is_a?(Array)
      depth += 1
      head = head.first
    end
    depth
  end
end
@@ -1,4 +1,5 @@
1
1
  require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'data_frame/arff'
2
3
 
3
4
  describe "ARFF" do
4
5
  before do
@@ -0,0 +1,97 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Specs for the column-level operations on a DataFrame: append!,
# move_to_last!, rename!, drop!, replace! and subset_from_columns.
describe "Column Management" do
  # Each example starts from a four-column frame holding two rows.
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
  end

  context "append!" do
    it "should be able to append an array of values to the data frame" do
      @df.append!(:new_column, [5,5])
      @df.new_column.should eql([5,5])
    end

    it "should be able to append a default value to the data frame" do
      @df.append!(:new_column, :value)
      @df.new_column.should eql([:value, :value])
    end

    it "should use nil as the default value" do
      @df.append!(:new_column)
      @df.new_column.should eql([nil, nil])
    end
  end

  context "move_to_last!" do
    it "should be able to move a column to the end of the data frame, useful for dependency models" do
      @df.labels.should eql(@labels)
      @df.move_to_last!(:these)
      @df.labels.should eql([:are, :the, :labels, :these])
      @df.these.should eql([1,5])
    end
  end

  context "rename!" do
    it "should be able to rename a column" do
      @df.rename!(:new_name, :these)
      @df.labels.should eql([:new_name, :are, :the, :labels])
    end
  end

  context "drop!" do
    it "should be able to remove a column" do
      @df = DataFrame.new :twos, :threes, :fours
      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
      @df.drop!(:twos)
      # `each`, not `all?`: every row must be checked; `all?` would stop at
      # the first falsy block result and discard its own return value.
      @df.items.each {|i| i.should eql([3,4])}
      @df.labels.should eql([:threes, :fours])
    end

    it "should be able to remove more than one column at a time" do
      @df = DataFrame.new :twos, :threes, :fours
      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
      @df.drop!(:twos, :fours)
      @df.items.each {|i| i.should eql([3])}
      @df.labels.should eql([:threes])
    end

  end

  context "replace!" do
    before do
      @doubler = lambda{|e| e * 2}
    end

    it "should only replace columns that actually exist" do
      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
        ArgumentError, /Must provide the name of an existing column./)
      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
    end

    it "should be able to replace a column with a block" do
      @df.replace!(:these) {|e| e * 2}
      @df.these.should eql([2,10])
    end

    it "should be able to replace a column with an array" do
      @a = [5,9]
      @df.replace!(:these, @a)
      @df.these.should eql(@a)
    end
  end

  context "subset_from_columns" do

    it "should be able to create a subset of columns" do
      new_data_frame = @df.subset_from_columns(:these, :labels)
      new_data_frame.should_not eql(@df)
      new_data_frame.labels.should eql([:these, :labels])
      new_data_frame.items.should eql([[1,4],[5,8]])
      new_data_frame.these.should eql([1,5])
    end
  end

end
@@ -0,0 +1,88 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Specs for DataFrame#filter! and DataFrame#filter, covering each of the
# row representations a filter block can receive.
describe "Filter" do
  # Two rows; every filter below is written to keep only the second one.
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
  end

  it "should be able to filter a data frame with a block using an OpenStruct for each row" do
    @df.filter!(:open_struct) do |row|
      row.these == 5
    end
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to filter a data frame with a block using a Hash for each row" do
    @df.filter!(:hash) do |row|
      row[:these] == 5
    end
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  # Row wrapper class handed to filter! in the next example.
  S4 = Struct.new(:one, :two, :three, :four)
  it "should be able to filter a data frame with a block using another class that uses the row as input" do
    @df.filter!(S4) do |row|
      row.one == 5
    end
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to filter a data frame with a block using an array for each row" do
    @df.filter! do |row|
      row.first == 5
    end
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to do fancy things with the row as the filter" do
    @df.filter! do |row|
      row.sum > 10
    end
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to generate a new data frame with filter" do
    # The non-bang form returns a filtered frame and leaves @df untouched.
    new_df = @df.filter(:open_struct) do |row|
      row.these == 5
    end
    new_df.items.should eql([[5, 6, 7, 8]])
    @df.items.should eql([[1, 2, 3, 4], [5, 6, 7, 8]])
  end

end
44
+
45
# Specs for DataFrame#filter_by_category and its destructive variant,
# using one row per day of July 2009.
context "filter_by_category" do

  before do
    @df = DataFrame.new(:weather, :date)

    # Dates are built with Date.new rather than Date.parse("07/15/2009"):
    # the slash form is only read as mm/dd/yyyy by Ruby 1.8; later Rubies
    # read it as dd/mm/yyyy and reject day 15+ as an invalid date.
    (1..31).each do |i|
      @df.add [(i % 3 == 1) ? :fair : :good, Date.new(2009, 7, i)]
    end

    @d1 = Date.new(2009, 7, 15)
    @d2 = Date.new(2009, 7, 31)

  end

  it "should be able to filter by category" do
    filtered = @df.filter_by_category(:weather => :good)
    filtered.weather.uniq.should eql([:good])
    @df.weather.uniq.should be_include(:fair)
  end

  it "should be able to manage ranges for filter values" do
    filtered = @df.filter_by_category(:date => (@d1..@d2))
    filtered.date.should_not be_include(Date.new(2009, 7, 1))
    filtered.date.should_not be_include(Date.new(2009, 7, 14))
    filtered.date.should be_include(Date.new(2009, 7, 15))
    filtered.date.should be_include(Date.new(2009, 7, 31))
    @df.date.should be_include(Date.new(2009, 7, 1))
  end

  it "should be able to take an array of values to filter with" do
    filtered = @df.filter_by_category(:date => [@d1, @d2])
    filtered.date.should_not be_include(Date.new(2009, 7, 1))
    filtered.date.should be_include(Date.new(2009, 7, 15))
    filtered.date.should be_include(Date.new(2009, 7, 31))
  end

  it "should have a destructive version" do
    @df.filter_by_category!(:date => [@d1, @d2])
    @df.date.should_not be_include(Date.new(2009, 7, 1))
    @df.date.should be_include(Date.new(2009, 7, 15))
    @df.date.should be_include(Date.new(2009, 7, 31))
  end

end
@@ -0,0 +1,41 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Specs for getting rows into a DataFrame: add_item, and import with
# several rows, a single row, or CSV text.
describe "Import" do

  # Start each example with an empty four-column frame.
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
  end

  it "should be able to add an item" do
    row = [1,2,3,4]
    @df.add_item(row)
    @df.items.should eql([row])
  end

  it "should be able to import more than one row at a time" do
    rows = [[2,2,2,2],[3,3,3,3],[4,4,4,4]]
    @df.import(rows)
    @df.row_labels = [:twos, :threes, :fours]
    # Once row labels are set, each row is read back through its label.
    @df.twos.should eql([2,2,2,2])
    @df.threes.should eql([3,3,3,3])
    @df.fours.should eql([4,4,4,4])
  end

  it "should be able to import only one row" do
    single_row = [2,2,2,2]
    @df.import(single_row)
    @df.these.should eql([2])
  end

  it "should be able to import a reference to csv" do
    # The literal's continuation lines are deliberately flush-left so the
    # CSV text carries no leading whitespace.
    contents = %{7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
}

    @labels = [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
    @df = DataFrame.new(@labels)
    @df.import(contents)
    @df.x.should eql([7,7])
    @df.area.should eql([0,0])
  end

end
@@ -0,0 +1,71 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Specs for the pre-processing helpers j_binary_ize! and numericize!.
describe "PreProcess" do
  it "should be able to j_binary_ize! a column, taking its categories and creating a column for each" do
    df = DataFrame.new(:observations)
    [:many, :fine, :things, :are, :available].each do |word|
      df.add [word]
    end
    df.j_binary_ize!(:observations)
    # One boolean column per distinct value; the source column is kept.
    df.observations_many.should eql([true, false, false, false, false])
    df.observations_fine.should eql([false, true, false, false, false])
    df.observations_things.should eql([false, false, true, false, false])
    df.observations_are.should eql([false, false, false, true, false])
    df.observations_available.should eql([false, false, false, false, true])
    df.observations.should eql([:many, :fine, :things, :are, :available])
  end

  it "should be able to j_binary_ize! a more normal column" do
    values = [1,2,3,4,5,4,3,2,1]
    df = DataFrame.new(:observations)
    df.import(values.map { |v| Array(v) })
    df.observations.add_category(:small) { |v| v <= 3 }
    df.observations.add_category(:large) { |v| v >= 3 }
    df.j_binary_ize!(:observations)
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, false, true, true, true, false, false, false])
  end

  it "should be able to j_binary_ize with non-adjacent sets (sets that allow a value to have more than one category)" do
    values = [1,2,3,4,5,4,3,2,1]
    df = DataFrame.new(:observations)
    df.import(values.map { |v| Array(v) })
    df.observations.add_category(:small) { |v| v <= 3 }
    df.observations.add_category(:large) { |v| v >= 3 }
    df.j_binary_ize!(:observations, :allow_overlap => true)
    # With overlap allowed, the value 3 is expected in both categories.
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
  end

  it "should be able to hold multiple ideas of a columns categories by resetting the category and re-running j_binary_ize" do
    values = [1,2,3,4,5,4,3,2,1]
    df = DataFrame.new(:observations)
    df.import(values.map { |v| Array(v) })
    df.observations.add_category(:small) { |v| v <= 3 }
    df.observations.add_category(:large) { |v| v >= 3 }
    df.j_binary_ize!(:observations, :allow_overlap => true)
    parity = {:odd => lambda{|e| e.odd?}, :even => lambda{|e| e.even?}}
    df.observations.set_categories(parity)
    df.j_binary_ize!(:observations)
    # Columns from the first pass are expected to survive the second pass.
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
    df.observations.should eql([1,2,3,4,5,4,3,2,1])
    df.observations_even.should eql([false, true, false, true, false, true, false, true, false])
    df.observations_odd.should eql([true, false, true, false, true, false, true, false, true])
  end

  context "numericize!" do

    before do
      values = [1,2,3,4,5,4,3,2,1]
      @df = DataFrame.new(:observations)
      @df.import(values.map { |v| Array(v) })
      @df.observations.add_category(:small) { |v| v <= 3 }
      @df.observations.add_category(:large) { |v| v > 3 }
    end

    it "should be able to numericize nominal data" do
      @df.numericize!(:observations)
      expected = [[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[1,0],[1,0],[1,0]]
      @df.numerical_observations.should eql(expected)
    end

  end

end
@@ -0,0 +1,61 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Specs for DataFrame#save: full dumps, header-less dumps, column subsets
# and category-filtered rows, each compared against the matching to_csv.
describe "Saving" do
  before do
    @df = DataFrame.new(:observations)
    @df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    @df.observations.add_category(:small) {|e| e <= 3}
    @df.observations.add_category(:large) {|e| e > 3}
    @filename = "/tmp/numericized_observations"
  end

  # Remove the file written by the example. Uses File.delete instead of
  # shelling out to `rm -rf`, so no shell is involved and nothing happens
  # when an example failed before the file was created.
  after do
    File.delete(@filename) if File.exist?(@filename)
  end

  it "should be able to save the data frame" do
    @df.numericize!(:observations)
    @df.save(@filename)
    File.read(@filename).should eql(@df.to_csv)
  end

  it "should be able to save the data frame without the header" do
    @df.save(@filename, :include_header => false)
    File.read(@filename).should eql(@df.to_csv(false))
  end

  it "should be able to save off a subset" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    @df.save(@filename, :subset => :observations)
    File.read(@filename).should eql(@df.subset_from_columns(:observations).to_csv)
  end

  it "should be able to filter the rows" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    @df.save(@filename, :subset => :observations)
    @df.observations.add_category(:small) {|e| e <= 3}
    @df.observations.add_category(:large) {|e| e > 3}
    @df.save(@filename, :filter_by_category => {:observations => :small})
    File.read(@filename).should eql(@df.filter_by_category(:observations => :small).to_csv)
  end

  it "should have a shortcut for subset, only" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    @df.save(@filename, :only => :observations)
    File.read(@filename).should eql(@df.subset_from_columns(:observations).to_csv)
  end

  it "should have a shortcut for filter_by_category, filter" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    @df.save(@filename, :subset => :observations)
    @df.observations.add_category(:small) {|e| e <= 3}
    @df.observations.add_category(:large) {|e| e > 3}
    @df.save(@filename, :filter => {:observations => :small})
    File.read(@filename).should eql(@df.filter_by_category(:observations => :small).to_csv)
  end

end
@@ -0,0 +1,51 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Specs for DataFrame#training_set: sizing by :n or :proportion,
# memoization until :reset is passed, clamping, and the default size.
describe "Training" do
  before do
    @df = DataFrame.new(:one)
    @df.import((0...100).to_a)
  end

  it "should be able to create a proportional training set from a data frame" do
    @df.training_set(:n => 3)
    @df.training_set.size.should eql(3)
    # `each`, not `all?`: every member must be checked; `all?` would stop
    # at the first falsy block result and discard its own return value.
    @df.training_set.each {|e| @df.items.should be_include(e)}
  end

  it "should use the same training set unless reset is passed to it" do
    @df.training_set(:n => 5)
    @df.training_set.should eql(@df.training_set)
    old = @df.training_set
    @df.training_set(:reset => true, :n => 5)
    @df.training_set.should_not eql(old)
  end

  it "should be able to create a proportional training set" do
    @df.training_set(:proportion => 0.6)
    @df.training_set.size.should eql(60)
    @df.training_set(:proportion => 0.42, :reset => true)
    @df.training_set.size.should eql(42)
    @df.training_set(:proportion => 0, :reset => true)
    @df.training_set.size.should eql(0)
    @df.training_set(:proportion => 1, :reset => true)
    @df.training_set.size.should eql(100)
  end

  it "should not have a set size exceeding the items size" do
    @df.training_set(:proportion => 2)
    @df.training_set.size.should eql(100)
    @df.training_set(:n => 200, :reset => true)
    @df.training_set.size.should eql(100)
  end

  it "should not have any items when the proportion is calculated below 0" do
    @df.training_set(:proportion => -2)
    @df.training_set.size.should eql(0)
    @df.training_set(:n => -2, :reset => true)
    @df.training_set.size.should eql(0)
  end

  it "should have a default proportion of 80%" do
    @df.training_set.size.should eql(80)
  end
end