davidrichards-data_frame 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Red Davis' MLP classifier.
  # Will install it if you don't have it.
  #
  # Loading this file activates the 'reddavis-mlp' gem and requires 'mlp' as
  # a side effect of evaluating the module body.
  module MLP
    begin
      gem 'reddavis-mlp'
      require 'mlp'
    rescue LoadError
      # A missing gem surfaces as Gem::LoadError / LoadError. Both descend
      # from ScriptError, not StandardError, so a bare `rescue` would never
      # reach this fallback.
      # NOTE(review): this shells out to `sudo gem install` at load time;
      # confirm that is acceptable for every environment that requires this file.
      `sudo gem install reddavis-mlp`
      # Drop RubyGems' cached lookup state so the gem installed by the
      # subprocess above is visible to the retry.
      Gem.clear_paths
      gem 'reddavis-mlp'
      require 'mlp'
    end
  end
end

# Mixes the MLP feeder module into DataFrame.
class DataFrame
  include DF::MLP
end
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Carl Youngblood's Simple Bayesian classifier.
  # Will install it if you don't have it.
  #
  # Loading this file activates the 'sbn' gem and requires 'sbn' as a side
  # effect of evaluating the module body.
  module SBN
    begin
      gem 'sbn'
      require 'sbn'
    rescue LoadError
      # A missing gem surfaces as Gem::LoadError / LoadError. Both descend
      # from ScriptError, not StandardError, so a bare `rescue` would never
      # reach this fallback.
      # NOTE(review): this shells out to `sudo gem install` at load time;
      # confirm that is acceptable for every environment that requires this file.
      `sudo gem install sbn`
      # Drop RubyGems' cached lookup state so the gem installed by the
      # subprocess above is visible to the retry.
      Gem.clear_paths
      gem 'sbn'
      require 'sbn'
    end
  end
end

# Mixes the SBN feeder module into DataFrame.
class DataFrame
  include DF::SBN
end
@@ -1,5 +1,5 @@
1
1
  # The only trick in this array is that its transpose is memoized until
2
- # it is tainted. This will reduce computations elegantly.
2
+ # it is tainted. This should reduce computations elegantly.
3
3
  class TransposableArray < CallbackArray
4
4
 
5
5
  after_taint :clear_cache
data/lib/ext/array.rb ADDED
@@ -0,0 +1,11 @@
1
class Array
  # Reports how deeply this array is nested, following the first element at
  # each level:
  #   [1,2,3].dimensions                                  # => 1
  #   [[1,2,3], [1,2,3]].dimensions                       # => 2
  #   [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3]]].dimensions # => 3
  # Only the first element of each level is inspected, so a ragged array is
  # measured along its leading branch, and an empty array counts as 1.
  # +n+ is the depth already counted by outer levels of the recursion;
  # callers normally leave it at its default of 0.
  def dimensions(n=0)
    depth = n + 1
    head = first
    return depth unless head.is_a?(Array)
    head.dimensions(depth)
  end
end
@@ -1,4 +1,5 @@
1
1
  require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'data_frame/arff'
2
3
 
3
4
  describe "ARFF" do
4
5
  before do
@@ -0,0 +1,97 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
describe "Column Management" do
  # Every example starts from a two-row frame with four labelled columns.
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
    @df.add([1, 2, 3, 4])
    @df.add([5, 6, 7, 8])
  end

  context "append!" do
    it "should be able to append an array of values to the data frame" do
      @df.append!(:new_column, [5, 5])
      appended = @df.new_column
      appended.should eql([5, 5])
    end

    it "should be able to append a default value to the data frame" do
      @df.append!(:new_column, :value)
      appended = @df.new_column
      appended.should eql([:value, :value])
    end

    it "should use nil as the default value" do
      @df.append!(:new_column)
      appended = @df.new_column
      appended.should eql([nil, nil])
    end
  end

  context "move_to_last!" do
    it "should be able to move a column to the end of the data frame, useful for dependency models" do
      # Sanity check on the starting order before the column is moved.
      @df.labels.should eql(@labels)

      @df.move_to_last!(:these)

      @df.labels.should eql([:are, :the, :labels, :these])
      @df.these.should eql([1, 5])
    end
  end

  context "rename!" do
    it "should be able to rename a column" do
      # The new name comes first, the existing name second.
      @df.rename!(:new_name, :these)
      @df.labels.should eql([:new_name, :are, :the, :labels])
    end
  end

  context "drop!" do
    it "should be able to remove a column" do
      @df = DataFrame.new(:twos, :threes, :fours)
      @df.import(Array.new(4) { [2, 3, 4] })

      @df.drop!(:twos)

      @df.items.all? { |row| row.should eql([3, 4]) }
      @df.labels.should eql([:threes, :fours])
    end

    it "should be able to remove more than one column at a time" do
      @df = DataFrame.new(:twos, :threes, :fours)
      @df.import(Array.new(4) { [2, 3, 4] })

      @df.drop!(:twos, :fours)

      @df.items.all? { |row| row.should eql([3]) }
      @df.labels.should eql([:threes])
    end
  end

  context "replace!" do
    before do
      @doubler = lambda { |e| e * 2 }
    end

    it "should only replace columns that actually exist" do
      unknown_column = lambda { @df.replace!(:not_a_column, &@doubler) }
      known_column = lambda { @df.replace!(:these, &@doubler) }

      unknown_column.should raise_error(
        ArgumentError, /Must provide the name of an existing column./)
      known_column.should_not raise_error
    end

    it "should be able to replace a column with a block" do
      @df.replace!(:these) { |e| e * 2 }
      @df.these.should eql([2, 10])
    end

    it "should be able to replace a column with an array" do
      @a = [5, 9]
      @df.replace!(:these, @a)
      @df.these.should eql(@a)
    end
  end

  context "subset_from_columns" do
    it "should be able to create a subset of columns" do
      subset = @df.subset_from_columns(:these, :labels)

      subset.should_not eql(@df)
      subset.labels.should eql([:these, :labels])
      subset.items.should eql([[1, 4], [5, 8]])
      subset.these.should eql([1, 5])
    end
  end
end
@@ -0,0 +1,88 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
describe "Filter" do
  # Two rows; every filter below is written to keep only the second one.
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
    @df.add([1, 2, 3, 4])
    @df.add([5, 6, 7, 8])
  end

  it "should be able to filter a data frame with a block using an OpenStruct for each row" do
    @df.filter!(:open_struct) { |row| row.these == 5 }
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to filter a data frame with a block using a Hash for each row" do
    @df.filter!(:hash) { |row| row[:these] == 5 }
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  # Four-member struct used as the per-row wrapper class in the next example.
  S4 = Struct.new(:one, :two, :three, :four)
  it "should be able to filter a data frame with a block using another class that uses the row as input" do
    @df.filter!(S4) { |row| row.one == 5 }
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to filter a data frame with a block using an array for each row" do
    @df.filter! { |row| row.first == 5 }
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to do fancy things with the row as the filter" do
    @df.filter! { |row| row.sum > 10 }
    remaining = @df.items
    remaining.should eql([[5, 6, 7, 8]])
  end

  it "should be able to generate a new data frame with filter" do
    new_df = @df.filter(:open_struct) { |row| row.these == 5 }

    new_df.items.should eql([[5, 6, 7, 8]])
    # The non-bang form must leave the receiver untouched.
    @df.items.should eql([[1, 2, 3, 4], [5, 6, 7, 8]])
  end
end
44
+
45
# Examples for DataFrame#filter_by_category and its destructive variant.
#
# Dates are built with Date.new(year, month, day) rather than by parsing
# "mm/dd/yyyy" strings: Date.parse reads such strings as day/month/year on
# Ruby 1.9 and later, which raises for days above 12 and silently swaps day
# and month otherwise.
#
# NOTE(review): this context is declared after the `end` that closes
# describe "Filter", so it is a top-level example group -- confirm that is
# intended.
context "filter_by_category" do

  # One row per day of July 2009; every third day (1, 4, 7, ...) is :fair,
  # all other days are :good.
  before do
    @df = DataFrame.new(:weather, :date)

    (1..31).each do |i|
      @df.add [(i % 3 == 1) ? :fair : :good, Date.new(2009, 7, i)]
    end

    @d1 = Date.new(2009, 7, 15)
    @d2 = Date.new(2009, 7, 31)
  end

  it "should be able to filter by category" do
    filtered = @df.filter_by_category(:weather => :good)
    filtered.weather.uniq.should eql([:good])
    # The source frame still holds the rows that were filtered out.
    @df.weather.uniq.should be_include(:fair)
  end

  it "should be able to manage ranges for filter values" do
    filtered = @df.filter_by_category(:date => (@d1..@d2))
    filtered.date.should_not be_include(Date.new(2009, 7, 1))
    filtered.date.should_not be_include(Date.new(2009, 7, 14))
    filtered.date.should be_include(Date.new(2009, 7, 15))
    filtered.date.should be_include(Date.new(2009, 7, 31))
    @df.date.should be_include(Date.new(2009, 7, 1))
  end

  it "should be able to take an array of values to filter with" do
    filtered = @df.filter_by_category(:date => [@d1, @d2])
    filtered.date.should_not be_include(Date.new(2009, 7, 1))
    filtered.date.should be_include(Date.new(2009, 7, 15))
    filtered.date.should be_include(Date.new(2009, 7, 31))
  end

  it "should have a destructive version" do
    @df.filter_by_category!(:date => [@d1, @d2])
    @df.date.should_not be_include(Date.new(2009, 7, 1))
    @df.date.should be_include(Date.new(2009, 7, 15))
    @df.date.should be_include(Date.new(2009, 7, 31))
  end

end
@@ -0,0 +1,41 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
describe "Import" do
  # Each example starts from an empty frame with four labelled columns.
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
  end

  it "should be able to add an item" do
    row = [1, 2, 3, 4]
    @df.add_item(row)
    @df.items.should eql([row])
  end

  it "should be able to import more than one row at a time" do
    rows = [[2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]]
    @df.import(rows)
    @df.row_labels = [:twos, :threes, :fours]

    # With row labels assigned, each row is read back through its label.
    @df.twos.should eql([2, 2, 2, 2])
    @df.threes.should eql([3, 3, 3, 3])
    @df.fours.should eql([4, 4, 4, 4])
  end

  it "should be able to import only one row" do
    @df.import([2, 2, 2, 2])
    @df.these.should eql([2])
  end

  it "should be able to import a reference to csv" do
    # The continuation lines of this literal are kept flush-left so that no
    # indentation ends up inside the CSV text.
    contents = %{7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
}

    @labels = [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
    # Here the labels are handed over as one array rather than splatted.
    @df = DataFrame.new(@labels)
    @df.import(contents)

    @df.x.should eql([7, 7])
    @df.area.should eql([0, 0])
  end
end
@@ -0,0 +1,71 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
describe "PreProcess" do
  it "should be able to j_binary_ize! a column, taking its categories and creating a column for each" do
    df = DataFrame.new(:observations)
    [:many, :fine, :things, :are, :available].each { |word| df.add([word]) }

    df.j_binary_ize!(:observations)

    # One boolean column per distinct value, named <column>_<value>.
    df.observations_many.should eql([true, false, false, false, false])
    df.observations_fine.should eql([false, true, false, false, false])
    df.observations_things.should eql([false, false, true, false, false])
    df.observations_are.should eql([false, false, false, true, false])
    df.observations_available.should eql([false, false, false, false, true])
    # The source column itself is left in place.
    df.observations.should eql([:many, :fine, :things, :are, :available])
  end

  it "should be able to j_binary_ize! a more normal column" do
    df = DataFrame.new(:observations)
    df.import([1, 2, 3, 4, 5, 4, 3, 2, 1].map { |e| [e] })
    # The two categories overlap at 3.
    df.observations.add_category(:small) { |e| e <= 3 }
    df.observations.add_category(:large) { |e| e >= 3 }

    df.j_binary_ize!(:observations)

    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, false, true, true, true, false, false, false])
  end

  it "should be able to j_binary_ize with non-adjacent sets (sets that allow a value to have more than one category)" do
    df = DataFrame.new(:observations)
    df.import([1, 2, 3, 4, 5, 4, 3, 2, 1].map { |e| [e] })
    df.observations.add_category(:small) { |e| e <= 3 }
    df.observations.add_category(:large) { |e| e >= 3 }

    df.j_binary_ize!(:observations, :allow_overlap => true)

    # With overlap allowed, 3 is flagged in both columns.
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
  end

  it "should be able to hold multiple ideas of a columns categories by resetting the category and re-running j_binary_ize" do
    df = DataFrame.new(:observations)
    df.import([1, 2, 3, 4, 5, 4, 3, 2, 1].map { |e| [e] })
    df.observations.add_category(:small) { |e| e <= 3 }
    df.observations.add_category(:large) { |e| e >= 3 }
    df.j_binary_ize!(:observations, :allow_overlap => true)

    # Swap in a second category scheme and expand the column again.
    df.observations.set_categories(:odd => lambda { |e| e.odd? }, :even => lambda { |e| e.even? })
    df.j_binary_ize!(:observations)

    # Columns from the first pass are still present and unchanged.
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
    df.observations.should eql([1, 2, 3, 4, 5, 4, 3, 2, 1])
    df.observations_even.should eql([false, true, false, true, false, true, false, true, false])
    df.observations_odd.should eql([true, false, true, false, true, false, true, false, true])
  end

  context "numericize!" do
    # Non-overlapping categories: small is <= 3, large is > 3.
    before do
      @df = DataFrame.new(:observations)
      @df.import([1, 2, 3, 4, 5, 4, 3, 2, 1].map { |e| [e] })
      @df.observations.add_category(:small) { |e| e <= 3 }
      @df.observations.add_category(:large) { |e| e > 3 }
    end

    it "should be able to numericize nominal data" do
      @df.numericize!(:observations)

      small = [1, 0]
      large = [0, 1]
      expected = [small, small, small, large, large, large, small, small, small]
      @df.numerical_observations.should eql(expected)
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Examples for DataFrame#save: each one writes to @filename and compares the
# file contents with the matching to_csv output.
describe "Saving" do
  before do
    @df = DataFrame.new(:observations)
    @df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    @df.observations.add_category(:small) {|e| e <= 3}
    @df.observations.add_category(:large) {|e| e > 3}
    @filename = "/tmp/numericized_observations"
  end

  # Remove the output file through the File API instead of shelling out to
  # `rm -rf` with an interpolated path; no subprocess is needed for this.
  after do
    File.delete(@filename) if File.exist?(@filename)
  end

  it "should be able to save the data frame" do
    @df.numericize!(:observations)
    @df.save(@filename)
    File.read(@filename).should eql(@df.to_csv)
  end

  it "should be able to save the data frame without the header" do
    @df.save(@filename, :include_header => false)
    File.read(@filename).should eql(@df.to_csv(false))
  end

  it "should be able to save off a subset" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    @df.save(@filename, :subset => :observations)
    File.read(@filename).should eql(@df.subset_from_columns(:observations).to_csv)
  end

  it "should be able to filter the rows" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    # NOTE(review): this first save is overwritten by the save below before
    # anything reads the file -- presumably a leftover; confirm.
    @df.save(@filename, :subset => :observations)
    @df.observations.add_category(:small) {|e| e <= 3}
    @df.observations.add_category(:large) {|e| e > 3}
    @df.save(@filename, :filter_by_category => {:observations => :small})
    File.read(@filename).should eql(@df.filter_by_category(:observations => :small).to_csv)
  end

  it "should have a shortcut for subset, only" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    @df.save(@filename, :only => :observations)
    File.read(@filename).should eql(@df.subset_from_columns(:observations).to_csv)
  end

  it "should have a shortcut for filter_by_category, filter" do
    @df = DataFrame.new(:observations, :junk)
    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
    # NOTE(review): this first save is overwritten by the save below before
    # anything reads the file -- presumably a leftover; confirm.
    @df.save(@filename, :subset => :observations)
    @df.observations.add_category(:small) {|e| e <= 3}
    @df.observations.add_category(:large) {|e| e > 3}
    @df.save(@filename, :filter => {:observations => :small})
    File.read(@filename).should eql(@df.filter_by_category(:observations => :small).to_csv)
  end

end
@@ -0,0 +1,51 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
describe "Training" do
  # A single-column frame imported from the integers 0 through 99.
  before do
    @df = DataFrame.new(:one)
    @df.import((0...100).to_a)
  end

  it "should be able to create a proportional training set from a data frame" do
    @df.training_set(:n => 3)

    @df.training_set.size.should eql(3)
    # Every sampled row must come from the frame's own items.
    @df.training_set.all? { |row| @df.items.should be_include(row) }
  end

  it "should use the same training set unless reset is passed to it" do
    @df.training_set(:n => 5)
    @df.training_set.should eql(@df.training_set)

    old = @df.training_set
    @df.training_set(:reset => true, :n => 5)
    @df.training_set.should_not eql(old)
  end

  it "should be able to create a proportional training set" do
    @df.training_set(:proportion => 0.6)
    @df.training_set.size.should eql(60)

    @df.training_set(:proportion => 0.42, :reset => true)
    @df.training_set.size.should eql(42)

    @df.training_set(:proportion => 0, :reset => true)
    @df.training_set.size.should eql(0)

    @df.training_set(:proportion => 1, :reset => true)
    @df.training_set.size.should eql(100)
  end

  it "should not have a set size exceeding the items size" do
    # Requests larger than the frame are capped at its 100 items.
    @df.training_set(:proportion => 2)
    @df.training_set.size.should eql(100)

    @df.training_set(:n => 200, :reset => true)
    @df.training_set.size.should eql(100)
  end

  it "should not have any items when the proportion is calculated below 0" do
    # Negative requests produce an empty set.
    @df.training_set(:proportion => -2)
    @df.training_set.size.should eql(0)

    @df.training_set(:n => -2, :reset => true)
    @df.training_set.size.should eql(0)
  end

  it "should have a default proportion of 80%" do
    default_set = @df.training_set
    default_set.size.should eql(80)
  end
end