data_frame 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/README.rdoc +122 -0
  2. data/VERSION.yml +4 -0
  3. data/bin/plain_frame +22 -0
  4. data/lib/data_frame.rb +26 -0
  5. data/lib/data_frame/arff.rb +52 -0
  6. data/lib/data_frame/callback_array.rb +152 -0
  7. data/lib/data_frame/core/column_management.rb +147 -0
  8. data/lib/data_frame/core/filter.rb +48 -0
  9. data/lib/data_frame/core/import.rb +113 -0
  10. data/lib/data_frame/core/pre_process.rb +69 -0
  11. data/lib/data_frame/core/saving.rb +29 -0
  12. data/lib/data_frame/core/training.rb +46 -0
  13. data/lib/data_frame/data_frame.rb +115 -0
  14. data/lib/data_frame/id3.rb +28 -0
  15. data/lib/data_frame/kmeans.rb +10 -0
  16. data/lib/data_frame/labels_from_uci.rb +48 -0
  17. data/lib/data_frame/mlp.rb +18 -0
  18. data/lib/data_frame/model.rb +22 -0
  19. data/lib/data_frame/parameter_capture.rb +50 -0
  20. data/lib/data_frame/sbn.rb +18 -0
  21. data/lib/data_frame/transposable_array.rb +23 -0
  22. data/lib/ext/array.rb +11 -0
  23. data/lib/ext/open_struct.rb +5 -0
  24. data/lib/ext/string.rb +5 -0
  25. data/lib/ext/symbol.rb +5 -0
  26. data/spec/data_frame/arff_spec.rb +48 -0
  27. data/spec/data_frame/callback_array_spec.rb +148 -0
  28. data/spec/data_frame/core/column_management_spec.rb +128 -0
  29. data/spec/data_frame/core/filter_spec.rb +88 -0
  30. data/spec/data_frame/core/import_spec.rb +41 -0
  31. data/spec/data_frame/core/pre_process_spec.rb +103 -0
  32. data/spec/data_frame/core/saving_spec.rb +61 -0
  33. data/spec/data_frame/core/training_spec.rb +72 -0
  34. data/spec/data_frame/data_frame_spec.rb +141 -0
  35. data/spec/data_frame/id3_spec.rb +22 -0
  36. data/spec/data_frame/model_spec.rb +36 -0
  37. data/spec/data_frame/parameter_capture_spec.rb +32 -0
  38. data/spec/data_frame/transposable_array_spec.rb +138 -0
  39. data/spec/data_frame_spec.rb +29 -0
  40. data/spec/ext/array_spec.rb +13 -0
  41. data/spec/fixtures/basic.csv +3 -0
  42. data/spec/fixtures/discrete_testing.csv +4 -0
  43. data/spec/fixtures/discrete_training.csv +21 -0
  44. data/spec/spec_helper.rb +8 -0
  45. metadata +128 -0
@@ -0,0 +1,10 @@
1
module DF #:nodoc:
  # Placeholder for KMeans clustering of the data set.
  # The module currently defines no methods of its own.
  module KMeans; end
end

# Mixes the (currently empty) KMeans behaviour into every DataFrame.
class DataFrame
  include DF::KMeans
end
@@ -0,0 +1,48 @@
1
require 'open-uri'

# The University of California - Irvine has a great set of machine
# learning sample data sets, found here: http://archive.ics.uci.edu/ml/
# Their data description pages have field label descriptors. This class
# extracts them and can return a DataFrame with the labels of a data set.
#
# Turns out, this isn't very useful. So...oh well.
#
# To use this class:
#   require 'lib/data_frame/labels_from_uci'
#   df = LabelsFromUCI.data_frame 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
#   df.import('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data')
class LabelsFromUCI

  class << self
    # Returns the Array of label names found in the document at +url+.
    def process(url)
      new(url).labels
    end

    # Returns DataFrame.new called with the Array of labels found at +url+.
    def data_frame(url)
      DataFrame.new(new(url).labels)
    end
  end

  # url::      the source that was given to the constructor
  # contents:: the raw text read from that source
  # labels::   the attribute names extracted from the text
  attr_reader :url, :contents, :labels

  # Reads +url+ (a remote URL or a local file path) and extracts its labels.
  # Raises whatever the underlying read raises (e.g. Errno::ENOENT for a
  # missing local file).
  def initialize(url)
    @url = url
    @contents = read_source(url)
    process_labels
  end

  protected
  # Collects the first "@attribute <name>" match of every line into @labels.
  def process_labels
    @labels = []
    @contents.each_line do |line|
      name = line[label_re, 1]
      @labels << name if name
    end
  end

  # Pattern that captures the attribute name of a label declaration.
  def label_re
    /@attribute (\w+)/
  end

  # Reads the whole source into a String.
  #
  # Only strings that start with a URI scheme ("http://...") go through
  # open-uri. Everything else is opened with File.open, so a value such as
  # "| some command" is treated as a (missing) file name rather than being
  # executed as a subprocess, which is what a bare Kernel#open would do.
  def read_source(source)
    if source.to_s =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
      if URI.respond_to?(:open)
        URI.open(source) { |io| io.read }
      else
        # Older Rubies: open-uri patches Kernel#open instead of adding URI.open.
        open(source) { |io| io.read }
      end
    else
      File.open(source) { |io| io.read }
    end
  end
end
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Red Davis' MLP classifier.
  #
  # The reddavis-mlp gem must be installed. When it is missing, an
  # installation hint is printed to stderr and the LoadError is re-raised.
  #
  # The previous fallback shelled out to `sudo gem install` from a bare
  # `rescue`. A bare rescue only handles StandardError, while a missing gem
  # raises LoadError (a ScriptError), so that branch could never run.
  # Installing software with sudo as a side effect of loading a file is
  # also unsafe, so the dead branch was replaced by the hint.
  module MLP
    begin
      gem 'reddavis-mlp'
      require 'mlp'
    rescue LoadError
      warn "DF::MLP requires the reddavis-mlp gem. Install it with: gem install reddavis-mlp"
      raise
    end
  end
end

# Mixes the MLP feeder into every DataFrame.
class DataFrame
  include DF::MLP
end
@@ -0,0 +1,22 @@
1
# Adds the model methods to the data frame.
class DataFrame

  # Container holding every model defined on this data frame, which gives
  # access like: df.models.new_model_name...
  def models
    @models ||= OpenStruct.new
  end

  # Looks a model up by name, or defines it.
  # * Returns the stored model when +name+ is already defined.
  # * Returns false when it is not defined and no block is given.
  # * Otherwise captures the block with ParameterCapture, filters the rows
  #   (cast to Hash) through it, stores the result under +name+ and
  #   returns it.
  def model(name=nil, &block)
    registry = models.table
    if registry.keys.include?(name)
      models[name]
    elsif block
      @pc = ParameterCapture.new(&block)
      registry[name] = filter(Hash) { |row| @pc.filter(row) }
    else
      false
    end
  end

end
@@ -0,0 +1,50 @@
1
# Captures the intent of a model definition in a block. Usage:
#   pc = ParameterCapture.new do |p|
#     p.whatever :some_value
#     p.another :one
#     p.or_list [1, 2]
#     p.or_range (1..2)
#   end
#   pc.parameters
#   # => {:whatever => :some_value, :another => :one, :or_list => [1,2], :or_range => (1..2)}
class ParameterCapture
  # Evaluates the block against this instance so that every unknown method
  # call inside it is recorded as a parameter. A missing block simply
  # leaves the capture empty.
  def initialize(&block)
    instance_eval(&block) if block
  end

  # The OpenStruct holding the captured parameters.
  def parameters
    @parameters ||= OpenStruct.new
  end

  # Exposes the set keys
  def keys
    parameters.table.keys
  end

  # Can be used in a data_frame filter:
  #   @pc.filter(row)   # using a Hash as the cast type for the filter
  # Returns true when +row+ satisfies every captured parameter:
  # * Array and Range parameters must include row[key]
  # * anything else is compared with ===
  def filter(row)
    parameters.table.all? do |key, expected|
      candidate = row[key]
      case expected
      when Array, Range
        expected.include?(candidate)
      else
        expected === candidate
      end
    end
  end

  # Records or reads a parameter.
  # * A key that is already set returns its stored value (the first
  #   definition wins; later arguments are ignored).
  # * One argument stores that value; any other count stores the whole
  #   argument list.
  # The table is read directly instead of going through send, because a
  # key such as :format or :select would otherwise dispatch to the private
  # Kernel method of the same name instead of returning the stored value.
  def method_missing(key, *values, &block)
    stored = parameters.table
    if stored.key?(key)
      stored[key]
    elsif values.size == 1
      stored[key] = values.first
    else
      stored[key] = values
    end
  end

  # Only keys that have been captured are reported. Answering true for
  # every name would make implicit conversions (to_ary, to_str, ...) land
  # in method_missing and be recorded as bogus parameters.
  def respond_to_missing?(name, include_private = false)
    parameters.table.key?(name) || super
  end
end
@@ -0,0 +1,18 @@
1
module DF #:nodoc:
  # Turns Data Frame into a feeder for Carl Youngblood's Simple Bayesian classifier.
  #
  # The sbn gem must be installed. When it is missing, an installation
  # hint is printed to stderr and the LoadError is re-raised.
  #
  # The previous fallback shelled out to `sudo gem install` from a bare
  # `rescue`. A bare rescue only handles StandardError, while a missing gem
  # raises LoadError (a ScriptError), so that branch could never run.
  # Installing software with sudo as a side effect of loading a file is
  # also unsafe, so the dead branch was replaced by the hint.
  module SBN
    begin
      gem 'sbn'
      require 'sbn'
    rescue LoadError
      warn "DF::SBN requires the sbn gem. Install it with: gem install sbn"
      raise
    end
  end
end

# Mixes the SBN feeder into every DataFrame.
class DataFrame
  include DF::SBN
end
@@ -0,0 +1,23 @@
1
# The only trick in this array is that its transpose is memoized. The
# cached value is dropped by the after_taint callback registered below,
# which should reduce computations elegantly.
class TransposableArray < CallbackArray

  after_taint :clear_cache

  # Keep a handle on the inherited transpose so the override can delegate.
  inherited_transpose = instance_method(:transpose)
  define_method(:transpose) do
    # NOTE(review): untaint presumably comes from CallbackArray - confirm.
    untaint
    @transpose = inherited_transpose.bind(self).call unless @transpose
    @transpose
  end

  # For debugging and testing purposes, it just feels dirty to always ask
  # for @ta.send(:instance_variable_get, :@transpose)
  def cache
    @transpose
  end

  protected

  # Forgets the memoized transpose.
  def clear_cache
    @transpose = nil
  end
end
@@ -0,0 +1,11 @@
1
class Array
  # Reports how deeply the array is nested, following only the first
  # element at each level:
  #   [1,2,3].dimensions                == 1
  #   [[1,2,3], [1,2,3]].dimensions     == 2
  #   [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3], [[1,2,3], [1,2,3]]]].dimensions == 3
  # +n+ is the depth accumulated so far by the recursion.
  def dimensions(n=0)
    depth = n + 1
    head = first
    return depth unless head.is_a?(Array)
    head.dimensions(depth)
  end
end
@@ -0,0 +1,5 @@
1
class OpenStruct
  # Public reader for the underlying @table hash.
  attr_reader :table
end
@@ -0,0 +1,5 @@
1
class String # :nodoc:
  # Titleizes the string, strips all whitespace, underscores the result
  # and returns it as a Symbol. Relies on String#titleize and
  # String#underscore being provided elsewhere.
  def to_underscore_sym
    squashed = titleize.gsub(/\s+/, '')
    squashed.underscore.to_sym
  end
end
@@ -0,0 +1,5 @@
1
class Symbol # :nodoc:
  # Converts the symbol's text: titleized, whitespace removed, underscored,
  # and turned back into a Symbol. Relies on String#titleize and
  # String#underscore being provided elsewhere.
  def to_underscore_sym
    text = to_s
    squashed = text.titleize.gsub(/\s+/, '')
    squashed.underscore.to_sym
  end
end
@@ -0,0 +1,48 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+ require 'data_frame/arff'
3
+
4
# Exercises the ARFF and CSV exports of a data frame built from the
# basic.csv fixture.
describe "ARFF" do
  # Expected CSV output, split into its header line and its data lines.
  header_line = "x,y,month,day,ffmc,dmc,dc,isi,temp,rh,wind,rain,area\n"
  data_lines  = "7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0\n" +
                "7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0\n"

  before do
    fixtures_dir = File.join(File.dirname(__FILE__), '..', 'fixtures')
    @df = DataFrame.from_csv(File.expand_path(File.join(fixtures_dir, 'basic.csv')))
  end

  it "should allow a data frame to be expressed as an arff-formatted file" do
    @df.to_arff.should eql(basic_arff)
  end

  it "should add a to_csv method" do
    @df.to_csv.should eql(header_line + data_lines)
  end

  it "should allow a non-header export for to_csv" do
    @df.to_csv(false).should eql(data_lines)
  end
end
26
+
27
# The ARFF document expected for the basic.csv fixture: a relation line,
# one @attribute line per column listing its values, and the data rows.
def basic_arff
  attributes = [
    ['x',     '7'],
    ['y',     '4,5'],
    ['month', 'mar,oct'],
    ['day',   'fri,tue'],
    ['ffmc',  '86.2,90.6'],
    ['dmc',   '26.2,35.4'],
    ['dc',    '94.3,669.1'],
    ['isi',   '5.1,6.7'],
    ['temp',  '8.2,18'],
    ['rh',    (33..51).to_a.join(',')],
    ['wind',  '0.9,6.7'],
    ['rain',  '0'],
    ['area',  '0']
  ]
  rows = [
    '7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0',
    '7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0'
  ]

  document = "@relation basic\n\n"
  attributes.each do |name, values|
    document << "@attribute #{name} {#{values}}\n"
  end
  document << "\n@data\n"
  rows.each { |row| document << row << "\n" }
  document
end
@@ -0,0 +1,148 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
+ # The TransposableArray spec already tests the after_taint callback
4
+ # thoroughly, so only the other callbacks are tested here.
5
# Class-level tally of how many times each callback has fired.
class Register
  # Bumps the tally for +meth+ and returns the new total.
  def self.next(meth)
    tallies = (@@count ||= {})
    tallies[meth] = (tallies[meth] || 0) + 1
  end

  # Current tally for +meth+, or nil when it has never been bumped.
  def self.for(meth)
    (@@count ||= {})[meth]
  end
end
16
+
17
# CallbackArray subclass whose callbacks only record that they ran.
class A < CallbackArray
  [:before_taint, :before_untaint, :after_untaint].each do |hook|
    # Register the callback, then define the method it names; the method
    # bumps the Register tally kept under the hook's own name.
    send(hook, :"register_#{hook}")
    define_method(:"register_#{hook}") do
      Register.next(hook)
    end
  end
end
33
+
34
# Verifies that the before_taint, before_untaint and after_untaint
# callbacks declared on A fire, using the Register tallies as evidence.
describe CallbackArray do
  before do
    @a = A.new [1,2,3]
  end

  context "before_taint" do
    before do
      @c = Register.for(:before_taint) || 0
    end

    # Every example below must fire the callback exactly once and leave
    # the array tainted.
    after do
      Register.for(:before_taint).should eql(@c + 1)
      @a.should be_tainted
    end

    # Description suffix paired with the operation that should trigger
    # the callback.
    [
      ["taint",      lambda { |a| a.taint }],
      [":[]=",       lambda { |a| a[0] = 2 }],
      [":<<",        lambda { |a| a << 3 }],
      [":delete",    lambda { |a| a.delete(2) }],
      [":push",      lambda { |a| a.push(5) }],
      [":pop",       lambda { |a| a.pop }],
      [":shift",     lambda { |a| a.shift }],
      [":unshift",   lambda { |a| a.unshift(6) }],
      [":map!",      lambda { |a| a.map! {|e| e} }],
      [":sort!",     lambda { |a| a.sort! }],
      [":reverse!",  lambda { |a| a.reverse! }],
      [":collect!",  lambda { |a| a.collect! {|e| e} }],
      [":compact!",  lambda { |a| a.compact! }],
      [":reject!",   lambda { |a| a.reject! {|e| not e} }],
      [":slice!",    lambda { |a| a.slice!(1,2) }],
      [":flatten!",  lambda { |a| a.flatten! }],
      [":uniq!",     lambda { |a| a.uniq! }],
      [":clear",     lambda { |a| a.clear }]
    ].each do |label, operation|
      it "should callback before #{label}" do
        operation.call(@a)
      end
    end
  end

  it "should not adjust the array in other methods" do
    # Register is shared class-level state, so compare against a snapshot
    # instead of expecting nil; that expectation only holds when this
    # example runs before every taint example.
    count_before = Register.for(:before_taint)
    @a.at(0)
    @a.sort
    @a.uniq
    @a.find{|e| e}
    Register.for(:before_taint).should eql(count_before)
    @a.should_not be_tainted
  end

  it "should callback before untaint" do
    c = Register.for(:before_untaint) || 0
    @a.taint
    @a.untaint
    Register.for(:before_untaint).should eql(c + 1)
  end

  it "should callback after untaint" do
    c = Register.for(:after_untaint) || 0
    @a.taint
    @a.untaint
    Register.for(:after_untaint).should eql(c + 1)
  end

end
148
+
@@ -0,0 +1,128 @@
1
+ require File.join(File.dirname(__FILE__), "/../../spec_helper")
2
+
3
# Covers the column-level operations of a data frame: append!,
# move_to_last!, rename!, drop!, replace!, subset_from_columns, duplicate!.
describe "Column Management" do
  # A four-column frame with two rows: [1,2,3,4] and [5,6,7,8].
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
    [[1,2,3,4], [5, 6, 7, 8]].each do |row|
      @df.add row
    end
  end

  context "append!" do
    it "should be able to append an array of values to the data frame" do
      @df.append!(:new_column, [5,5])
      appended = @df.new_column
      appended.should eql([5,5])
    end

    it "should be able to append a default value to the data frame" do
      @df.append!(:new_column, :value)
      appended = @df.new_column
      appended.should eql([:value, :value])
    end

    it "should use nil as the default value" do
      @df.append!(:new_column)
      appended = @df.new_column
      appended.should eql([nil, nil])
    end
  end

  context "move_to_last!" do
    it "should be able to move a column to the end of the data frame, useful for dependency models" do
      @df.labels.should eql(@labels)
      @df.move_to_last!(:these)
      @df.labels.should eql([:are, :the, :labels, :these])
      @df.these.should eql([1,5])
    end
  end

  context "rename!" do
    it "should be able to rename a column" do
      @df.rename!(:new_name, :these)
      @df.labels.should eql([:new_name, :are, :the, :labels])
    end

    it "should be able to use the new column name with dot notation" do
      snapshot = @df.these.dup
      @df.rename!(:new_name, :these)
      @df.new_name.should eql(snapshot)
    end
  end

  context "drop!" do
    # Both examples work on a three-column frame with four identical rows;
    # every row is its own array.
    before do
      @df = DataFrame.new :twos, :threes, :fours
      @df.import(Array.new(4) { [2,3,4] })
    end

    it "should be able to remove a column" do
      @df.drop!(:twos)
      @df.items.all? do |item|
        item.should eql([3,4])
      end
      @df.labels.should eql([:threes, :fours])
    end

    it "should be able to remove more than one column at a time" do
      @df.drop!(:twos, :fours)
      @df.items.all? do |item|
        item.should eql([3])
      end
      @df.labels.should eql([:threes])
    end

  end

  context "replace!" do
    before do
      @doubler = lambda{|e| e * 2}
    end

    it "should only replace columns that actually exist" do
      replace_missing  = lambda{@df.replace!(:not_a_column, &@doubler)}
      replace_existing = lambda{@df.replace!(:these, &@doubler)}
      replace_missing.should raise_error(
        ArgumentError, /Must provide the name of an existing column./)
      replace_existing.should_not raise_error
    end

    it "should be able to replace a column with a block" do
      @df.replace!(:these) {|e| e * 2}
      @df.these.should eql([2,10])
    end

    it "should be able to replace a column with an array" do
      @a = [5,9]
      @df.replace!(:these, @a)
      @df.these.should eql(@a)
    end
  end

  context "subset_from_columns" do

    it "should be able to create a subset of columns" do
      subset = @df.subset_from_columns(:these, :labels)
      subset.should_not eql(@df)
      subset.labels.should eql([:these, :labels])
      subset.items.should eql([[1,4],[5,8]])
      subset.these.should eql([1,5])
    end
  end

  context "duplicate!" do
    it "should be able to duplicate a column" do
      @df.duplicate!(:these)
      @df.these1.should eql(@df.these)
    end

    it "should use unique names for the duplicate column" do
      3.times { @df.duplicate!(:these) }
      @df.these3.should eql(@df.these2)
      @df.these2.should eql(@df.these1)
      @df.these1.should eql(@df.these)
    end

    it "should reset the labels list when a column is duplicated" do
      @df.duplicate!(:these)
      @df.labels.should be_include(:these1)
    end

    it "should return true, rather than the whole data set" do
      outcome = @df.duplicate!(:these)
      outcome.should eql(true)
    end
  end

end