davidrichards-data_frame 0.0.15 → 0.0.17

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,22 @@
1
# Adds the model methods to the data frame.
class DataFrame

  # Looks up, or defines, a named model. A model is the result of running
  # this data frame's +filter+ with the criteria captured from the block.
  #
  # * If a model called +name+ is already stored, it is returned and any
  #   block is ignored.
  # * Otherwise, if a block is given, it is captured with ParameterCapture,
  #   the rows are filtered through it (rows are requested as Hash), and
  #   the result is stored in the models container and returned. That gives
  #   us access like:
  #     df.models.new_model_name...
  # * Otherwise false is returned.
  #
  # +name+ may be a Symbol or a String; it is normalized to a Symbol so the
  # stored model is always reachable through the OpenStruct accessors.
  #
  # Raises ArgumentError when a block is given without a name.
  #
  # NOTE(review): relies on OpenStruct#table being publicly callable, as the
  # rest of this project does -- confirm the extension that exposes it.
  def model(name=nil, &block)
    # String keys would be invisible to df.models.some_name, so use Symbols.
    key = name.respond_to?(:to_sym) ? name.to_sym : name
    registry = self.models.table

    return registry[key] if registry.key?(key)
    return false unless block
    raise ArgumentError, "Must provide a name to define a model." if key.nil?

    # A local (not an instance variable) keeps the capture out of the data
    # frame's state and visible to the block whatever receiver runs it.
    capture = ParameterCapture.new(&block)
    registry[key] = self.filter(Hash) { |row| capture.filter(row) }
  end

  # The container holding every model defined so far, created on first use.
  def models
    @models ||= OpenStruct.new
  end

end
@@ -0,0 +1,50 @@
1
# Captures the intent of a model definition in a block. Usage:
#   pc = ParameterCapture.new do |p|
#     p.whatever :some_value
#     p.another :one
#     p.or_list [1, 2]
#     p.or_range (1..2)
#   end
#   pc.parameters
#   # => an OpenStruct holding
#   #    {:whatever => :some_value, :another => :one, :or_list => [1,2], :or_range => (1..2)}
#
# NOTE(review): relies on OpenStruct#table being publicly callable, as the
# rest of this project does -- confirm the extension that exposes it.
class ParameterCapture
  # Evaluates the block against this instance, so every unknown method
  # called with arguments inside it is recorded as a parameter. The instance
  # is also passed as the block argument. Without a block nothing is
  # captured and the parameters stay empty.
  def initialize(&block)
    instance_eval(&block) if block
  end

  # The captured parameters, created on first use.
  def parameters
    @parameters ||= OpenStruct.new
  end

  # Exposes the set keys
  def keys
    parameters.table.keys
  end

  # can be used in a data_frame filter.
  # @pc.filter(row) Using a Hash as a cast type for the filter.
  #
  # Returns true when +row+ satisfies every captured parameter, false
  # otherwise (true when nothing was captured). For each key, row[key] must
  # be included in the captured value when that value is an Array or a
  # Range; any other captured value is compared with ===.
  def filter(row)
    captured = parameters.table
    captured.keys.all? do |key|
      expected = captured[key]
      actual = row[key]
      case expected
      when Array, Range
        expected.include?(actual)
      else
        expected === actual
      end
    end
  end

  # Records or reads a parameter:
  # * a key that is already set returns its value (arguments are ignored);
  # * otherwise one argument is stored as is, several are stored as an Array;
  # * an unknown key called without arguments is a genuine missing method
  #   and raises NoMethodError, instead of silently storing an empty list
  #   that would make #filter reject every row.
  def method_missing(key, *values, &block)
    captured = parameters.table
    if captured.key?(key)
      # Read straight from the table so keys that collide with OpenStruct's
      # own methods still return the captured value.
      captured[key]
    elsif values.empty?
      super
    else
      captured[key] = values.size == 1 ? values.first : values
    end
  end

  # Keeps respond_to? truthful for the readers method_missing provides.
  def respond_to_missing?(key, include_private = false)
    parameters.table.key?(key) || super
  end
end
@@ -0,0 +1,341 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
# Behaviour of the core DataFrame: construction, row and column access,
# import/export, column replacement, filtering, subsetting, category
# expansion (j_binary_ize!) and appending columns.
describe DataFrame do

  # Every example starts from an empty four-column data frame.
  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
  end

  it "should initialize with labels" do
    df = DataFrame.new(*@labels)
    df.labels.should eql(@labels)
  end

  it "should initialize with an empty items list" do
    @df.items.should be_is_a(TransposableArray)
    @df.items.should be_empty
  end

  it "should be able to add an item" do
    item = [1,2,3,4]
    @df.add_item(item)
    @df.items.should eql([item])
  end

  it "should use just_enumerable_stats" do
    [1,2,3].std.should eql(1)
    lambda{[1,2,3].cor([2,3,5])}.should_not raise_error
  end

  context "column and row operations" do
    before do
      @df.add_item([1,2,3,4])
      @df.add_item([5,6,7,8])
      @df.add_item([9,10,11,12])
    end

    it "should have a method for every label, the column in the data frame" do
      @df.these.should eql([1,5,9])
    end

    it "should make columns easily computable" do
      @df.these.std.should eql([1,5,9].std)
    end

    it "should defer unknown methods to the items in the data frame" do
      @df[0].should eql([1,2,3,4])
      @df << [13,14,15,16]
      @df.last.should eql([13,14,15,16])
      @df.map { |e| e.sum }.should eql([10,26,42,58])
    end

    it "should allow optional row labels" do
      @df.row_labels.should eql([])
    end

    it "should have a setter for row labels" do
      @df.row_labels = [:other, :things, :here]
      @df.row_labels.should eql([:other, :things, :here])
    end

    it "should be able to access rows by their labels" do
      @df.row_labels = [:other, :things, :here]
      @df.here.should eql([9,10,11,12])
    end

    it "should make rows easily computable" do
      @df.row_labels = [:other, :things, :here]
      @df.here.sum.should eql(42)
    end
  end

  it "should be able to import more than one row at a time" do
    @df.import([[2,2,2,2],[3,3,3,3],[4,4,4,4]])
    @df.row_labels = [:twos, :threes, :fours]
    @df.twos.should eql([2,2,2,2])
    @df.threes.should eql([3,3,3,3])
    @df.fours.should eql([4,4,4,4])
  end

  context "csv" do
    it "should compute easily from csv" do
      contents = %{X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
}
      labels = [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]

      @df = DataFrame.from_csv(contents)
      @df.labels.should eql(labels)
      @df.x.should eql([7,7])
      @df.area.should eql([0,0])
    end
  end

  it "should be able to remove a column" do
    @df = DataFrame.new :twos, :threes, :fours
    @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
    @df.drop!(:twos)
    # each (not all?) so every row is checked whatever the expectation returns.
    @df.items.each {|i| i.should eql([3,4])}
    @df.labels.should eql([:threes, :fours])
  end

  it "should be able to remove more than one column at a time" do
    @df = DataFrame.new :twos, :threes, :fours
    @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
    @df.drop!(:twos, :fours)
    @df.items.each {|i| i.should eql([3])}
    @df.labels.should eql([:threes])
  end

  it "should offer a hash-like structure of columns" do
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
    @df.columns[:these].should eql([1, 5])
    @df.columns[:are].should eql([2, 6])
    @df.columns[:the].should eql([3, 7])
    @df.columns[:labels].should eql([4, 8])
  end

  it "should alias items with rows" do
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
    @df.rows.should eql(@df.items)
  end

  it "should be able to export a hash" do
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
    hash = @df.to_hash
    values = [[1,5],[2,6],[3,7],[4,8]]
    hash.keys.size.should eql(@labels.size)
    # each (not all?) so every key and value is checked.
    hash.keys.each {|e| @labels.should be_include(e)}
    hash.values.size.should eql(@labels.size)
    hash.values.each {|e| values.should be_include(e)}
  end

  it "should use variables like labels" do
    @df.labels.should eql(@labels)
    @df.variables.should eql(@labels)
  end

  context "replace!" do
    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
      @doubler = lambda{|e| e * 2}
    end

    it "should only replace columns that actually exist" do
      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
        ArgumentError, /Must provide the name of an existing column./)
      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
    end

    it "should be able to replace a column with a block" do
      @df.replace!(:these) {|e| e * 2}
      @df.these.should eql([2,10])
    end

    it "should be able to replace a column with an array" do
      @a = [5,9]
      @df.replace!(:these, @a)
      @df.these.should eql(@a)
    end
  end

  context "filter!" do
    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
    end

    it "should be able to filter a data frame with a block using an OpenStruct for each row" do
      @df.filter!(:open_struct) {|row| row.these == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to filter a data frame with a block using a Hash for each row" do
      @df.filter!(:hash) {|row| row[:these] == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to filter a data frame with a block using another class that uses the row as input" do
      # A local Struct class: a constant defined inside this block would leak
      # into the global namespace.
      row_class = Struct.new(:one, :two, :three, :four)
      @df.filter!(row_class) {|row| row.one == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to filter a data frame with a block using an array for each row" do
      @df.filter! {|row| row.first == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to do fancy things with the row as the filter" do
      @df.filter! {|row| row.sum > 10}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to generate a new data frame with filter" do
      new_df = @df.filter(:open_struct) {|row| row.these == 5}
      new_df.items.should eql([[5, 6, 7, 8]])
      @df.items.should eql([[1, 2, 3, 4], [5, 6, 7, 8]])
    end

  end

  context "filter_by_category" do

    # One row per day of July 2009. Dates are built with Date.new because
    # parsing "07/15/2009" is locale-ambiguous: Ruby 1.9+ reads it as
    # day/month/year and rejects it.
    before do
      @df = DataFrame.new(:weather, :date)

      (1..31).each do |i|
        @df.add [(i % 3 == 1) ? :fair : :good, Date.new(2009, 7, i)]
      end

      @d1 = Date.new(2009, 7, 15)
      @d2 = Date.new(2009, 7, 31)

    end

    it "should be able to filter by category" do
      filtered = @df.filter_by_category(:weather => :good)
      filtered.weather.uniq.should eql([:good])
      @df.weather.uniq.should be_include(:fair)
    end

    it "should be able to manage ranges for filter values" do
      filtered = @df.filter_by_category(:date => (@d1..@d2))
      filtered.date.should_not be_include(Date.new(2009, 7, 1))
      filtered.date.should_not be_include(Date.new(2009, 7, 14))
      filtered.date.should be_include(Date.new(2009, 7, 15))
      filtered.date.should be_include(Date.new(2009, 7, 31))
      @df.date.should be_include(Date.new(2009, 7, 1))
    end

    it "should be able to take an array of values to filter with" do
      filtered = @df.filter_by_category(:date => [@d1, @d2])
      filtered.date.should_not be_include(Date.new(2009, 7, 1))
      filtered.date.should be_include(Date.new(2009, 7, 15))
      filtered.date.should be_include(Date.new(2009, 7, 31))
    end

    it "should have a destructive version" do
      @df.filter_by_category!(:date => [@d1, @d2])
      @df.date.should_not be_include(Date.new(2009, 7, 1))
      @df.date.should be_include(Date.new(2009, 7, 15))
      @df.date.should be_include(Date.new(2009, 7, 31))
    end

  end

  context "subset_from_columns" do
    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
    end

    it "should be able to create a subset of columns" do
      new_data_frame = @df.subset_from_columns(:these, :labels)
      new_data_frame.should_not eql(@df)
      new_data_frame.labels.should eql([:these, :labels])
      new_data_frame.items.should eql([[1,4],[5,8]])
      new_data_frame.these.should eql([1,5])
    end
  end

  it "should be able to j_binary_ize! a column, taking its categories and creating a column for each" do
    df = DataFrame.new(:observations)
    df.add [:many]
    df.add [:fine]
    df.add [:things]
    df.add [:are]
    df.add [:available]
    df.j_binary_ize!(:observations)
    df.observations_many.should eql([true, false, false, false, false])
    df.observations_fine.should eql([false, true, false, false, false])
    df.observations_things.should eql([false, false, true, false, false])
    df.observations_are.should eql([false, false, false, true, false])
    df.observations_available.should eql([false, false, false, false, true])
    df.observations.should eql([:many, :fine, :things, :are, :available])
  end

  it "should be able to j_binary_ize! a more normal column" do
    df = DataFrame.new(:observations)
    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    df.observations.add_category(:small) {|e| e <= 3}
    df.observations.add_category(:large) {|e| e >= 3}
    df.j_binary_ize!(:observations)
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, false, true, true, true, false, false, false])
  end

  it "should be able to j_binary_ize with non-adjacent sets (sets that allow a value to have more than one category)" do
    df = DataFrame.new(:observations)
    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    df.observations.add_category(:small) {|e| e <= 3}
    df.observations.add_category(:large) {|e| e >= 3}
    df.j_binary_ize!(:observations, :allow_overlap => true)
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
  end

  it "should be able to hold multiple ideas of a columns categories by resetting the category and re-running j_binary_ize" do
    df = DataFrame.new(:observations)
    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    df.observations.add_category(:small) {|e| e <= 3}
    df.observations.add_category(:large) {|e| e >= 3}
    df.j_binary_ize!(:observations, :allow_overlap => true)
    df.observations.set_categories(:odd => lambda{|e| e.odd?}, :even => lambda{|e| e.even?})
    df.j_binary_ize!(:observations)
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
    df.observations.should eql([1,2,3,4,5,4,3,2,1])
    df.observations_even.should eql([false, true, false, true, false, true, false, true, false])
    df.observations_odd.should eql([true, false, true, false, true, false, true, false, true])
  end

  context "append!" do

    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
    end

    it "should be able to append an array of values to the data frame" do
      @df.append!(:new_column, [5,5])
      @df.new_column.should eql([5,5])
    end

    it "should be able to append a default value to the data frame" do
      @df.append!(:new_column, :value)
      @df.new_column.should eql([:value, :value])
    end

    it "should use nil as the default value" do
      @df.append!(:new_column)
      @df.new_column.should eql([nil, nil])
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
# Covers DataFrame#model: defining named models from a block and reading
# them back through the models container.
describe DataFrame, "model" do
  # Three rows over the columns a, b and c, rebuilt for every example.
  before do
    @csv = %{a,b,c
1,2,3
2,2,2
4,5,6}
    @df = DataFrame.from_csv(@csv)
  end

  it "should be able to define a model with a block" do
    @df.model(:b2) { |params| params.b 2 }

    defined_models = @df.models
    defined_models.table.keys.should eql([:b2])
    defined_models.b2.size.should eql(2)
    defined_models.b2.b.should eql([2,2])
  end

  it "should be able to define a model with a range of values" do
    @df.model(:a12) { |params| params.a(1..2) }
    @df.models.a12.a.should eql([1,2])
  end

  it "should be able to define a model with a set of values" do
    @df.model(:a14) { |params| params.a([1,4]) }
    @df.models.a14.a.should eql([1,4])
  end

end