davidrichards-data_frame 0.0.15 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
# Adds the model methods to the data frame.
class DataFrame

  # Returns the model stored under +name+ if one is already defined.
  # Otherwise, when a block is given, captures the block's parameters with
  # ParameterCapture, filters the data frame with them (using Hash as the
  # cast type for each row) and stores the result under +name+ in the
  # models container, which gives us access like:
  #   df.models.new_model_name...
  # Returns false when the model is not defined and no block is given.
  #
  # String names are converted to symbols, so a model defined with
  # df.model("name") is reachable as df.models.name as well.
  def model(name=nil, &block)
    key = name.is_a?(String) ? name.to_sym : name
    return self.models[key] if self.models.table.key?(key)
    return false unless block
    # A local keeps the capture out of the data frame's own state.
    capture = ParameterCapture.new(&block)
    self.models.table[key] = self.filter(Hash) { |row| capture.filter(row) }
  end

  # The container holding every model defined on this data frame.
  def models
    @models ||= OpenStruct.new
  end

end
@@ -0,0 +1,50 @@
1
# Captures the intent of a model definition in a block. Usage:
#   pc = ParameterCapture.new do |p|
#     p.whatever :some_value
#     p.another :one
#     p.or_list [1, 2]
#     p.or_range (1..2)
#   end
#   pc.parameters
#   => {:whatever => :some_value, :another => :one, :or_list => [1,2], :or_range => (1..2)}
class ParameterCapture
  # Evaluates the block against this instance, so every call made on the
  # block argument is recorded as a parameter. Without a block the capture
  # simply starts out empty.
  def initialize(&block)
    self.instance_eval(&block) if block
  end

  # The container holding the captured parameters.
  def parameters
    @parameters ||= OpenStruct.new
  end

  # Exposes the set keys
  def keys
    self.parameters.table.keys
  end

  # Can be used in a data_frame filter:
  #   @pc.filter(row) Using a Hash as a cast type for the filter.
  # Returns true only when row[key] satisfies every captured parameter:
  # membership for an Array or Range value, === for anything else.
  def filter(row)
    self.keys.all? do |key|
      # Read straight from the table: going through send(key) would hit
      # private Kernel methods for keys such as :format or :select.
      expected = self.parameters.table[key]
      case expected
      when Array, Range
        expected.include?(row[key])
      else
        expected === row[key]
      end
    end
  end

  # Records parameters and reads them back:
  # * a key that is already set returns its stored value
  # * a call with one argument stores that argument
  # * a call with several arguments stores them as an array
  # * a call with no arguments for an unset key returns nil and stores
  #   nothing (storing an empty array would make filter reject every row)
  def method_missing(key, *values, &block)
    table = self.parameters.table
    if table.key?(key)
      table[key]
    elsif values.empty?
      nil
    elsif values.size == 1
      table[key] = values.first
    else
      table[key] = values
    end
  end

  # Only keys that have been set are reported as readable methods.
  def respond_to_missing?(key, include_private = false)
    self.parameters.table.key?(key) || super
  end
end
@@ -0,0 +1,341 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
# Specs for the core DataFrame behaviour: construction, row and column
# access, import/export, filtering, category handling and appending.
describe DataFrame do

  before do
    @labels = [:these, :are, :the, :labels]
    @df = DataFrame.new(*@labels)
  end

  it "should initialize with labels" do
    df = DataFrame.new(*@labels)
    df.labels.should eql(@labels)
  end

  it "should initialize with an empty items list" do
    @df.items.should be_is_a(TransposableArray)
    @df.items.should be_empty
  end

  it "should be able to add an item" do
    item = [1,2,3,4]
    @df.add_item(item)
    @df.items.should eql([item])
  end

  it "should use just_enumerable_stats" do
    [1,2,3].std.should eql(1)
    lambda{[1,2,3].cor([2,3,5])}.should_not raise_error
  end

  context "column and row operations" do
    before do
      @df.add_item([1,2,3,4])
      @df.add_item([5,6,7,8])
      @df.add_item([9,10,11,12])
    end

    it "should have a method for every label, the column in the data frame" do
      @df.these.should eql([1,5,9])
    end

    it "should make columns easily computable" do
      @df.these.std.should eql([1,5,9].std)
    end

    it "should defer unknown methods to the items in the data frame" do
      @df[0].should eql([1,2,3,4])
      @df << [13,14,15,16]
      @df.last.should eql([13,14,15,16])
      @df.map { |e| e.sum }.should eql([10,26,42,58])
    end

    it "should allow optional row labels" do
      @df.row_labels.should eql([])
    end

    it "should have a setter for row labels" do
      @df.row_labels = [:other, :things, :here]
      @df.row_labels.should eql([:other, :things, :here])
    end

    it "should be able to access rows by their labels" do
      @df.row_labels = [:other, :things, :here]
      @df.here.should eql([9,10,11,12])
    end

    it "should make rows easily computable" do
      @df.row_labels = [:other, :things, :here]
      @df.here.sum.should eql(42)
    end
  end

  it "should be able to import more than one row at a time" do
    @df.import([[2,2,2,2],[3,3,3,3],[4,4,4,4]])
    @df.row_labels = [:twos, :threes, :fours]
    @df.twos.should eql([2,2,2,2])
    @df.threes.should eql([3,3,3,3])
    @df.fours.should eql([4,4,4,4])
  end

  context "csv" do
    it "should compute easily from csv" do
      # The data rows are kept flush-left so that no leading whitespace
      # becomes part of the CSV contents.
      contents = %{X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
}
      labels = [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]

      @df = DataFrame.from_csv(contents)
      @df.labels.should eql(labels)
      @df.x.should eql([7,7])
      @df.area.should eql([0,0])
    end
  end

  it "should be able to remove a column" do
    @df = DataFrame.new :twos, :threes, :fours
    @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
    @df.drop!(:twos)
    # each, not all?: every row is an expectation, not a predicate.
    @df.items.each {|i| i.should eql([3,4])}
    @df.labels.should eql([:threes, :fours])
  end

  it "should be able to remove more than one column at a time" do
    @df = DataFrame.new :twos, :threes, :fours
    @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
    @df.drop!(:twos, :fours)
    @df.items.each {|i| i.should eql([3])}
    @df.labels.should eql([:threes])
  end

  it "should offer a hash-like structure of columns" do
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
    @df.columns[:these].should eql([1, 5])
    @df.columns[:are].should eql([2, 6])
    @df.columns[:the].should eql([3, 7])
    @df.columns[:labels].should eql([4, 8])
  end

  it "should alias items with rows" do
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
    @df.rows.should eql(@df.items)
  end

  it "should be able to export a hash" do
    @df.add [1,2,3,4]
    @df.add [5, 6, 7, 8]
    hash = @df.to_hash
    values = [[1,5],[2,6],[3,7],[4,8]]
    hash.keys.size.should eql(@labels.size)
    hash.keys.each {|e| @labels.should be_include(e)}
    hash.values.size.should eql(@labels.size)
    hash.values.each {|e| values.should be_include(e)}
  end

  it "should use variables like labels" do
    @df.labels.should eql(@labels)
    @df.variables.should eql(@labels)
  end

  context "replace!" do
    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
      @doubler = lambda{|e| e * 2}
    end

    it "should only replace columns that actually exist" do
      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
        ArgumentError, /Must provide the name of an existing column./)
      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
    end

    it "should be able to replace a column with a block" do
      @df.replace!(:these) {|e| e * 2}
      @df.these.should eql([2,10])
    end

    it "should be able to replace a column with an array" do
      @a = [5,9]
      @df.replace!(:these, @a)
      @df.these.should eql(@a)
    end
  end

  context "filter!" do
    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
    end

    it "should be able to filter a data frame with a block using an OpenStruct for each row" do
      @df.filter!(:open_struct) {|row| row.these == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to filter a data frame with a block using a Hash for each row" do
      @df.filter!(:hash) {|row| row[:these] == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to filter a data frame with a block using another class that uses the row as input" do
      # A local struct class avoids defining a global constant from
      # inside the spec.
      row_struct = Struct.new(:one, :two, :three, :four)
      @df.filter!(row_struct) {|row| row.one == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to filter a data frame with a block using an array for each row" do
      @df.filter! {|row| row.first == 5}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to do fancy things with the row as the filter" do
      @df.filter! {|row| row.sum > 10}
      @df.items.should eql([[5, 6, 7, 8]])
    end

    it "should be able to generate a new data frame with filter" do
      new_df = @df.filter(:open_struct) {|row| row.these == 5}
      new_df.items.should eql([[5, 6, 7, 8]])
      @df.items.should eql([[1, 2, 3, 4], [5, 6, 7, 8]])
    end

  end

  context "filter_by_category" do

    # Dates are built with Date.new(year, month, day): parsing
    # "07/15/2009"-style strings depends on how the Ruby version orders
    # the day and month fields.
    before do
      @df = DataFrame.new(:weather, :date)

      (1..31).each do |i|
        @df.add [(i % 3 == 1) ? :fair : :good, Date.new(2009, 7, i)]
      end

      @d1 = Date.new(2009, 7, 15)
      @d2 = Date.new(2009, 7, 31)

    end

    it "should be able to filter by category" do
      filtered = @df.filter_by_category(:weather => :good)
      filtered.weather.uniq.should eql([:good])
      @df.weather.uniq.should be_include(:fair)
    end

    it "should be able to manage ranges for filter values" do
      filtered = @df.filter_by_category(:date => (@d1..@d2))
      filtered.date.should_not be_include(Date.new(2009, 7, 1))
      filtered.date.should_not be_include(Date.new(2009, 7, 14))
      filtered.date.should be_include(Date.new(2009, 7, 15))
      filtered.date.should be_include(Date.new(2009, 7, 31))
      @df.date.should be_include(Date.new(2009, 7, 1))
    end

    it "should be able to take an array of values to filter with" do
      filtered = @df.filter_by_category(:date => [@d1, @d2])
      filtered.date.should_not be_include(Date.new(2009, 7, 1))
      filtered.date.should be_include(Date.new(2009, 7, 15))
      filtered.date.should be_include(Date.new(2009, 7, 31))
    end

    it "should have a destructive version" do
      @df.filter_by_category!(:date => [@d1, @d2])
      @df.date.should_not be_include(Date.new(2009, 7, 1))
      @df.date.should be_include(Date.new(2009, 7, 15))
      @df.date.should be_include(Date.new(2009, 7, 31))
    end

  end

  context "subset_from_columns" do
    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
    end

    it "should be able to create a subset of columns" do
      new_data_frame = @df.subset_from_columns(:these, :labels)
      new_data_frame.should_not eql(@df)
      new_data_frame.labels.should eql([:these, :labels])
      new_data_frame.items.should eql([[1,4],[5,8]])
      new_data_frame.these.should eql([1,5])
    end
  end

  it "should be able to j_binary_ize! a column, taking its categories and creating a column for each" do
    df = DataFrame.new(:observations)
    df.add [:many]
    df.add [:fine]
    df.add [:things]
    df.add [:are]
    df.add [:available]
    df.j_binary_ize!(:observations)
    df.observations_many.should eql([true, false, false, false, false])
    df.observations_fine.should eql([false, true, false, false, false])
    df.observations_things.should eql([false, false, true, false, false])
    df.observations_are.should eql([false, false, false, true, false])
    df.observations_available.should eql([false, false, false, false, true])
    df.observations.should eql([:many, :fine, :things, :are, :available])
  end

  it "should be able to j_binary_ize! a more normal column" do
    df = DataFrame.new(:observations)
    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    df.observations.add_category(:small) {|e| e <= 3}
    df.observations.add_category(:large) {|e| e >= 3}
    df.j_binary_ize!(:observations)
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, false, true, true, true, false, false, false])
  end

  it "should be able to j_binary_ize with non-adjacent sets (sets that allow a value to have more than one category)" do
    df = DataFrame.new(:observations)
    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    df.observations.add_category(:small) {|e| e <= 3}
    df.observations.add_category(:large) {|e| e >= 3}
    df.j_binary_ize!(:observations, :allow_overlap => true)
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
  end

  it "should be able to hold multiple ideas of a columns categories by resetting the category and re-running j_binary_ize" do
    df = DataFrame.new(:observations)
    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
    df.observations.add_category(:small) {|e| e <= 3}
    df.observations.add_category(:large) {|e| e >= 3}
    df.j_binary_ize!(:observations, :allow_overlap => true)
    df.observations.set_categories(:odd => lambda{|e| e.odd?}, :even => lambda{|e| e.even?})
    df.j_binary_ize!(:observations)
    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
    df.observations.should eql([1,2,3,4,5,4,3,2,1])
    df.observations_even.should eql([false, true, false, true, false, true, false, true, false])
    df.observations_odd.should eql([true, false, true, false, true, false, true, false, true])
  end

  context "append!" do

    before do
      @df.add [1,2,3,4]
      @df.add [5, 6, 7, 8]
    end

    it "should be able to append an array of values to the data frame" do
      @df.append!(:new_column, [5,5])
      @df.new_column.should eql([5,5])
    end

    it "should be able to append a default value to the data frame" do
      @df.append!(:new_column, :value)
      @df.new_column.should eql([:value, :value])
    end

    it "should use nil as the default value" do
      @df.append!(:new_column)
      @df.new_column.should eql([nil, nil])
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require File.join(File.dirname(__FILE__), "/../spec_helper")
2
+
3
# Specs for DataFrame#model: defining named models from a parameter block.
describe DataFrame, "model" do
  before do
    # Three data rows under the header a,b,c.
    @csv = %{a,b,c
1,2,3
2,2,2
4,5,6}
    @df = DataFrame.from_csv(@csv)
  end

  it "should be able to define a model with a block" do
    @df.model(:b2) { |params| params.b(2) }

    defined_models = @df.models
    defined_models.table.keys.should eql([:b2])
    defined_models.b2.size.should eql(2)
    defined_models.b2.b.should eql([2,2])
  end

  it "should be able to define a model with a range of values" do
    @df.model(:a12) { |params| params.a(1..2) }
    @df.models.a12.a.should eql([1,2])
  end

  it "should be able to define a model with a set of values" do
    @df.model(:a14) { |params| params.a([1,4]) }
    @df.models.a14.a.should eql([1,4])
  end

end