davidrichards-data_frame 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +16 -0
- data/VERSION.yml +1 -1
- data/bin/plain_frame +22 -0
- data/lib/data_frame.rb +2 -1
- data/lib/data_frame/arff.rb +43 -36
- data/lib/data_frame/core/column_management.rb +102 -0
- data/lib/data_frame/core/filter.rb +48 -0
- data/lib/data_frame/core/import.rb +112 -0
- data/lib/data_frame/core/pre_process.rb +61 -0
- data/lib/data_frame/core/saving.rb +29 -0
- data/lib/data_frame/core/training.rb +36 -0
- data/lib/data_frame/data_frame.rb +37 -241
- data/lib/data_frame/id3.rb +28 -0
- data/lib/data_frame/kmeans.rb +10 -0
- data/lib/data_frame/labels_from_uci.rb +48 -0
- data/lib/data_frame/mlp.rb +18 -0
- data/lib/data_frame/sbn.rb +18 -0
- data/lib/data_frame/transposable_array.rb +1 -1
- data/lib/ext/array.rb +11 -0
- data/spec/data_frame/arff_spec.rb +1 -0
- data/spec/data_frame/core/column_management_spec.rb +97 -0
- data/spec/data_frame/core/filter_spec.rb +88 -0
- data/spec/data_frame/core/import_spec.rb +41 -0
- data/spec/data_frame/core/pre_process_spec.rb +71 -0
- data/spec/data_frame/core/saving_spec.rb +61 -0
- data/spec/data_frame/core/training_spec.rb +51 -0
- data/spec/data_frame/data_frame_spec.rb +10 -226
- data/spec/data_frame/id3_spec.rb +22 -0
- data/spec/ext/array_spec.rb +13 -0
- data/spec/fixtures/discrete_testing.csv +4 -0
- data/spec/fixtures/discrete_training.csv +21 -0
- metadata +33 -6
@@ -21,12 +21,6 @@ describe DataFrame do
|
|
21
21
|
@df.items.should be_empty
|
22
22
|
end
|
23
23
|
|
24
|
-
it "should be able to add an item" do
|
25
|
-
item = [1,2,3,4]
|
26
|
-
@df.add_item(item)
|
27
|
-
@df.items.should eql([item])
|
28
|
-
end
|
29
|
-
|
30
24
|
it "should use just_enumerable_stats" do
|
31
25
|
[1,2,3].std.should eql(1)
|
32
26
|
lambda{[1,2,3].cor([2,3,5])}.should_not raise_error
|
@@ -74,12 +68,15 @@ describe DataFrame do
|
|
74
68
|
end
|
75
69
|
end
|
76
70
|
|
77
|
-
it "should be able to
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
@
|
71
|
+
it "should be able to initialize from an array" do
|
72
|
+
contents = %{7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
|
73
|
+
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
|
74
|
+
}
|
75
|
+
|
76
|
+
@labels = [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
77
|
+
@df = DataFrame.new(@labels)
|
78
|
+
@df.import(contents)
|
79
|
+
@df.labels.should eql(@labels)
|
83
80
|
end
|
84
81
|
|
85
82
|
context "csv" do
|
@@ -89,7 +86,7 @@ describe DataFrame do
|
|
89
86
|
7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
|
90
87
|
}
|
91
88
|
labels = [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
92
|
-
|
89
|
+
|
93
90
|
@df = DataFrame.from_csv(contents)
|
94
91
|
@df.labels.should eql(labels)
|
95
92
|
@df.x.should eql([7,7])
|
@@ -109,22 +106,6 @@ describe DataFrame do
|
|
109
106
|
end
|
110
107
|
end
|
111
108
|
|
112
|
-
it "should be able to remove a column" do
|
113
|
-
@df = DataFrame.new :twos, :threes, :fours
|
114
|
-
@df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
|
115
|
-
@df.drop!(:twos)
|
116
|
-
@df.items.all? {|i| i.should eql([3,4])}
|
117
|
-
@df.labels.should eql([:threes, :fours])
|
118
|
-
end
|
119
|
-
|
120
|
-
it "should be able to remove more than one column at a time" do
|
121
|
-
@df = DataFrame.new :twos, :threes, :fours
|
122
|
-
@df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
|
123
|
-
@df.drop!(:twos, :fours)
|
124
|
-
@df.items.all? {|i| i.should eql([3])}
|
125
|
-
@df.labels.should eql([:threes])
|
126
|
-
end
|
127
|
-
|
128
109
|
it "should offer a hash-like structure of columns" do
|
129
110
|
@df.add [1,2,3,4]
|
130
111
|
@df.add [5, 6, 7, 8]
|
@@ -156,202 +137,5 @@ describe DataFrame do
|
|
156
137
|
@df.variables.should eql(@labels)
|
157
138
|
end
|
158
139
|
|
159
|
-
context "replace!" do
|
160
|
-
before do
|
161
|
-
@df.add [1,2,3,4]
|
162
|
-
@df.add [5, 6, 7, 8]
|
163
|
-
@doubler = lambda{|e| e * 2}
|
164
|
-
end
|
165
|
-
|
166
|
-
it "should only replace columns that actually exist" do
|
167
|
-
lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
|
168
|
-
ArgumentError, /Must provide the name of an existing column./)
|
169
|
-
lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
|
170
|
-
end
|
171
|
-
|
172
|
-
it "should be able to replace a column with a block" do
|
173
|
-
@df.replace!(:these) {|e| e * 2}
|
174
|
-
@df.these.should eql([2,10])
|
175
|
-
end
|
176
|
-
|
177
|
-
it "should be able to replace a column with an array" do
|
178
|
-
@a = [5,9]
|
179
|
-
@df.replace!(:these, @a)
|
180
|
-
@df.these.should eql(@a)
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
context "filter!" do
|
185
|
-
before do
|
186
|
-
@df.add [1,2,3,4]
|
187
|
-
@df.add [5, 6, 7, 8]
|
188
|
-
end
|
189
|
-
|
190
|
-
it "should be able to filter a data frame with a block using an OpenStruct for each row" do
|
191
|
-
@df.filter!(:open_struct) {|row| row.these == 5}
|
192
|
-
@df.items.should eql([[5, 6, 7, 8]])
|
193
|
-
end
|
194
|
-
|
195
|
-
it "should be able to filter a data frame with a block using a Hash for each row" do
|
196
|
-
@df.filter!(:hash) {|row| row[:these] == 5}
|
197
|
-
@df.items.should eql([[5, 6, 7, 8]])
|
198
|
-
end
|
199
|
-
|
200
|
-
S4 = Struct.new(:one, :two, :three, :four)
|
201
|
-
it "should be able to filter a data frame with a block using another class that uses the row as input" do
|
202
|
-
@df.filter!(S4) {|row| row.one == 5}
|
203
|
-
@df.items.should eql([[5, 6, 7, 8]])
|
204
|
-
end
|
205
|
-
|
206
|
-
it "should be able to filter a data frame with a block using an array for each row" do
|
207
|
-
@df.filter! {|row| row.first == 5}
|
208
|
-
@df.items.should eql([[5, 6, 7, 8]])
|
209
|
-
end
|
210
|
-
|
211
|
-
it "should be able to do fancy things with the row as the filter" do
|
212
|
-
@df.filter! {|row| row.sum > 10}
|
213
|
-
@df.items.should eql([[5, 6, 7, 8]])
|
214
|
-
end
|
215
|
-
|
216
|
-
it "should be able to generate a new data frame with filter" do
|
217
|
-
new_df = @df.filter(:open_struct) {|row| row.these == 5}
|
218
|
-
new_df.items.should eql([[5, 6, 7, 8]])
|
219
|
-
@df.items.should eql([[1, 2, 3, 4], [5, 6, 7, 8]])
|
220
|
-
end
|
221
|
-
|
222
|
-
end
|
223
|
-
|
224
|
-
context "filter_by_category" do
|
225
|
-
|
226
|
-
before do
|
227
|
-
@df = DataFrame.new(:weather, :date)
|
228
|
-
|
229
|
-
(1..31).each do |i|
|
230
|
-
@df.add [(i % 3 == 1) ? :fair : :good, Date.parse("07/#{i}/2009")]
|
231
|
-
end
|
232
|
-
|
233
|
-
@d1 = Date.parse("07/15/2009")
|
234
|
-
@d2 = Date.parse("07/31/2009")
|
235
|
-
|
236
|
-
end
|
237
|
-
|
238
|
-
it "should be able to filter by category" do
|
239
|
-
filtered = @df.filter_by_category(:weather => :good)
|
240
|
-
filtered.weather.uniq.should eql([:good])
|
241
|
-
@df.weather.uniq.should be_include(:fair)
|
242
|
-
end
|
243
|
-
|
244
|
-
it "should be able to manage ranges for filter values" do
|
245
|
-
filtered = @df.filter_by_category(:date => (@d1..@d2))
|
246
|
-
filtered.date.should_not be_include(Date.parse("07/01/2009"))
|
247
|
-
filtered.date.should_not be_include(Date.parse("07/14/2009"))
|
248
|
-
filtered.date.should be_include(Date.parse("07/15/2009"))
|
249
|
-
filtered.date.should be_include(Date.parse("07/31/2009"))
|
250
|
-
@df.date.should be_include(Date.parse("07/01/2009"))
|
251
|
-
end
|
252
|
-
|
253
|
-
it "should be able to take an array of values to filter with" do
|
254
|
-
filtered = @df.filter_by_category(:date => [@d1, @d2])
|
255
|
-
filtered.date.should_not be_include(Date.parse("07/01/2009"))
|
256
|
-
filtered.date.should be_include(Date.parse("07/15/2009"))
|
257
|
-
filtered.date.should be_include(Date.parse("07/31/2009"))
|
258
|
-
end
|
259
|
-
|
260
|
-
it "should have a destructive version" do
|
261
|
-
@df.filter_by_category!(:date => [@d1, @d2])
|
262
|
-
@df.date.should_not be_include(Date.parse("07/01/2009"))
|
263
|
-
@df.date.should be_include(Date.parse("07/15/2009"))
|
264
|
-
@df.date.should be_include(Date.parse("07/31/2009"))
|
265
|
-
end
|
266
|
-
|
267
|
-
end
|
268
|
-
|
269
|
-
context "subset_from_columns" do
|
270
|
-
before do
|
271
|
-
@df.add [1,2,3,4]
|
272
|
-
@df.add [5, 6, 7, 8]
|
273
|
-
end
|
274
|
-
|
275
|
-
it "should be able to create a subset of columns" do
|
276
|
-
new_data_frame = @df.subset_from_columns(:these, :labels)
|
277
|
-
new_data_frame.should_not eql(@df)
|
278
|
-
new_data_frame.labels.should eql([:these, :labels])
|
279
|
-
new_data_frame.items.should eql([[1,4],[5,8]])
|
280
|
-
new_data_frame.these.should eql([1,5])
|
281
|
-
end
|
282
|
-
end
|
283
|
-
|
284
|
-
it "should be able to j_binary_ize! a column, taking its categories and creating a column for each" do
|
285
|
-
df = DataFrame.new(:observations)
|
286
|
-
df.add [:many]
|
287
|
-
df.add [:fine]
|
288
|
-
df.add [:things]
|
289
|
-
df.add [:are]
|
290
|
-
df.add [:available]
|
291
|
-
df.j_binary_ize!(:observations)
|
292
|
-
df.observations_many.should eql([true, false, false, false, false])
|
293
|
-
df.observations_fine.should eql([false, true, false, false, false])
|
294
|
-
df.observations_things.should eql([false, false, true, false, false])
|
295
|
-
df.observations_are.should eql([false, false, false, true, false])
|
296
|
-
df.observations_available.should eql([false, false, false, false, true])
|
297
|
-
df.observations.should eql([:many, :fine, :things, :are, :available])
|
298
|
-
end
|
299
|
-
|
300
|
-
it "should be able to j_binary_ize! a more normal column" do
|
301
|
-
df = DataFrame.new(:observations)
|
302
|
-
df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
|
303
|
-
df.observations.add_category(:small) {|e| e <= 3}
|
304
|
-
df.observations.add_category(:large) {|e| e >= 3}
|
305
|
-
df.j_binary_ize!(:observations)
|
306
|
-
df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
|
307
|
-
df.observations_large.should eql([false, false, false, true, true, true, false, false, false])
|
308
|
-
end
|
309
140
|
|
310
|
-
it "should be able to j_binary_ize with non-adjacent sets (sets that allow a value to have more than one category)" do
|
311
|
-
df = DataFrame.new(:observations)
|
312
|
-
df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
|
313
|
-
df.observations.add_category(:small) {|e| e <= 3}
|
314
|
-
df.observations.add_category(:large) {|e| e >= 3}
|
315
|
-
df.j_binary_ize!(:observations, :allow_overlap => true)
|
316
|
-
df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
|
317
|
-
df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
|
318
|
-
end
|
319
|
-
|
320
|
-
it "should be able to hold multiple ideas of a columns categories by resetting the category and re-running j_binary_ize" do
|
321
|
-
df = DataFrame.new(:observations)
|
322
|
-
df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
|
323
|
-
df.observations.add_category(:small) {|e| e <= 3}
|
324
|
-
df.observations.add_category(:large) {|e| e >= 3}
|
325
|
-
df.j_binary_ize!(:observations, :allow_overlap => true)
|
326
|
-
df.observations.set_categories(:odd => lambda{|e| e.odd?}, :even => lambda{|e| e.even?})
|
327
|
-
df.j_binary_ize!(:observations)
|
328
|
-
df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
|
329
|
-
df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
|
330
|
-
df.observations.should eql([1,2,3,4,5,4,3,2,1])
|
331
|
-
df.observations_even.should eql([false, true, false, true, false, true, false, true, false])
|
332
|
-
df.observations_odd.should eql([true, false, true, false, true, false, true, false, true])
|
333
|
-
end
|
334
|
-
|
335
|
-
context "append!" do
|
336
|
-
|
337
|
-
before do
|
338
|
-
@df.add [1,2,3,4]
|
339
|
-
@df.add [5, 6, 7, 8]
|
340
|
-
end
|
341
|
-
|
342
|
-
it "should be able to append an array of values to the data frame" do
|
343
|
-
@df.append!(:new_column, [5,5])
|
344
|
-
@df.new_column.should eql([5,5])
|
345
|
-
end
|
346
|
-
|
347
|
-
it "should be able to append a default value to the data frame" do
|
348
|
-
@df.append!(:new_column, :value)
|
349
|
-
@df.new_column.should eql([:value, :value])
|
350
|
-
end
|
351
|
-
|
352
|
-
it "should use nil as the default value" do
|
353
|
-
@df.append!(:new_column)
|
354
|
-
@df.new_column.should eql([nil, nil])
|
355
|
-
end
|
356
|
-
end
|
357
141
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
require 'data_frame/id3'
|
3
|
+
|
4
|
+
describe "DecisionTree" do
|
5
|
+
|
6
|
+
before do
|
7
|
+
@filename = File.expand_path(File.join(File.dirname(__FILE__), "../fixtures/discrete_training.csv"))
|
8
|
+
@df = DataFrame.from_csv(@filename)
|
9
|
+
@test_data = File.read(@filename)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should require the decisiontree gem" do
|
13
|
+
defined?(DecisionTree::ID3Tree).should eql('constant')
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should be able to create a decision tree from a data frame" do
|
17
|
+
# Come back to this.
|
18
|
+
# @df.create_id3(:purchase)
|
19
|
+
# @df.id3.train
|
20
|
+
# @df.id3.predict(["36 - 55", "masters", "high", "single", 1]).should eql(1)
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), "/../spec_helper")
|
2
|
+
|
3
|
+
describe Array do
|
4
|
+
it "should be able to determine its dimensions" do
|
5
|
+
[1,2,3].dimensions.should eql(1)
|
6
|
+
[[1,2,3], [1,2,3]].dimensions.should eql(2)
|
7
|
+
[[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3], [[1,2,3], [1,2,3]]]].dimensions.should eql(3)
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should depend on the first element to determine dimensions" do
|
11
|
+
[1, [1,2]].dimensions.should eql(1)
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
Age,Education,Income,Marital Status,Purchase
|
2
|
+
36 - 55,masters,high,single,will buy
|
3
|
+
18 - 35,high school,low,single,won't buy
|
4
|
+
36 - 55,masters,low,single,will buy
|
5
|
+
18 - 35,bachelors,high,single,won't buy
|
6
|
+
< 18,high school,low,single,will buy
|
7
|
+
18 - 35,bachelors,high,married,won't buy
|
8
|
+
36 - 55,bachelors,low,married,won't buy
|
9
|
+
> 55,bachelors,high,single,will buy
|
10
|
+
36 - 55,masters,low,married,won't buy
|
11
|
+
> 55,masters,low,married,will buy
|
12
|
+
36 - 55,masters,high,single,will buy
|
13
|
+
> 55,masters,high,single,will buy
|
14
|
+
< 18,high school,high,single,won't buy
|
15
|
+
36 - 55,masters,low,single,will buy
|
16
|
+
36 - 55,high school,low,single,will buy
|
17
|
+
< 18,high school,low,married,will buy
|
18
|
+
18 - 35,bachelors,high,married,won't buy
|
19
|
+
> 55,high school,high,married,will buy
|
20
|
+
> 55,bachelors,low,single,will buy
|
21
|
+
36 - 55,high school,high,married,won't buy
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: davidrichards-data_frame
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.18
|
4
|
+
version: 0.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Richards
|
@@ -9,8 +9,8 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-09-
|
13
|
-
default_executable:
|
12
|
+
date: 2009-09-24 00:00:00 -07:00
|
13
|
+
default_executable: plain_frame
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: activesupport
|
@@ -44,8 +44,8 @@ dependencies:
|
|
44
44
|
version:
|
45
45
|
description: Data Frames with memoized transpose
|
46
46
|
email: davidlamontrichards@gmail.com
|
47
|
-
executables:
|
48
|
-
|
47
|
+
executables:
|
48
|
+
- plain_frame
|
49
49
|
extensions: []
|
50
50
|
|
51
51
|
extra_rdoc_files: []
|
@@ -53,31 +53,58 @@ extra_rdoc_files: []
|
|
53
53
|
files:
|
54
54
|
- README.rdoc
|
55
55
|
- VERSION.yml
|
56
|
+
- bin/plain_frame
|
56
57
|
- lib/data_frame
|
57
58
|
- lib/data_frame/arff.rb
|
58
59
|
- lib/data_frame/callback_array.rb
|
60
|
+
- lib/data_frame/core
|
61
|
+
- lib/data_frame/core/column_management.rb
|
62
|
+
- lib/data_frame/core/filter.rb
|
63
|
+
- lib/data_frame/core/import.rb
|
64
|
+
- lib/data_frame/core/pre_process.rb
|
65
|
+
- lib/data_frame/core/saving.rb
|
66
|
+
- lib/data_frame/core/training.rb
|
59
67
|
- lib/data_frame/data_frame.rb
|
68
|
+
- lib/data_frame/id3.rb
|
69
|
+
- lib/data_frame/kmeans.rb
|
70
|
+
- lib/data_frame/labels_from_uci.rb
|
71
|
+
- lib/data_frame/mlp.rb
|
60
72
|
- lib/data_frame/model.rb
|
61
73
|
- lib/data_frame/parameter_capture.rb
|
74
|
+
- lib/data_frame/sbn.rb
|
62
75
|
- lib/data_frame/transposable_array.rb
|
63
76
|
- lib/data_frame.rb
|
64
77
|
- lib/ext
|
78
|
+
- lib/ext/array.rb
|
65
79
|
- lib/ext/open_struct.rb
|
66
80
|
- lib/ext/string.rb
|
67
81
|
- lib/ext/symbol.rb
|
68
82
|
- spec/data_frame
|
69
83
|
- spec/data_frame/arff_spec.rb
|
70
84
|
- spec/data_frame/callback_array_spec.rb
|
85
|
+
- spec/data_frame/core
|
86
|
+
- spec/data_frame/core/column_management_spec.rb
|
87
|
+
- spec/data_frame/core/filter_spec.rb
|
88
|
+
- spec/data_frame/core/import_spec.rb
|
89
|
+
- spec/data_frame/core/pre_process_spec.rb
|
90
|
+
- spec/data_frame/core/saving_spec.rb
|
91
|
+
- spec/data_frame/core/training_spec.rb
|
71
92
|
- spec/data_frame/data_frame_spec.rb
|
93
|
+
- spec/data_frame/id3_spec.rb
|
72
94
|
- spec/data_frame/model_spec.rb
|
73
95
|
- spec/data_frame/parameter_capture_spec.rb
|
74
96
|
- spec/data_frame/transposable_array_spec.rb
|
75
97
|
- spec/data_frame_spec.rb
|
98
|
+
- spec/ext
|
99
|
+
- spec/ext/array_spec.rb
|
76
100
|
- spec/fixtures
|
77
101
|
- spec/fixtures/basic.csv
|
102
|
+
- spec/fixtures/discrete_testing.csv
|
103
|
+
- spec/fixtures/discrete_training.csv
|
78
104
|
- spec/spec_helper.rb
|
79
105
|
has_rdoc: true
|
80
106
|
homepage: http://github.com/davidrichards/data_frame
|
107
|
+
licenses:
|
81
108
|
post_install_message:
|
82
109
|
rdoc_options:
|
83
110
|
- --inline-source
|
@@ -99,7 +126,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
126
|
requirements: []
|
100
127
|
|
101
128
|
rubyforge_project:
|
102
|
-
rubygems_version: 1.
|
129
|
+
rubygems_version: 1.3.5
|
103
130
|
signing_key:
|
104
131
|
specification_version: 2
|
105
132
|
summary: Data Frames with memoized transpose
|