davidrichards-data_frame 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +27 -0
- data/VERSION.yml +1 -1
- data/lib/data_frame.rb +104 -2
- data/lib/ext/open_struct.rb +5 -0
- data/spec/data_frame_spec.rb +60 -0
- metadata +3 -2
data/README.rdoc
CHANGED
@@ -48,6 +48,33 @@ To get your feet wet, you may want to play with data sets found here:
|
|
48
48
|
|
49
49
|
http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html
|
50
50
|
|
51
|
+
== Transformations
|
52
|
+
|
53
|
+
A lot of the work in the data frame is to transform the actual table. You may need to drop columns, filter results, replace values in a column or create a new data frame based on the existing one. Here's how to do that:
|
54
|
+
|
55
|
+
> df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
|
56
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
57
|
+
> df.drop!(:ffmc)
|
58
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
59
|
+
> df.drop!(:dmc, :dc, :isi, :rh)
|
60
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
|
61
|
+
> df.x
|
62
|
+
# => [7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6,...]
|
63
|
+
> df.replace!(:x) {|e| e * 3}
|
64
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
|
65
|
+
> df.x
|
66
|
+
# => [21, 21, 21, 24, 24, 24, 24, 24, 24, 21, 21, 21, 18, 18, 18,...]
|
67
|
+
> df.filter!(:open_struct) {|row| row.x == 24}
|
68
|
+
# => DataFrame rows: 61 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
|
69
|
+
> df.x
|
70
|
+
# => [24, 24, 24, 24, 24, 24, 24, 24, 24,...]
|
71
|
+
> new_data_frame = df.subset_from_columns(:x, :y)
|
72
|
+
# => DataFrame rows: 61 labels: [:x, :y]
|
73
|
+
> new_data_frame.items
|
74
|
+
# => [[24, 6], [24, 6], [24, 6], [24, 6], ...]
|
75
|
+
|
76
|
+
|
77
|
+
Note: most of these transformations are not optimized. I'll work with things for a while before I try to optimize this library. However, I should say that I've used some fairly large data sets (thousands of rows) and have been fine with things so far.
|
51
78
|
|
52
79
|
==Installation
|
53
80
|
|
data/VERSION.yml
CHANGED
data/lib/data_frame.rb
CHANGED
@@ -3,6 +3,7 @@ require 'activesupport'
|
|
3
3
|
require 'just_enumerable_stats'
|
4
4
|
require 'open-uri'
|
5
5
|
require 'fastercsv'
|
6
|
+
require 'ostruct'
|
6
7
|
|
7
8
|
# Use a Dictionary if available
|
8
9
|
begin
|
@@ -57,6 +58,9 @@ class DataFrame
|
|
57
58
|
return nil unless contents
|
58
59
|
table = FCSV.parse(contents, default_csv_opts.merge(opts))
|
59
60
|
labels = table.shift
|
61
|
+
while table.last.empty?
|
62
|
+
table.pop
|
63
|
+
end
|
60
64
|
[labels, table]
|
61
65
|
end
|
62
66
|
|
@@ -71,6 +75,10 @@ class DataFrame
|
|
71
75
|
end
|
72
76
|
end
|
73
77
|
|
78
|
+
def inspect
|
79
|
+
"DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
|
80
|
+
end
|
81
|
+
|
74
82
|
# The labels of the data items
|
75
83
|
attr_reader :labels
|
76
84
|
alias :variables :labels
|
@@ -142,14 +150,108 @@ class DataFrame
|
|
142
150
|
end
|
143
151
|
end
|
144
152
|
|
145
|
-
def drop!(label)
|
153
|
+
def drop!(*labels)
|
154
|
+
labels.each do |label|
|
155
|
+
drop_one!(label)
|
156
|
+
end
|
157
|
+
self
|
158
|
+
end
|
159
|
+
|
160
|
+
def drop_one!(label)
|
146
161
|
i = self.labels.index(label)
|
147
162
|
return nil unless i
|
148
163
|
self.items.each do |item|
|
149
164
|
item.delete_at(i)
|
150
165
|
end
|
151
166
|
self.labels.delete_at(i)
|
152
|
-
|
167
|
+
self
|
168
|
+
end
|
169
|
+
protected :drop_one!
|
170
|
+
|
171
|
+
def replace!(column, values=nil, &block)
|
172
|
+
column = validate_column(column)
|
173
|
+
if not values
|
174
|
+
values = self.send(column)
|
175
|
+
values.map! {|e| block.call(e)}
|
176
|
+
end
|
177
|
+
replace_column(column, values)
|
178
|
+
self
|
179
|
+
end
|
180
|
+
|
181
|
+
def replace_column(column, values)
|
182
|
+
column = validate_column(column)
|
183
|
+
index = self.labels.index(column)
|
184
|
+
list = []
|
185
|
+
self.items.each_with_index do |item, i|
|
186
|
+
consolidated = item
|
187
|
+
consolidated[index] = values[i]
|
188
|
+
list << consolidated
|
189
|
+
end
|
190
|
+
@items = list.dup
|
191
|
+
end
|
192
|
+
protected :replace_column
|
193
|
+
|
194
|
+
def validate_column(column)
|
195
|
+
column = column.to_sym
|
196
|
+
raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
|
197
|
+
column
|
198
|
+
end
|
199
|
+
protected :validate_column
|
200
|
+
|
201
|
+
# Takes a block to evaluate on each row. The row can be converted into
|
202
|
+
# an OpenStruct or a Hash for easier filter methods. Note, don't try this
|
203
|
+
# with a hash or open struct unless you have facets available.
|
204
|
+
def filter!(as=Array, &block)
|
205
|
+
as = infer_class(as)
|
206
|
+
items = []
|
207
|
+
self.items.each do |row|
|
208
|
+
value = block.call(cast_row(row, as))
|
209
|
+
items << row if value
|
210
|
+
end
|
211
|
+
@items = items.dup
|
212
|
+
self
|
213
|
+
end
|
214
|
+
|
215
|
+
def infer_class(obj)
|
216
|
+
obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
|
217
|
+
obj = obj.classify.constantize if obj.is_a?(String)
|
218
|
+
obj
|
219
|
+
end
|
220
|
+
protected :infer_class
|
221
|
+
|
222
|
+
def cast_row(row, as)
|
223
|
+
if as == Hash
|
224
|
+
obj = {}
|
225
|
+
self.labels.each_with_index do |label, i|
|
226
|
+
obj[label] = row[i]
|
227
|
+
end
|
228
|
+
obj
|
229
|
+
elsif as == OpenStruct
|
230
|
+
obj = OpenStruct.new
|
231
|
+
self.labels.each_with_index do |label, i|
|
232
|
+
obj.table[label] = row[i]
|
233
|
+
end
|
234
|
+
obj
|
235
|
+
elsif as == Array
|
236
|
+
row
|
237
|
+
else
|
238
|
+
as.new(row)
|
239
|
+
end
|
240
|
+
end
|
241
|
+
protected :cast_row
|
242
|
+
|
243
|
+
# Creates a new data frame, only with the specified columns.
|
244
|
+
def subset_from_columns(*cols)
|
245
|
+
new_labels = self.labels.inject([]) do |list, label|
|
246
|
+
list << label if cols.include?(label)
|
247
|
+
list
|
248
|
+
end
|
249
|
+
new_data_frame = DataFrame.new(*self.labels)
|
250
|
+
new_data_frame.import(self.items)
|
251
|
+
self.labels.each do |label|
|
252
|
+
new_data_frame.drop!(label) unless new_labels.include?(label)
|
253
|
+
end
|
254
|
+
new_data_frame
|
153
255
|
end
|
154
256
|
|
155
257
|
end
|
data/spec/data_frame_spec.rb
CHANGED
@@ -101,6 +101,14 @@ describe DataFrame do
|
|
101
101
|
@df.labels.should eql([:threes, :fours])
|
102
102
|
end
|
103
103
|
|
104
|
+
it "should be able to remove more than one column at a time" do
|
105
|
+
@df = DataFrame.new :twos, :threes, :fours
|
106
|
+
@df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
|
107
|
+
@df.drop!(:twos, :fours)
|
108
|
+
@df.items.all? {|i| i.should eql([3])}
|
109
|
+
@df.labels.should eql([:threes])
|
110
|
+
end
|
111
|
+
|
104
112
|
it "should offer a hash-like structure of columns" do
|
105
113
|
@df.add [1,2,3,4]
|
106
114
|
@df.add [5, 6, 7, 8]
|
@@ -131,4 +139,56 @@ describe DataFrame do
|
|
131
139
|
@df.labels.should eql(@labels)
|
132
140
|
@df.variables.should eql(@labels)
|
133
141
|
end
|
142
|
+
|
143
|
+
context "replace!" do
|
144
|
+
before do
|
145
|
+
@df.add [1,2,3,4]
|
146
|
+
@df.add [5, 6, 7, 8]
|
147
|
+
@doubler = lambda{|e| e * 2}
|
148
|
+
end
|
149
|
+
|
150
|
+
it "should only replace columns that actually exist" do
|
151
|
+
lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
|
152
|
+
ArgumentError, /Must provide the name of an existing column./)
|
153
|
+
lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
|
154
|
+
end
|
155
|
+
|
156
|
+
it "should be able to replace a column with a block" do
|
157
|
+
@df.replace!(:these) {|e| e * 2}
|
158
|
+
@df.these.should eql([2,10])
|
159
|
+
end
|
160
|
+
|
161
|
+
it "should be able to replace a column with an array" do
|
162
|
+
@a = [5,9]
|
163
|
+
@df.replace!(:these, @a)
|
164
|
+
@df.these.should eql(@a)
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
context "filter!" do
|
169
|
+
before do
|
170
|
+
@df.add [1,2,3,4]
|
171
|
+
@df.add [5, 6, 7, 8]
|
172
|
+
end
|
173
|
+
|
174
|
+
it "should be able to filter a data frame with a block" do
|
175
|
+
@df.filter!(:open_struct) {|row| row.these == 5}
|
176
|
+
@df.items.should eql([[5, 6, 7, 8]])
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
context "subset_from_columns" do
|
181
|
+
before do
|
182
|
+
@df.add [1,2,3,4]
|
183
|
+
@df.add [5, 6, 7, 8]
|
184
|
+
end
|
185
|
+
|
186
|
+
it "should be able to create a subset of columns" do
|
187
|
+
new_data_frame = @df.subset_from_columns(:these, :labels)
|
188
|
+
new_data_frame.should_not eql(@df)
|
189
|
+
new_data_frame.labels.should eql([:these, :labels])
|
190
|
+
new_data_frame.items.should eql([[1,4],[5,8]])
|
191
|
+
new_data_frame.these.should eql([1,5])
|
192
|
+
end
|
193
|
+
end
|
134
194
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: davidrichards-data_frame
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.12
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Richards
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-08-
|
12
|
+
date: 2009-08-16 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -58,6 +58,7 @@ files:
|
|
58
58
|
- lib/data_frame/transposable_array.rb
|
59
59
|
- lib/data_frame.rb
|
60
60
|
- lib/ext
|
61
|
+
- lib/ext/open_struct.rb
|
61
62
|
- lib/ext/string.rb
|
62
63
|
- lib/ext/symbol.rb
|
63
64
|
- spec/data_frame
|