davidrichards-data_frame 0.0.12 → 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -48,6 +48,33 @@ To get your feet wet, you may want to play with data sets found here:
48
48
 
49
49
  http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html
50
50
 
51
+ == Transformations
52
+
53
+ Much of the work with a data frame is transforming the underlying table: dropping columns, filtering rows, replacing the values in a column, or creating a new data frame based on an existing one. Here's how to do that:
54
+
55
+ > df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
56
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
57
+ > df.drop!(:ffmc)
58
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
59
+ > df.drop!(:dmc, :dc, :isi, :rh)
60
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
61
+ > df.x
62
+ # => [7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6,...]
63
+ > df.replace!(:x) {|e| e * 3}
64
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
65
+ > df.x
66
+ # => [21, 21, 21, 24, 24, 24, 24, 24, 24, 21, 21, 21, 18, 18, 18,...]
67
+ > df.filter!(:open_struct) {|row| row.x == 24}
68
+ # => DataFrame rows: 61 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
69
+ > df.x
70
+ # => [24, 24, 24, 24, 24, 24, 24, 24, 24,...]
71
+ > new_data_frame = df.subset_from_columns(:x, :y)
72
+ # => DataFrame rows: 61 labels: [:x, :y]
73
+ > new_data_frame.items
74
+ # => [[24, 6], [24, 6], [24, 6], [24, 6], ...]
75
+
76
+
77
+ Note: most of these transformations are not optimized. I plan to use the library for a while before I try to optimize it. That said, I have used it with some fairly large data sets (thousands of rows) without any problems so far.
51
78
 
52
79
  ==Installation
53
80
 
@@ -1,4 +1,4 @@
1
1
  ---
2
2
  :major: 0
3
3
  :minor: 0
4
- :patch: 12
4
+ :patch: 13
@@ -3,6 +3,7 @@ require 'activesupport'
3
3
  require 'just_enumerable_stats'
4
4
  require 'open-uri'
5
5
  require 'fastercsv'
6
+ require 'ostruct'
6
7
 
7
8
  # Use a Dictionary if available
8
9
  begin
@@ -57,6 +58,9 @@ class DataFrame
57
58
  return nil unless contents
58
59
  table = FCSV.parse(contents, default_csv_opts.merge(opts))
59
60
  labels = table.shift
61
+ while table.last.empty?
62
+ table.pop
63
+ end
60
64
  [labels, table]
61
65
  end
62
66
 
@@ -71,6 +75,10 @@ class DataFrame
71
75
  end
72
76
  end
73
77
 
78
+ def inspect
79
+ "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
80
+ end
81
+
74
82
  # The labels of the data items
75
83
  attr_reader :labels
76
84
  alias :variables :labels
@@ -142,14 +150,108 @@ class DataFrame
142
150
  end
143
151
  end
144
152
 
145
- def drop!(label)
153
+ def drop!(*labels)
154
+ labels.each do |label|
155
+ drop_one!(label)
156
+ end
157
+ self
158
+ end
159
+
160
+ def drop_one!(label)
146
161
  i = self.labels.index(label)
147
162
  return nil unless i
148
163
  self.items.each do |item|
149
164
  item.delete_at(i)
150
165
  end
151
166
  self.labels.delete_at(i)
152
- true
167
+ self
168
+ end
169
+ protected :drop_one!
170
+
171
+ def replace!(column, values=nil, &block)
172
+ column = validate_column(column)
173
+ if not values
174
+ values = self.send(column)
175
+ values.map! {|e| block.call(e)}
176
+ end
177
+ replace_column(column, values)
178
+ self
179
+ end
180
+
181
+ def replace_column(column, values)
182
+ column = validate_column(column)
183
+ index = self.labels.index(column)
184
+ list = []
185
+ self.items.each_with_index do |item, i|
186
+ consolidated = item
187
+ consolidated[index] = values[i]
188
+ list << consolidated
189
+ end
190
+ @items = list.dup
191
+ end
192
+ protected :replace_column
193
+
194
+ def validate_column(column)
195
+ column = column.to_sym
196
+ raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
197
+ column
198
+ end
199
+ protected :validate_column
200
+
201
+ # Takes a block to evaluate on each row. The row can be converted into
202
+ # an OpenStruct or a Hash for easier filter methods. Note, don't try this
203
+ # with a hash or open struct unless you have facets available.
204
+ def filter!(as=Array, &block)
205
+ as = infer_class(as)
206
+ items = []
207
+ self.items.each do |row|
208
+ value = block.call(cast_row(row, as))
209
+ items << row if value
210
+ end
211
+ @items = items.dup
212
+ self
213
+ end
214
+
215
+ def infer_class(obj)
216
+ obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
217
+ obj = obj.classify.constantize if obj.is_a?(String)
218
+ obj
219
+ end
220
+ protected :infer_class
221
+
222
+ def cast_row(row, as)
223
+ if as == Hash
224
+ obj = {}
225
+ self.labels.each_with_index do |label, i|
226
+ obj[label] = row[i]
227
+ end
228
+ obj
229
+ elsif as == OpenStruct
230
+ obj = OpenStruct.new
231
+ self.labels.each_with_index do |label, i|
232
+ obj.table[label] = row[i]
233
+ end
234
+ obj
235
+ elsif as == Array
236
+ row
237
+ else
238
+ as.new(row)
239
+ end
240
+ end
241
+ protected :cast_row
242
+
243
+ # Creates a new data frame, only with the specified columns.
244
+ def subset_from_columns(*cols)
245
+ new_labels = self.labels.inject([]) do |list, label|
246
+ list << label if cols.include?(label)
247
+ list
248
+ end
249
+ new_data_frame = DataFrame.new(*self.labels)
250
+ new_data_frame.import(self.items)
251
+ self.labels.each do |label|
252
+ new_data_frame.drop!(label) unless new_labels.include?(label)
253
+ end
254
+ new_data_frame
153
255
  end
154
256
 
155
257
  end
@@ -0,0 +1,5 @@
1
+ class OpenStruct
2
+ def table
3
+ @table
4
+ end
5
+ end
@@ -101,6 +101,14 @@ describe DataFrame do
101
101
  @df.labels.should eql([:threes, :fours])
102
102
  end
103
103
 
104
+ it "should be able to remove more than one column at a time" do
105
+ @df = DataFrame.new :twos, :threes, :fours
106
+ @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
107
+ @df.drop!(:twos, :fours)
108
+ @df.items.all? {|i| i.should eql([3])}
109
+ @df.labels.should eql([:threes])
110
+ end
111
+
104
112
  it "should offer a hash-like structure of columns" do
105
113
  @df.add [1,2,3,4]
106
114
  @df.add [5, 6, 7, 8]
@@ -131,4 +139,56 @@ describe DataFrame do
131
139
  @df.labels.should eql(@labels)
132
140
  @df.variables.should eql(@labels)
133
141
  end
142
+
143
+ context "replace!" do
144
+ before do
145
+ @df.add [1,2,3,4]
146
+ @df.add [5, 6, 7, 8]
147
+ @doubler = lambda{|e| e * 2}
148
+ end
149
+
150
+ it "should only replace columns that actually exist" do
151
+ lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
152
+ ArgumentError, /Must provide the name of an existing column./)
153
+ lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
154
+ end
155
+
156
+ it "should be able to replace a column with a block" do
157
+ @df.replace!(:these) {|e| e * 2}
158
+ @df.these.should eql([2,10])
159
+ end
160
+
161
+ it "should be able to replace a column with an array" do
162
+ @a = [5,9]
163
+ @df.replace!(:these, @a)
164
+ @df.these.should eql(@a)
165
+ end
166
+ end
167
+
168
+ context "filter!" do
169
+ before do
170
+ @df.add [1,2,3,4]
171
+ @df.add [5, 6, 7, 8]
172
+ end
173
+
174
+ it "should be able to filter a data frame with a block" do
175
+ @df.filter!(:open_struct) {|row| row.these == 5}
176
+ @df.items.should eql([[5, 6, 7, 8]])
177
+ end
178
+ end
179
+
180
+ context "subset_from_columns" do
181
+ before do
182
+ @df.add [1,2,3,4]
183
+ @df.add [5, 6, 7, 8]
184
+ end
185
+
186
+ it "should be able to create a subset of columns" do
187
+ new_data_frame = @df.subset_from_columns(:these, :labels)
188
+ new_data_frame.should_not eql(@df)
189
+ new_data_frame.labels.should eql([:these, :labels])
190
+ new_data_frame.items.should eql([[1,4],[5,8]])
191
+ new_data_frame.these.should eql([1,5])
192
+ end
193
+ end
134
194
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: davidrichards-data_frame
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Richards
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-11 00:00:00 -07:00
12
+ date: 2009-08-16 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -58,6 +58,7 @@ files:
58
58
  - lib/data_frame/transposable_array.rb
59
59
  - lib/data_frame.rb
60
60
  - lib/ext
61
+ - lib/ext/open_struct.rb
61
62
  - lib/ext/string.rb
62
63
  - lib/ext/symbol.rb
63
64
  - spec/data_frame