davidrichards-data_frame 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -48,6 +48,33 @@ To get your feet wet, you may want to play with data sets found here:
48
48
 
49
49
  http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html
50
50
 
51
+ == Transformations
52
+
53
+ Much of the work done with a data frame is transforming the underlying table. You may need to drop columns, filter rows, replace the values in a column, or create a new data frame based on an existing one. Here's how to do that:
54
+
55
+ > df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
56
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
57
+ > df.drop!(:ffmc)
58
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
59
+ > df.drop!(:dmc, :dc, :isi, :rh)
60
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
61
+ > df.x
62
+ # => [7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6,...]
63
+ > df.replace!(:x) {|e| e * 3}
64
+ # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
65
+ > df.x
66
+ # => [21, 21, 21, 24, 24, 24, 24, 24, 24, 21, 21, 21, 18, 18, 18,...]
67
+ > df.filter!(:open_struct) {|row| row.x == 24}
68
+ # => DataFrame rows: 61 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
69
+ > df.x
70
+ # => [24, 24, 24, 24, 24, 24, 24, 24, 24,...]
71
+ > new_data_frame = df.subset_from_columns(:x, :y)
72
+ # => DataFrame rows: 61 labels: [:x, :y]
73
+ > new_data_frame.items
74
+ # => [[24, 6], [24, 6], [24, 6], [24, 6], ...]
75
+
76
+
77
+ Note: most of these transformations are not optimized. I plan to use the library for a while before trying to optimize it. That said, I have used it with fairly large data sets (thousands of rows) without problems so far.
51
78
 
52
79
  ==Installation
53
80
 
@@ -1,4 +1,4 @@
1
1
  ---
2
2
  :major: 0
3
3
  :minor: 0
4
- :patch: 12
4
+ :patch: 13
@@ -3,6 +3,7 @@ require 'activesupport'
3
3
  require 'just_enumerable_stats'
4
4
  require 'open-uri'
5
5
  require 'fastercsv'
6
+ require 'ostruct'
6
7
 
7
8
  # Use a Dictionary if available
8
9
  begin
@@ -57,6 +58,9 @@ class DataFrame
57
58
  return nil unless contents
58
59
  table = FCSV.parse(contents, default_csv_opts.merge(opts))
59
60
  labels = table.shift
61
+ while table.last.empty?
62
+ table.pop
63
+ end
60
64
  [labels, table]
61
65
  end
62
66
 
@@ -71,6 +75,10 @@ class DataFrame
71
75
  end
72
76
  end
73
77
 
78
# One-line summary of the frame: the number of rows followed by the
# inspected list of column labels.
def inspect
  row_count = rows.size
  label_list = labels.inspect
  "DataFrame rows: #{row_count} labels: #{label_list}"
end
81
+
74
82
  # The labels of the data items
75
83
  attr_reader :labels
76
84
  alias :variables :labels
@@ -142,14 +150,108 @@ class DataFrame
142
150
  end
143
151
  end
144
152
 
145
- def drop!(label)
153
# Removes each of the named columns, in the order given, by delegating to
# drop_one!. Always returns the frame itself so calls can be chained.
def drop!(*labels)
  labels.each { |name| drop_one!(name) }
  self
end
159
+
160
# Removes a single column. Returns nil (and changes nothing) when the
# label is not present; otherwise deletes that position from every row and
# from the label list, then returns the frame.
def drop_one!(label)
  position = labels.index(label)
  return nil if position.nil?
  items.each { |row| row.delete_at(position) }
  labels.delete_at(position)
  self
end
169
+ protected :drop_one!
170
+
171
# Replaces the contents of one column and returns the frame.
#
# column - name of an existing column (anything responding to to_sym);
#          validate_column raises ArgumentError for unknown names.
# values - optional list of replacement values, one per row.
# block  - used only when values is not given: called with each current
#          value of the column; its results become the new values.
#
# Raises ArgumentError when neither values nor a block is supplied
# (previously this failed with a NoMethodError on nil).
def replace!(column, values=nil, &block)
  column = validate_column(column)
  unless values
    raise ArgumentError, "Must provide either replacement values or a block" unless block
    # Build a new list rather than mutating whatever the column reader
    # returned.
    values = send(column).map { |e| block.call(e) }
  end
  replace_column(column, values)
  self
end
180
+
181
# Writes values[i] into the given column of the i-th row (rows are updated
# in place) and re-assigns @items to a new list holding those rows.
# The column name is validated first.
def replace_column(column, values)
  column = validate_column(column)
  position = labels.index(column)
  updated = items.each_with_index.map do |row, row_number|
    row[position] = values[row_number]
    row
  end
  @items = updated
end
192
+ protected :replace_column
193
+
194
# Converts the given name to a symbol and returns it when it is one of the
# frame's labels; raises ArgumentError otherwise.
def validate_column(column)
  name = column.to_sym
  return name if labels.include?(name)
  raise ArgumentError, "Must provide the name of an existing column. Provided #{name.inspect}, needed to provide one of #{labels.inspect}"
end
199
+ protected :validate_column
200
+
201
# Keeps only the rows for which the block returns a truthy value, and
# returns the frame. Before being handed to the block, each row is passed
# through cast_row using the class resolved from `as` by infer_class; the
# default, Array, yields the row unchanged.
# Note (from the original author): don't try this with a hash or open
# struct unless you have facets available.
def filter!(as=Array, &block)
  row_class = infer_class(as)
  kept = items.select { |row| block.call(cast_row(row, row_class)) }
  @items = kept
  self
end
214
+
215
# Resolves a Symbol or String to a constant through classify/constantize;
# any other object is returned untouched.
def infer_class(obj)
  case obj
  when Symbol, String
    obj.to_s.classify.constantize
  else
    obj
  end
end
220
+ protected :infer_class
221
+
222
# Converts one row (an Array of values) into the requested representation.
#
# row - the row's values, positionally aligned with the frame's labels.
# as  - Hash:       returns a hash of label => value
#       OpenStruct: returns an OpenStruct with one attribute per label
#       Array:      returns the row itself, unchanged
#       otherwise:  returns as.new(row)
#
# The OpenStruct is built through OpenStruct.new(hash), the public
# constructor, instead of writing into its internal table directly, so
# attribute readers work regardless of how OpenStruct stores its members.
def cast_row(row, as)
  if as == Hash
    Hash[labels.zip(row)]
  elsif as == OpenStruct
    OpenStruct.new(Hash[labels.zip(row)])
  elsif as == Array
    row
  else
    as.new(row)
  end
end
241
+ protected :cast_row
242
+
243
# Creates a new data frame, only with the specified columns.
#
# cols - labels of the columns to keep. Labels that are not part of this
#        frame are ignored; the kept columns retain this frame's order.
#
# Returns a new DataFrame; this frame is left unchanged.
def subset_from_columns(*cols)
  unwanted = labels.reject { |label| cols.include?(label) }
  new_data_frame = DataFrame.new(*labels)
  # Hand over copies of the rows: columns are removed below by deleting
  # from each row in place, and import is not guaranteed (from what is
  # visible here) to copy the rows itself.
  new_data_frame.import(items.map { |row| row.dup })
  unwanted.each { |label| new_data_frame.drop!(label) }
  new_data_frame
end
154
256
 
155
257
  end
@@ -0,0 +1,5 @@
1
# Reopens OpenStruct to expose its @table instance variable through a
# public reader.
class OpenStruct
  attr_reader :table
end
@@ -101,6 +101,14 @@ describe DataFrame do
101
101
  @df.labels.should eql([:threes, :fours])
102
102
  end
103
103
 
104
+ it "should be able to remove more than one column at a time" do
105
+ @df = DataFrame.new :twos, :threes, :fours
106
+ @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
107
+ @df.drop!(:twos, :fours)
108
+ @df.items.all? {|i| i.should eql([3])}
109
+ @df.labels.should eql([:threes])
110
+ end
111
+
104
112
  it "should offer a hash-like structure of columns" do
105
113
  @df.add [1,2,3,4]
106
114
  @df.add [5, 6, 7, 8]
@@ -131,4 +139,56 @@ describe DataFrame do
131
139
  @df.labels.should eql(@labels)
132
140
  @df.variables.should eql(@labels)
133
141
  end
142
+
143
+ context "replace!" do
144
+ before do
145
+ @df.add [1,2,3,4]
146
+ @df.add [5, 6, 7, 8]
147
+ @doubler = lambda{|e| e * 2}
148
+ end
149
+
150
+ it "should only replace columns that actually exist" do
151
+ lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
152
+ ArgumentError, /Must provide the name of an existing column./)
153
+ lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
154
+ end
155
+
156
+ it "should be able to replace a column with a block" do
157
+ @df.replace!(:these) {|e| e * 2}
158
+ @df.these.should eql([2,10])
159
+ end
160
+
161
+ it "should be able to replace a column with an array" do
162
+ @a = [5,9]
163
+ @df.replace!(:these, @a)
164
+ @df.these.should eql(@a)
165
+ end
166
+ end
167
+
168
+ context "filter!" do
169
+ before do
170
+ @df.add [1,2,3,4]
171
+ @df.add [5, 6, 7, 8]
172
+ end
173
+
174
+ it "should be able to filter a data frame with a block" do
175
+ @df.filter!(:open_struct) {|row| row.these == 5}
176
+ @df.items.should eql([[5, 6, 7, 8]])
177
+ end
178
+ end
179
+
180
+ context "subset_from_columns" do
181
+ before do
182
+ @df.add [1,2,3,4]
183
+ @df.add [5, 6, 7, 8]
184
+ end
185
+
186
+ it "should be able to create a subset of columns" do
187
+ new_data_frame = @df.subset_from_columns(:these, :labels)
188
+ new_data_frame.should_not eql(@df)
189
+ new_data_frame.labels.should eql([:these, :labels])
190
+ new_data_frame.items.should eql([[1,4],[5,8]])
191
+ new_data_frame.these.should eql([1,5])
192
+ end
193
+ end
134
194
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: davidrichards-data_frame
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Richards
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-11 00:00:00 -07:00
12
+ date: 2009-08-16 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -58,6 +58,7 @@ files:
58
58
  - lib/data_frame/transposable_array.rb
59
59
  - lib/data_frame.rb
60
60
  - lib/ext
61
+ - lib/ext/open_struct.rb
61
62
  - lib/ext/string.rb
62
63
  - lib/ext/symbol.rb
63
64
  - spec/data_frame