davidrichards-data_frame 0.0.15 → 0.0.17

Sign up to get free protection for your applications and to get access to all the features.
@@ -75,6 +75,21 @@ A lot of the work in the data frame is to transform the actual table. You may n
75
75
 
76
76
 
77
77
  Note: most of these transformations are not optimized. I'll work with things for a while before I try to optimize this library. However, I should say that I've used some fairly large data sets (thousands of rows) and have been fine with things so far.
78
+
79
+ == Models
80
+
81
+ Data Frame can now create sub-models:
82
+
83
+ >> df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
84
+ => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
85
+ >> df.model(:weekend) do |m|
86
+ ?> m.day %w(sat sun)
87
+ >> end
88
+ => DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
89
+ >> df.models.weekend.day.uniq
90
+ => ["sat", "sun"]
91
+ >> df.models
92
+ => #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
78
93
 
79
94
  ==Installation
80
95
 
@@ -1,4 +1,4 @@
1
1
  ---
2
2
  :major: 0
3
3
  :minor: 0
4
- :patch: 15
4
+ :patch: 17
@@ -19,305 +19,6 @@ $:.unshift(File.dirname(__FILE__))
19
19
 
20
20
  require 'data_frame/callback_array'
21
21
  require 'data_frame/transposable_array'
22
-
23
- # This allows me to have named columns and optionally named rows in a
24
- # data frame, to work calculations (usually on the columns), to
25
- # transpose the matrix and store the transposed matrix until the object
26
- # is tainted.
27
- class DataFrame
28
-
29
- class << self
30
-
31
- # This is the neatest part of this neat gem.
32
- # DataFrame.from_csv can be called in a lot of ways:
33
- # DataFrame.from_csv(csv_contents)
34
- # DataFrame.from_csv(filename)
35
- # DataFrame.from_csv(url)
36
- # If you need to define converters for FasterCSV, do it before calling
37
- # this method:
38
- # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
39
- # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
40
- # This returns bar where 'foo' was found and 'foo' everywhere else.
41
- def from_csv(obj, opts={})
42
- labels, table = infer_csv_contents(obj, opts)
43
- return nil unless labels and table
44
- df = new(*labels)
45
- df.import(table)
46
- df
47
- end
48
-
49
- protected
50
- def infer_csv_contents(obj, opts={})
51
- contents = File.read(obj) if File.exist?(obj)
52
- begin
53
- open(obj) {|f| contents = f.read} unless contents
54
- rescue
55
- nil
56
- end
57
- contents ||= obj if obj.is_a?(String)
58
- return nil unless contents
59
- table = FCSV.parse(contents, default_csv_opts.merge(opts))
60
- labels = table.shift
61
- while table.last.empty?
62
- table.pop
63
- end
64
- [labels, table]
65
- end
66
-
67
- def default_csv_opts; {:converters => :all}; end
68
- end
69
-
70
- # Loads a batch of rows. Expects an array of arrays, else you don't
71
- # know what you have.
72
- def import(rows)
73
- rows.each do |row|
74
- self.add_item(row)
75
- end
76
- end
77
-
78
- def inspect
79
- "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
80
- end
81
-
82
- # The labels of the data items
83
- attr_reader :labels
84
- alias :variables :labels
85
-
86
- # The items stored in the frame
87
- attr_reader :items
88
-
89
- def initialize(*labels)
90
- @labels = labels.map {|e| e.to_underscore_sym }
91
- @items = TransposableArray.new
92
- end
93
-
94
- def add_item(item)
95
- self.items << item
96
- end
97
- alias :add :add_item
98
-
99
- def row_labels
100
- @row_labels ||= []
101
- end
102
-
103
- def row_labels=(ary)
104
- raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
105
- @row_labels = ary
106
- end
107
-
108
- def render_column(sym)
109
- i = @labels.index(sym)
110
- return nil unless i
111
- @items.transpose[i]
112
- end
113
-
114
- # The rows as an array of arrays, an alias for items.
115
- alias :rows :items
116
-
117
- # The columns as a Dictionary or Hash
118
- # This is cached, call columns(true) to reset the cache.
119
- def columns(reset=false)
120
- @columns = nil if reset
121
- return @columns if @columns
122
-
123
- container = defined?(Dictionary) ? Dictionary.new : Hash.new
124
- i = 0
125
-
126
- @columns = @items.transpose.inject(container) do |cont, col|
127
- cont[@labels[i]] = col
128
- i += 1
129
- cont
130
- end
131
- end
132
- alias :to_hash :columns
133
- alias :to_dictionary :columns
134
-
135
- def render_row(sym)
136
- i = self.row_labels.index(sym)
137
- return nil unless i
138
- @items[i]
139
- end
140
-
141
- def method_missing(sym, *args, &block)
142
- if self.labels.include?(sym)
143
- render_column(sym)
144
- elsif self.row_labels.include?(sym)
145
- render_row(sym)
146
- elsif @items.respond_to?(sym)
147
- @items.send(sym, *args, &block)
148
- else
149
- super
150
- end
151
- end
152
-
153
- def drop!(*labels)
154
- labels.each do |label|
155
- drop_one!(label)
156
- end
157
- self
158
- end
159
-
160
- def drop_one!(label)
161
- i = self.labels.index(label)
162
- return nil unless i
163
- self.items.each do |item|
164
- item.delete_at(i)
165
- end
166
- self.labels.delete_at(i)
167
- self
168
- end
169
- protected :drop_one!
170
-
171
- def replace!(column, values=nil, &block)
172
- column = validate_column(column)
173
- if not values
174
- values = self.send(column)
175
- values.map! {|e| block.call(e)}
176
- end
177
- replace_column(column, values)
178
- self
179
- end
180
-
181
- def replace_column(column, values)
182
- column = validate_column(column)
183
- index = self.labels.index(column)
184
- list = []
185
- self.items.each_with_index do |item, i|
186
- consolidated = item
187
- consolidated[index] = values[i]
188
- list << consolidated
189
- end
190
- @items = list.dup
191
- end
192
- protected :replace_column
193
-
194
- def validate_column(column)
195
- column = column.to_sym
196
- raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
197
- column
198
- end
199
- protected :validate_column
200
-
201
- # Takes a block to evaluate on each row. The row can be converted into
202
- # an OpenStruct or a Hash for easier filter methods. Note, don't try this
203
- # with a hash or open struct unless you have facets available.
204
- def filter!(as=Array, &block)
205
- as = infer_class(as)
206
- items = []
207
- self.items.each do |row|
208
- value = block.call(cast_row(row, as))
209
- items << row if value
210
- end
211
- @items = items.dup
212
- self
213
- end
214
-
215
- def filter(as=Array, &block)
216
- new_data_frame = self.clone
217
- new_data_frame.filter!(as, &block)
218
- end
219
-
220
- def infer_class(obj)
221
- obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
222
- obj = obj.classify.constantize if obj.is_a?(String)
223
- obj
224
- end
225
- protected :infer_class
226
-
227
- def cast_row(row, as)
228
- if as == Hash
229
- obj = {}
230
- self.labels.each_with_index do |label, i|
231
- obj[label] = row[i]
232
- end
233
- obj
234
- elsif as == OpenStruct
235
- obj = OpenStruct.new
236
- self.labels.each_with_index do |label, i|
237
- obj.table[label] = row[i]
238
- end
239
- obj
240
- elsif as == Array
241
- row
242
- else
243
- as.new(*row)
244
- end
245
- end
246
- protected :cast_row
247
-
248
- # Creates a new data frame, only with the specified columns.
249
- def subset_from_columns(*cols)
250
- new_labels = self.labels.inject([]) do |list, label|
251
- list << label if cols.include?(label)
252
- list
253
- end
254
- new_data_frame = DataFrame.new(*self.labels)
255
- new_data_frame.import(self.items)
256
- self.labels.each do |label|
257
- new_data_frame.drop!(label) unless new_labels.include?(label)
258
- end
259
- new_data_frame
260
- end
261
-
262
- # A weird name. This creates a column for every category in a column
263
- # and marks each row by its value
264
- def j_binary_ize!(*columns)
265
- # Allows to mix a hash with the columns.
266
- options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
267
- columns.delete_if {|e| e.is_a?(Hash)}
268
-
269
- # Generates new columns
270
- columns.each do |col|
271
- values = render_column(col.to_underscore_sym)
272
- values.categories.each do |category|
273
- full_name = (col.to_s + "_" + category.to_s).to_sym
274
- if options[:allow_overlap]
275
- category_map = values.inject([]) do |list, e|
276
- list << values.all_categories(e)
277
- end
278
- self.append!(full_name, category_map.map{|e| e.include?(category)})
279
- else
280
- self.append!(full_name, values.category_map.map{|e| e == category})
281
- end
282
- end
283
- end
284
- end
285
-
286
- # Adds a unique column to the table
287
- def append!(column_name, value=nil)
288
- raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
289
- self.labels << column_name.to_underscore_sym
290
- if value.is_a?(Array)
291
- self.items.each_with_index do |item, i|
292
- item << value[i]
293
- end
294
- else
295
- self.items.each do |item|
296
- item << value
297
- end
298
- end
299
- # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
300
- self.items.taint
301
- end
302
-
303
- def filter_by_category(hash)
304
- new_data_frame = self.dup
305
- hash.each do |key, value|
306
- key = key.to_underscore_sym
307
- next unless self.labels.include?(key)
308
- value = [value] unless value.is_a?(Array) or value.is_a?(Range)
309
- new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
310
- end
311
- new_data_frame
312
- end
313
-
314
- def filter_by_category!(hash)
315
- hash.each do |key, value|
316
- key = key.to_underscore_sym
317
- next unless self.labels.include?(key)
318
- value = [value] unless value.is_a?(Array) or value.is_a?(Range)
319
- self.filter!(:hash) {|row| value.include?(row[key])}
320
- end
321
- end
322
-
323
- end
22
+ require 'data_frame/parameter_capture'
23
+ require 'data_frame/data_frame'
24
+ require 'data_frame/model'
@@ -0,0 +1,301 @@
1
# A table of rows with named columns and, optionally, named rows.
#
# Rows are kept in a TransposableArray (see #items); columns are derived
# from it by transposition. Per the original author's notes the container
# stores the transposed matrix until it is tainted, so every method here
# that edits rows in place taints the container afterwards.
#
# Columns (and named rows) can be read as methods, e.g. +df.temp+; anything
# else the row container understands is delegated to it.
class DataFrame

  class << self

    # Builds a DataFrame from CSV. The first CSV row supplies the labels.
    #
    # DataFrame.from_csv can be called in a lot of ways:
    #   DataFrame.from_csv(csv_contents)
    #   DataFrame.from_csv(filename)
    #   DataFrame.from_csv(url)
    # If you need to define converters for FasterCSV, do it before calling
    # this method:
    #   FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
    #   DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
    # This returns bar where 'foo' was found and 'foo' everywhere else.
    #
    # obj::  CSV text, a file name, or anything Kernel#open can read.
    # opts:: options merged over default_csv_opts and handed to FCSV.parse.
    #
    # Returns the new DataFrame, or nil when no header row could be read.
    def from_csv(obj, opts={})
      labels, table = infer_csv_contents(obj, opts)
      return nil unless labels && table
      df = new(*labels)
      df.import(table)
      df
    end

    protected

    # Resolves +obj+ to CSV text and parses it.
    #
    # Tries, in order: an existing local file, Kernel#open, and finally the
    # string itself. Returns [labels, rows], or nil when nothing was readable.
    def infer_csv_contents(obj, opts={})
      contents = nil

      # Only path-like objects are probed on disk; anything else (nil, a
      # number, ...) used to blow up inside File.exist?.
      if obj.is_a?(String) || obj.respond_to?(:to_path)
        contents = File.read(obj) if File.exist?(obj)
      end

      unless contents
        begin
          # NOTE(security): Kernel#open treats a string starting with "|" as
          # a command to run. Do not pass untrusted input to from_csv.
          # NOTE(review): opening URLs this way presumably relies on open-uri
          # being loaded elsewhere -- confirm.
          open(obj) {|f| contents = f.read}
        rescue
          # Deliberate best effort: fall through to treating obj as raw CSV.
          nil
        end
      end

      contents ||= obj if obj.is_a?(String)
      return nil unless contents

      table  = FCSV.parse(contents, default_csv_opts.merge(opts))
      labels = table.shift
      # Drop trailing blank rows. The nil check matters: with header-only or
      # empty input the table runs out and table.last is nil.
      table.pop while table.last && table.last.empty?
      [labels, table]
    end

    # Parser options used unless the caller overrides them.
    def default_csv_opts; {:converters => :all}; end
  end

  # The labels of the data items
  attr_reader :labels
  alias :variables :labels

  # The items stored in the frame
  attr_reader :items

  # The rows as an array of arrays, an alias for items.
  alias :rows :items

  # labels:: column names; each is normalised with to_underscore_sym.
  def initialize(*labels)
    @labels = labels.map {|e| e.to_underscore_sym }
    @items = TransposableArray.new
  end

  # Makes dup/clone produce an independent frame: own label list, own row
  # arrays, and no inherited columns cache. Without this, filtering or
  # dropping columns on a copy silently edited the original as well.
  def initialize_copy(source)
    super
    @labels  = source.labels.dup
    @items   = rebuild_items(source.items.map {|row| row.dup})
    @columns = nil
  end

  # Loads a batch of rows. Expects an array of arrays, else you don't
  # know what you have.
  def import(rows)
    rows.each do |row|
      self.add_item(row)
    end
  end

  def inspect
    "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
  end

  # Appends a single row.
  def add_item(item)
    self.items << item
  end
  alias :add :add_item

  # Optional names for the rows; empty until assigned.
  def row_labels
    @row_labels ||= []
  end

  # Raises ArgumentError unless given an Array.
  def row_labels=(ary)
    raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
    @row_labels = ary
  end

  # Returns the values of the column labelled +sym+, or nil if there is no
  # such label.
  def render_column(sym)
    i = @labels.index(sym)
    return nil unless i
    @items.transpose[i]
  end

  # The columns as a Dictionary or Hash
  # This is cached, call columns(true) to reset the cache.
  def columns(reset=false)
    @columns = nil if reset
    return @columns if @columns

    container = defined?(Dictionary) ? Dictionary.new : Hash.new
    @items.transpose.each_with_index do |col, i|
      container[@labels[i]] = col
    end
    @columns = container
  end
  alias :to_hash :columns
  alias :to_dictionary :columns

  # Returns the row named +sym+ in row_labels, or nil if there is none.
  def render_row(sym)
    i = self.row_labels.index(sym)
    return nil unless i
    @items[i]
  end

  # Column labels first, then row labels, then whatever the row container
  # itself responds to.
  def method_missing(sym, *args, &block)
    if self.labels.include?(sym)
      render_column(sym)
    elsif self.row_labels.include?(sym)
      render_row(sym)
    elsif @items.respond_to?(sym)
      @items.send(sym, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? in step with method_missing.
  def respond_to_missing?(sym, include_private=false)
    self.labels.include?(sym) ||
      self.row_labels.include?(sym) ||
      @items.respond_to?(sym) ||
      super
  end

  # Removes the named columns in place; unknown labels are ignored.
  # Returns self.
  def drop!(*labels)
    labels.each do |label|
      drop_one!(label)
    end
    self
  end

  # Removes one column in place. Returns nil when the label is unknown,
  # self otherwise.
  def drop_one!(label)
    i = self.labels.index(label)
    return nil unless i
    self.items.each do |item|
      item.delete_at(i)
    end
    self.labels.delete_at(i)
    # Rows were edited in place, so the container has to be told.
    touch_items
    self
  end
  protected :drop_one!

  # Replaces a column in place, either with +values+ (one per row) or with
  # the block's result for each current value. Returns self.
  #
  # Raises ArgumentError for an unknown column, or when neither values nor
  # a block is supplied.
  def replace!(column, values=nil, &block)
    column = validate_column(column)
    unless values
      raise ArgumentError, "Must provide either replacement values or a block" unless block
      # render_column rather than send(column): a label such as :filter or
      # :labels would otherwise hit the method of that name.
      values = render_column(column).map {|e| block.call(e)}
    end
    replace_column(column, values)
    self
  end

  # Writes values[i] into +column+ of row i, then swaps in a fresh row
  # container holding the same rows.
  def replace_column(column, values)
    column = validate_column(column)
    index = self.labels.index(column)
    self.items.each_with_index do |item, i|
      item[index] = values[i]
    end
    @items = rebuild_items(self.items)
  end
  protected :replace_column

  # Returns +column+ as a symbol, raising ArgumentError if it is not a label.
  def validate_column(column)
    column = column.to_sym
    raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
    column
  end
  protected :validate_column

  # Keeps only the rows for which the block is truthy. Each row is handed
  # to the block as +as+: Array (default, the raw row), Hash or OpenStruct
  # keyed by label, or any other class built with as.new(*row).
  # Returns self.
  def filter!(as=Array, &block)
    as = infer_class(as)
    kept = self.items.select {|row| block.call(cast_row(row, as))}
    # Keep the container type so later delegation and tainting still work.
    @items = rebuild_items(kept)
    self
  end

  # Non-destructive filter!: returns a filtered copy.
  def filter(as=Array, &block)
    new_data_frame = self.clone
    new_data_frame.filter!(as, &block)
  end

  # Turns :hash / "open_struct" style names into classes; classes pass
  # through untouched. Relies on String#classify and #constantize, which
  # are defined outside this file.
  def infer_class(obj)
    obj = obj.to_s.classify.constantize if obj.is_a?(Symbol) || obj.is_a?(String)
    obj
  end
  protected :infer_class

  # Presents +row+ as an instance of +as+ (see filter!).
  def cast_row(row, as)
    if as == Array
      row
    elsif as == Hash
      Hash[self.labels.zip(row)]
    elsif defined?(OpenStruct) && as == OpenStruct
      # Built through the constructor; OpenStruct#table is not public in
      # stock Ruby.
      OpenStruct.new(Hash[self.labels.zip(row)])
    else
      as.new(*row)
    end
  end
  protected :cast_row

  # Creates a new data frame, only with the specified columns, kept in this
  # frame's column order. The new frame gets its own row arrays, so this
  # frame is left untouched.
  def subset_from_columns(*cols)
    keep = (0...self.labels.size).select {|i| cols.include?(self.labels[i])}
    new_data_frame = DataFrame.new(*keep.map {|i| self.labels[i]})
    new_data_frame.import(self.items.map {|row| row.values_at(*keep)})
    new_data_frame
  end

  # A weird name. This creates a column for every category in a column
  # and marks each row by its value.
  #
  # columns:: column names, optionally mixed with option hashes. With
  #           :allow_overlap => true a row is flagged for every category
  #           returned by all_categories, otherwise only for the one
  #           reported by category_map.
  #
  # categories, all_categories and category_map are extensions defined
  # outside this file.
  def j_binary_ize!(*columns)
    # Option hashes may be mixed in with the column names.
    option_hashes, names = columns.partition {|e| e.is_a?(Hash)}
    options = option_hashes.inject({}) {|h, e| h.merge!(e)}

    # Generates new columns
    names.each do |col|
      values = render_column(col.to_underscore_sym)
      memberships = nil # per-column, computed once on first use
      values.categories.each do |category|
        full_name = (col.to_s + "_" + category.to_s).to_sym
        if options[:allow_overlap]
          memberships ||= values.map {|e| values.all_categories(e)}
          self.append!(full_name, memberships.map {|e| e.include?(category)})
        else
          memberships ||= values.category_map
          self.append!(full_name, memberships.map {|e| e == category})
        end
      end
    end
  end

  # Adds a unique column to the table. An Array +value+ supplies one entry
  # per row; anything else is repeated on every row.
  #
  # Raises ArgumentError if the normalised name is already a label.
  def append!(column_name, value=nil)
    # Normalise first, so "a" cannot slip past a check against :a.
    column_name = column_name.to_underscore_sym
    raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
    self.labels << column_name
    if value.is_a?(Array)
      self.items.each_with_index do |item, i|
        item << value[i]
      end
    else
      self.items.each do |item|
        item << value
      end
    end
    # The rows were edited in place, so the container doesn't know it has
    # been changed.
    touch_items
  end

  # Returns a copy holding only the rows whose value for each key is
  # included in the given value (scalar, Array or Range). Keys that are not
  # labels are ignored.
  def filter_by_category(hash)
    new_data_frame = self.dup
    new_data_frame.filter_by_category!(hash)
    new_data_frame
  end

  # In-place version of filter_by_category. Returns the hash it was given
  # (kept as-is for backward compatibility).
  def filter_by_category!(hash)
    hash.each do |key, value|
      key = key.to_underscore_sym
      next unless self.labels.include?(key)
      value = [value] unless value.is_a?(Array) || value.is_a?(Range)
      self.filter!(:hash) {|row| value.include?(row[key])}
    end
  end

  private

  # Returns a new TransposableArray holding +rows+ (the same row objects).
  def rebuild_items(rows)
    fresh = TransposableArray.new
    rows.each {|row| fresh << row}
    fresh
  end

  # Taints the row container after in-place row edits. Guarded because
  # plain objects no longer have #taint on recent Rubies.
  def touch_items
    @items.respond_to?(:taint) ? @items.taint : @items
  end

end