davidrichards-data_frame 0.0.15 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -75,6 +75,21 @@ A lot of the work in the data frame is to transform the actual table. You may n
75
75
 
76
76
 
77
77
  Note: most of these transformations are not optimized. I'll work with things for a while before I try to optimize this library. However, I should say that I've used some fairly large data sets (thousands of rows) and have been fine with things so far.
78
+
79
+ == Models
80
+
81
+ Data Frame can now create sub-models:
82
+
83
+ >> df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
84
+ => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
85
+ >> df.model(:weekend) do |m|
86
+ ?> m.day %w(sat sun)
87
+ >> end
88
+ => DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
89
+ >> df.models.weekend.day.uniq
90
+ => ["sat", "sun"]
91
+ >> df.models
92
+ => #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
78
93
 
79
94
  ==Installation
80
95
 
@@ -1,4 +1,4 @@
1
1
  ---
2
2
  :major: 0
3
3
  :minor: 0
4
- :patch: 15
4
+ :patch: 17
@@ -19,305 +19,6 @@ $:.unshift(File.dirname(__FILE__))
19
19
 
20
20
  require 'data_frame/callback_array'
21
21
  require 'data_frame/transposable_array'
22
-
23
- # This allows me to have named columns and optionally named rows in a
24
- # data frame, to work calculations (usually on the columns), to
25
- # transpose the matrix and store the transposed matrix until the object
26
- # is tainted.
27
- class DataFrame
28
-
29
- class << self
30
-
31
- # This is the neatest part of this neat gem.
32
- # DataFrame.from_csv can be called in a lot of ways:
33
- # DataFrame.from_csv(csv_contents)
34
- # DataFrame.from_csv(filename)
35
- # DataFrame.from_csv(url)
36
- # If you need to define converters for FasterCSV, do it before calling
37
- # this method:
38
- # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
39
- # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
40
- # This returns bar where 'foo' was found and 'foo' everywhere else.
41
- def from_csv(obj, opts={})
42
- labels, table = infer_csv_contents(obj, opts)
43
- return nil unless labels and table
44
- df = new(*labels)
45
- df.import(table)
46
- df
47
- end
48
-
49
- protected
50
- def infer_csv_contents(obj, opts={})
51
- contents = File.read(obj) if File.exist?(obj)
52
- begin
53
- open(obj) {|f| contents = f.read} unless contents
54
- rescue
55
- nil
56
- end
57
- contents ||= obj if obj.is_a?(String)
58
- return nil unless contents
59
- table = FCSV.parse(contents, default_csv_opts.merge(opts))
60
- labels = table.shift
61
- while table.last.empty?
62
- table.pop
63
- end
64
- [labels, table]
65
- end
66
-
67
- def default_csv_opts; {:converters => :all}; end
68
- end
69
-
70
- # Loads a batch of rows. Expects an array of arrays, else you don't
71
- # know what you have.
72
- def import(rows)
73
- rows.each do |row|
74
- self.add_item(row)
75
- end
76
- end
77
-
78
- def inspect
79
- "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
80
- end
81
-
82
- # The labels of the data items
83
- attr_reader :labels
84
- alias :variables :labels
85
-
86
- # The items stored in the frame
87
- attr_reader :items
88
-
89
- def initialize(*labels)
90
- @labels = labels.map {|e| e.to_underscore_sym }
91
- @items = TransposableArray.new
92
- end
93
-
94
- def add_item(item)
95
- self.items << item
96
- end
97
- alias :add :add_item
98
-
99
- def row_labels
100
- @row_labels ||= []
101
- end
102
-
103
- def row_labels=(ary)
104
- raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
105
- @row_labels = ary
106
- end
107
-
108
- def render_column(sym)
109
- i = @labels.index(sym)
110
- return nil unless i
111
- @items.transpose[i]
112
- end
113
-
114
- # The rows as an array of arrays, an alias for items.
115
- alias :rows :items
116
-
117
- # The columns as a Dictionary or Hash
118
- # This is cached, call columns(true) to reset the cache.
119
- def columns(reset=false)
120
- @columns = nil if reset
121
- return @columns if @columns
122
-
123
- container = defined?(Dictionary) ? Dictionary.new : Hash.new
124
- i = 0
125
-
126
- @columns = @items.transpose.inject(container) do |cont, col|
127
- cont[@labels[i]] = col
128
- i += 1
129
- cont
130
- end
131
- end
132
- alias :to_hash :columns
133
- alias :to_dictionary :columns
134
-
135
- def render_row(sym)
136
- i = self.row_labels.index(sym)
137
- return nil unless i
138
- @items[i]
139
- end
140
-
141
- def method_missing(sym, *args, &block)
142
- if self.labels.include?(sym)
143
- render_column(sym)
144
- elsif self.row_labels.include?(sym)
145
- render_row(sym)
146
- elsif @items.respond_to?(sym)
147
- @items.send(sym, *args, &block)
148
- else
149
- super
150
- end
151
- end
152
-
153
- def drop!(*labels)
154
- labels.each do |label|
155
- drop_one!(label)
156
- end
157
- self
158
- end
159
-
160
- def drop_one!(label)
161
- i = self.labels.index(label)
162
- return nil unless i
163
- self.items.each do |item|
164
- item.delete_at(i)
165
- end
166
- self.labels.delete_at(i)
167
- self
168
- end
169
- protected :drop_one!
170
-
171
- def replace!(column, values=nil, &block)
172
- column = validate_column(column)
173
- if not values
174
- values = self.send(column)
175
- values.map! {|e| block.call(e)}
176
- end
177
- replace_column(column, values)
178
- self
179
- end
180
-
181
- def replace_column(column, values)
182
- column = validate_column(column)
183
- index = self.labels.index(column)
184
- list = []
185
- self.items.each_with_index do |item, i|
186
- consolidated = item
187
- consolidated[index] = values[i]
188
- list << consolidated
189
- end
190
- @items = list.dup
191
- end
192
- protected :replace_column
193
-
194
- def validate_column(column)
195
- column = column.to_sym
196
- raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
197
- column
198
- end
199
- protected :validate_column
200
-
201
- # Takes a block to evaluate on each row. The row can be converted into
202
- # an OpenStruct or a Hash for easier filter methods. Note, don't try this
203
- # with a hash or open struct unless you have facets available.
204
- def filter!(as=Array, &block)
205
- as = infer_class(as)
206
- items = []
207
- self.items.each do |row|
208
- value = block.call(cast_row(row, as))
209
- items << row if value
210
- end
211
- @items = items.dup
212
- self
213
- end
214
-
215
- def filter(as=Array, &block)
216
- new_data_frame = self.clone
217
- new_data_frame.filter!(as, &block)
218
- end
219
-
220
- def infer_class(obj)
221
- obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
222
- obj = obj.classify.constantize if obj.is_a?(String)
223
- obj
224
- end
225
- protected :infer_class
226
-
227
- def cast_row(row, as)
228
- if as == Hash
229
- obj = {}
230
- self.labels.each_with_index do |label, i|
231
- obj[label] = row[i]
232
- end
233
- obj
234
- elsif as == OpenStruct
235
- obj = OpenStruct.new
236
- self.labels.each_with_index do |label, i|
237
- obj.table[label] = row[i]
238
- end
239
- obj
240
- elsif as == Array
241
- row
242
- else
243
- as.new(*row)
244
- end
245
- end
246
- protected :cast_row
247
-
248
- # Creates a new data frame, only with the specified columns.
249
- def subset_from_columns(*cols)
250
- new_labels = self.labels.inject([]) do |list, label|
251
- list << label if cols.include?(label)
252
- list
253
- end
254
- new_data_frame = DataFrame.new(*self.labels)
255
- new_data_frame.import(self.items)
256
- self.labels.each do |label|
257
- new_data_frame.drop!(label) unless new_labels.include?(label)
258
- end
259
- new_data_frame
260
- end
261
-
262
- # A weird name. This creates a column for every category in a column
263
- # and marks each row by its value
264
- def j_binary_ize!(*columns)
265
- # Allows to mix a hash with the columns.
266
- options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
267
- columns.delete_if {|e| e.is_a?(Hash)}
268
-
269
- # Generates new columns
270
- columns.each do |col|
271
- values = render_column(col.to_underscore_sym)
272
- values.categories.each do |category|
273
- full_name = (col.to_s + "_" + category.to_s).to_sym
274
- if options[:allow_overlap]
275
- category_map = values.inject([]) do |list, e|
276
- list << values.all_categories(e)
277
- end
278
- self.append!(full_name, category_map.map{|e| e.include?(category)})
279
- else
280
- self.append!(full_name, values.category_map.map{|e| e == category})
281
- end
282
- end
283
- end
284
- end
285
-
286
- # Adds a unique column to the table
287
- def append!(column_name, value=nil)
288
- raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
289
- self.labels << column_name.to_underscore_sym
290
- if value.is_a?(Array)
291
- self.items.each_with_index do |item, i|
292
- item << value[i]
293
- end
294
- else
295
- self.items.each do |item|
296
- item << value
297
- end
298
- end
299
- # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
300
- self.items.taint
301
- end
302
-
303
- def filter_by_category(hash)
304
- new_data_frame = self.dup
305
- hash.each do |key, value|
306
- key = key.to_underscore_sym
307
- next unless self.labels.include?(key)
308
- value = [value] unless value.is_a?(Array) or value.is_a?(Range)
309
- new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
310
- end
311
- new_data_frame
312
- end
313
-
314
- def filter_by_category!(hash)
315
- hash.each do |key, value|
316
- key = key.to_underscore_sym
317
- next unless self.labels.include?(key)
318
- value = [value] unless value.is_a?(Array) or value.is_a?(Range)
319
- self.filter!(:hash) {|row| value.include?(row[key])}
320
- end
321
- end
322
-
323
- end
22
+ require 'data_frame/parameter_capture'
23
+ require 'data_frame/data_frame'
24
+ require 'data_frame/model'
@@ -0,0 +1,301 @@
1
+ # This allows me to have named columns and optionally named rows in a
2
+ # data frame, to work calculations (usually on the columns), to
3
+ # transpose the matrix and store the transposed matrix until the object
4
+ # is tainted.
5
+ class DataFrame
6
+
7
+ class << self
8
+
9
+ # This is the neatest part of this neat gem.
10
+ # DataFrame.from_csv can be called in a lot of ways:
11
+ # DataFrame.from_csv(csv_contents)
12
+ # DataFrame.from_csv(filename)
13
+ # DataFrame.from_csv(url)
14
+ # If you need to define converters for FasterCSV, do it before calling
15
+ # this method:
16
+ # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
17
+ # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
18
+ # This returns bar where 'foo' was found and 'foo' everywhere else.
19
+ def from_csv(obj, opts={})
20
+ labels, table = infer_csv_contents(obj, opts)
21
+ return nil unless labels and table
22
+ df = new(*labels)
23
+ df.import(table)
24
+ df
25
+ end
26
+
27
+ protected
28
+ def infer_csv_contents(obj, opts={})
29
+ contents = File.read(obj) if File.exist?(obj)
30
+ begin
31
+ open(obj) {|f| contents = f.read} unless contents
32
+ rescue
33
+ nil
34
+ end
35
+ contents ||= obj if obj.is_a?(String)
36
+ return nil unless contents
37
+ table = FCSV.parse(contents, default_csv_opts.merge(opts))
38
+ labels = table.shift
39
+ while table.last.empty?
40
+ table.pop
41
+ end
42
+ [labels, table]
43
+ end
44
+
45
+ def default_csv_opts; {:converters => :all}; end
46
+ end
47
+
48
+ # Loads a batch of rows. Expects an array of arrays, else you don't
49
+ # know what you have.
50
+ def import(rows)
51
+ rows.each do |row|
52
+ self.add_item(row)
53
+ end
54
+ end
55
+
56
+ def inspect
57
+ "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
58
+ end
59
+
60
+ # The labels of the data items
61
+ attr_reader :labels
62
+ alias :variables :labels
63
+
64
+ # The items stored in the frame
65
+ attr_reader :items
66
+
67
+ def initialize(*labels)
68
+ @labels = labels.map {|e| e.to_underscore_sym }
69
+ @items = TransposableArray.new
70
+ end
71
+
72
+ def add_item(item)
73
+ self.items << item
74
+ end
75
+ alias :add :add_item
76
+
77
+ def row_labels
78
+ @row_labels ||= []
79
+ end
80
+
81
+ def row_labels=(ary)
82
+ raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
83
+ @row_labels = ary
84
+ end
85
+
86
+ def render_column(sym)
87
+ i = @labels.index(sym)
88
+ return nil unless i
89
+ @items.transpose[i]
90
+ end
91
+
92
+ # The rows as an array of arrays, an alias for items.
93
+ alias :rows :items
94
+
95
+ # The columns as a Dictionary or Hash
96
+ # This is cached, call columns(true) to reset the cache.
97
+ def columns(reset=false)
98
+ @columns = nil if reset
99
+ return @columns if @columns
100
+
101
+ container = defined?(Dictionary) ? Dictionary.new : Hash.new
102
+ i = 0
103
+
104
+ @columns = @items.transpose.inject(container) do |cont, col|
105
+ cont[@labels[i]] = col
106
+ i += 1
107
+ cont
108
+ end
109
+ end
110
+ alias :to_hash :columns
111
+ alias :to_dictionary :columns
112
+
113
+ def render_row(sym)
114
+ i = self.row_labels.index(sym)
115
+ return nil unless i
116
+ @items[i]
117
+ end
118
+
119
+ def method_missing(sym, *args, &block)
120
+ if self.labels.include?(sym)
121
+ render_column(sym)
122
+ elsif self.row_labels.include?(sym)
123
+ render_row(sym)
124
+ elsif @items.respond_to?(sym)
125
+ @items.send(sym, *args, &block)
126
+ else
127
+ super
128
+ end
129
+ end
130
+
131
+ def drop!(*labels)
132
+ labels.each do |label|
133
+ drop_one!(label)
134
+ end
135
+ self
136
+ end
137
+
138
+ def drop_one!(label)
139
+ i = self.labels.index(label)
140
+ return nil unless i
141
+ self.items.each do |item|
142
+ item.delete_at(i)
143
+ end
144
+ self.labels.delete_at(i)
145
+ self
146
+ end
147
+ protected :drop_one!
148
+
149
+ def replace!(column, values=nil, &block)
150
+ column = validate_column(column)
151
+ if not values
152
+ values = self.send(column)
153
+ values.map! {|e| block.call(e)}
154
+ end
155
+ replace_column(column, values)
156
+ self
157
+ end
158
+
159
+ def replace_column(column, values)
160
+ column = validate_column(column)
161
+ index = self.labels.index(column)
162
+ list = []
163
+ self.items.each_with_index do |item, i|
164
+ consolidated = item
165
+ consolidated[index] = values[i]
166
+ list << consolidated
167
+ end
168
+ @items = list.dup
169
+ end
170
+ protected :replace_column
171
+
172
+ def validate_column(column)
173
+ column = column.to_sym
174
+ raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
175
+ column
176
+ end
177
+ protected :validate_column
178
+
179
+ # Takes a block to evaluate on each row. The row can be converted into
180
+ # an OpenStruct or a Hash for easier filter methods. Note, don't try this
181
+ # with a hash or open struct unless you have facets available.
182
+ def filter!(as=Array, &block)
183
+ as = infer_class(as)
184
+ items = []
185
+ self.items.each do |row|
186
+ value = block.call(cast_row(row, as))
187
+ items << row if value
188
+ end
189
+ @items = items.dup
190
+ self
191
+ end
192
+
193
+ def filter(as=Array, &block)
194
+ new_data_frame = self.clone
195
+ new_data_frame.filter!(as, &block)
196
+ end
197
+
198
+ def infer_class(obj)
199
+ obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
200
+ obj = obj.classify.constantize if obj.is_a?(String)
201
+ obj
202
+ end
203
+ protected :infer_class
204
+
205
+ def cast_row(row, as)
206
+ if as == Hash
207
+ obj = {}
208
+ self.labels.each_with_index do |label, i|
209
+ obj[label] = row[i]
210
+ end
211
+ obj
212
+ elsif as == OpenStruct
213
+ obj = OpenStruct.new
214
+ self.labels.each_with_index do |label, i|
215
+ obj.table[label] = row[i]
216
+ end
217
+ obj
218
+ elsif as == Array
219
+ row
220
+ else
221
+ as.new(*row)
222
+ end
223
+ end
224
+ protected :cast_row
225
+
226
+ # Creates a new data frame, only with the specified columns.
227
+ def subset_from_columns(*cols)
228
+ new_labels = self.labels.inject([]) do |list, label|
229
+ list << label if cols.include?(label)
230
+ list
231
+ end
232
+ new_data_frame = DataFrame.new(*self.labels)
233
+ new_data_frame.import(self.items)
234
+ self.labels.each do |label|
235
+ new_data_frame.drop!(label) unless new_labels.include?(label)
236
+ end
237
+ new_data_frame
238
+ end
239
+
240
+ # A weird name. This creates a new column for every category found in a
241
+ # column and marks each row true or false according to its value.
242
+ def j_binary_ize!(*columns)
243
+ # Allows an options hash to be mixed in with the column names.
244
+ options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
245
+ columns.delete_if {|e| e.is_a?(Hash)}
246
+
247
+ # Generates new columns
248
+ columns.each do |col|
249
+ values = render_column(col.to_underscore_sym)
250
+ values.categories.each do |category|
251
+ full_name = (col.to_s + "_" + category.to_s).to_sym
252
+ if options[:allow_overlap]
253
+ category_map = values.inject([]) do |list, e|
254
+ list << values.all_categories(e)
255
+ end
256
+ self.append!(full_name, category_map.map{|e| e.include?(category)})
257
+ else
258
+ self.append!(full_name, values.category_map.map{|e| e == category})
259
+ end
260
+ end
261
+ end
262
+ end
263
+
264
+ # Adds a unique column to the table
265
+ def append!(column_name, value=nil)
266
+ raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
267
+ self.labels << column_name.to_underscore_sym
268
+ if value.is_a?(Array)
269
+ self.items.each_with_index do |item, i|
270
+ item << value[i]
271
+ end
272
+ else
273
+ self.items.each do |item|
274
+ item << value
275
+ end
276
+ end
277
+ # Because we are modifying the sub arrays in place, the TransposableArray doesn't know it's been changed.
278
+ self.items.taint
279
+ end
280
+
281
+ def filter_by_category(hash)
282
+ new_data_frame = self.dup
283
+ hash.each do |key, value|
284
+ key = key.to_underscore_sym
285
+ next unless self.labels.include?(key)
286
+ value = [value] unless value.is_a?(Array) or value.is_a?(Range)
287
+ new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
288
+ end
289
+ new_data_frame
290
+ end
291
+
292
+ def filter_by_category!(hash)
293
+ hash.each do |key, value|
294
+ key = key.to_underscore_sym
295
+ next unless self.labels.include?(key)
296
+ value = [value] unless value.is_a?(Array) or value.is_a?(Range)
297
+ self.filter!(:hash) {|row| value.include?(row[key])}
298
+ end
299
+ end
300
+
301
+ end