davidrichards-data_frame 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
module DF #:nodoc:
  module Saving #:nodoc:

    # Writes the data frame to +filename+ as CSV, optionally narrowing it first.
    #
    # Options (all optional):
    #   :only / :subset              - column name(s) passed to subset_from_columns
    #   :filter / :filter_by_category - hash passed to filter_by_category
    #   :include_header              - forwarded to to_csv, defaults to true
    #
    # Examples:
    #   df.save('/tmp/some_filename.csv')
    #   df.save('/tmp/some_filename.csv', :include_header => false)
    #   df.save('/tmp/some_filename.csv', :only => [:list, :of, :columns])
    #   df.save('/tmp/some_filename.csv',
    #     :filter => {:column_name => :category_value,
    #                 :another_column_name => (range..values)})
    #
    # Returns whatever the file's write call returns.
    def save(filename, opts={})
      frame = self

      # Column restrictions are applied first, :only before :subset.
      [:only, :subset].each do |key|
        frame = frame.subset_from_columns(*Array(opts[key])) if opts[key]
      end

      # Row filters come next, :filter before :filter_by_category.
      [:filter, :filter_by_category].each do |key|
        frame = frame.filter_by_category(opts[key]) if opts[key]
      end

      File.open(filename, "w") do |file|
        file.write(frame.to_csv(opts.fetch(:include_header, true)))
      end
    end

  end
end

class DataFrame
  include DF::Saving
end
@@ -0,0 +1,36 @@
1
module Training #:nodoc:

  # Returns a cached random sample of rows from +items+.
  #
  # Options:
  #   :reset      - discard the cached sample and draw a new one
  #   :n          - number of rows to draw; clamped to 0..items.size
  #   :proportion - fraction of the rows to draw when :n is not given (default 0.8)
  #
  # Rows are sampled by position, without replacement, so the result always
  # holds exactly the requested number of rows. The previous implementation
  # drew with replacement and de-duplicated by value, which never terminated
  # when the frame held fewer than n distinct-valued rows.
  def training_set(opts={})
    @training_set = nil if opts[:reset]
    return @training_set if @training_set

    total = self.items.size
    count = opts[:n] || (total * opts.fetch(:proportion, 0.8)).to_i
    count = [[count, total].min, 0].max

    # sort_by { rand } is a shuffle that works on every Ruby version; ceil
    # keeps the old behaviour of rounding a fractional :n upwards.
    positions = (0...total).sort_by { rand }.first(count.ceil)
    @training_set = positions.map { |position| self.items[position] }
  end

  protected

    # Returns one row chosen at random from the first +n+ items.
    # Kept for compatibility; training_set no longer relies on it.
    def random_next(n)
      self.items[rand(n)]
    end

end

class DataFrame
  include Training
end
@@ -4,70 +4,6 @@
4
4
  # is tainted.
5
5
  class DataFrame
6
6
 
7
- class << self
8
-
9
- # This is the neatest part of this neat gem.
10
- # DataFrame.from_csv can be called in a lot of ways:
11
- # DataFrame.from_csv(csv_contents)
12
- # DataFrame.from_csv(filename)
13
- # DataFrame.from_csv(url)
14
- # If you need to define converters for FasterCSV, do it before calling
15
- # this method:
16
- # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
17
- # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
18
- # This returns bar where 'foo' was found and 'foo' everywhere else.
19
- def from_csv(obj, opts={})
20
- labels, table = infer_csv_contents(obj, opts)
21
- name = infer_name_from_contents(obj, opts)
22
- return nil unless labels and table
23
- df = new(*labels)
24
- df.import(table)
25
- df.name = name
26
- df
27
- end
28
-
29
- protected
30
-
31
- # Only works for names sources, urls and files
32
- def infer_name_from_contents(obj, opts={})
33
- begin
34
- File.split(obj).last.split('.')[0..-2].join('.').titleize
35
- rescue
36
- nil
37
- end
38
- end
39
-
40
- def infer_csv_contents(obj, opts={})
41
- contents = File.read(obj) if File.exist?(obj)
42
- begin
43
- open(obj) {|f| contents = f.read} unless contents
44
- rescue
45
- nil
46
- end
47
- contents ||= obj if obj.is_a?(String)
48
- return nil unless contents
49
- table = FCSV.parse(contents, default_csv_opts.merge(opts))
50
- labels = table.shift
51
- while table.last.empty?
52
- table.pop
53
- end
54
- [labels, table]
55
- end
56
-
57
- def default_csv_opts; {:converters => :all}; end
58
- end
59
-
60
- # Include the methods from arff.rb
61
- include ARFF
62
-
63
- # Loads a batch of rows. Expects an array of arrays, else you don't
64
- # know what you have.
65
- def import(rows)
66
- rows.each do |row|
67
- self.add_item(row)
68
- end
69
- end
70
-
71
7
  def inspect
72
8
  "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
73
9
  end
@@ -83,15 +19,11 @@ class DataFrame
83
19
  attr_accessor :name
84
20
 
85
21
  def initialize(*labels)
22
+ labels = labels.first if labels.size == 1 and labels.first.is_a?(Array)
86
23
  @labels = labels.map {|e| e.to_underscore_sym }
87
24
  @items = TransposableArray.new
88
25
  end
89
26
 
90
- def add_item(item)
91
- self.items << item
92
- end
93
- alias :add :add_item
94
-
95
27
  def row_labels
96
28
  @row_labels ||= []
97
29
  end
@@ -101,15 +33,22 @@ class DataFrame
101
33
  @row_labels = ary
102
34
  end
103
35
 
36
+ # The rows as an array of arrays, an alias for items.
37
+ alias :rows :items
38
+
39
# Returns the row stored at the position of +sym+ in row_labels,
# or nil when no row carries that label.
def render_row(sym)
  position = row_labels.index(sym)
  position ? @items[position] : nil
end
44
+
45
+ # Return the column, given its name
104
46
  def render_column(sym)
105
- i = @labels.index(sym)
47
+ i = @labels.index(sym.to_underscore_sym)
106
48
  return nil unless i
107
49
  @items.transpose[i]
108
50
  end
109
51
 
110
- # The rows as an array of arrays, an alias for items.
111
- alias :rows :items
112
-
113
52
  # The columns as a Dictionary or Hash
114
53
  # This is cached, call columns(true) to reset the cache.
115
54
  def columns(reset=false)
@@ -128,12 +67,6 @@ class DataFrame
128
67
  alias :to_hash :columns
129
68
  alias :to_dictionary :columns
130
69
 
131
- def render_row(sym)
132
- i = self.row_labels.index(sym)
133
- return nil unless i
134
- @items[i]
135
- end
136
-
137
70
  def method_missing(sym, *args, &block)
138
71
  if self.labels.include?(sym)
139
72
  render_column(sym)
@@ -146,174 +79,37 @@ class DataFrame
146
79
  end
147
80
  end
148
81
 
149
- def drop!(*labels)
150
- labels.each do |label|
151
- drop_one!(label)
152
- end
153
- self
154
- end
155
-
156
- def drop_one!(label)
157
- i = self.labels.index(label)
158
- return nil unless i
159
- self.items.each do |item|
160
- item.delete_at(i)
161
- end
162
- self.labels.delete_at(i)
163
- self
164
- end
165
- protected :drop_one!
166
-
167
- def replace!(column, values=nil, &block)
168
- column = validate_column(column)
169
- if not values
170
- values = self.send(column)
171
- values.map! {|e| block.call(e)}
172
- end
173
- replace_column(column, values)
174
- self
175
- end
176
-
177
- def replace_column(column, values)
178
- column = validate_column(column)
179
- index = self.labels.index(column)
180
- list = []
181
- self.items.each_with_index do |item, i|
182
- consolidated = item
183
- consolidated[index] = values[i]
184
- list << consolidated
185
- end
186
- @items = list.dup
187
- end
188
- protected :replace_column
82
+ protected
189
83
 
190
- def validate_column(column)
191
- column = column.to_sym
192
- raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
193
- column
194
- end
195
- protected :validate_column
196
-
197
- # Takes a block to evaluate on each row. The row can be converted into
198
- # an OpenStruct or a Hash for easier filter methods. Note, don't try this
199
- # with a hash or open struct unless you have facets available.
200
- def filter!(as=Array, &block)
201
- as = infer_class(as)
202
- items = []
203
- self.items.each do |row|
204
- value = block.call(cast_row(row, as))
205
- items << row if value
84
# Converts +column+ to a Symbol and returns it when it is one of the labels.
# Raises ArgumentError naming the valid labels otherwise.
def validate_column(column)
  column = column.to_sym
  return column if self.labels.include?(column)

  raise ArgumentError, "Must provide the name of an existing column. Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}"
end
207
- @items = items.dup
208
- self
209
- end
210
-
211
- def filter(as=Array, &block)
212
- new_data_frame = self.clone
213
- new_data_frame.filter!(as, &block)
214
- end
215
-
216
- def infer_class(obj)
217
- obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
218
- obj = obj.classify.constantize if obj.is_a?(String)
219
- obj
220
- end
221
- protected :infer_class
222
-
223
- def cast_row(row, as)
224
- if as == Hash
225
- obj = {}
226
- self.labels.each_with_index do |label, i|
227
- obj[label] = row[i]
228
- end
229
- obj
230
- elsif as == OpenStruct
231
- obj = OpenStruct.new
232
- self.labels.each_with_index do |label, i|
233
- obj.table[label] = row[i]
234
- end
89
+
90
# Resolves a Symbol or String to a constant via classify.constantize.
# Anything else is returned unchanged.
def infer_class(obj)
  resolved = obj.is_a?(Symbol) ? obj.to_s.classify.constantize : obj
  resolved.is_a?(String) ? resolved.classify.constantize : resolved
end
241
- end
242
- protected :cast_row
243
-
244
- # Creates a new data frame, only with the specified columns.
245
- def subset_from_columns(*cols)
246
- new_labels = self.labels.inject([]) do |list, label|
247
- list << label if cols.include?(label)
248
- list
249
- end
250
- new_data_frame = DataFrame.new(*self.labels)
251
- new_data_frame.import(self.items)
252
- self.labels.each do |label|
253
- new_data_frame.drop!(label) unless new_labels.include?(label)
254
- end
255
- new_data_frame
256
- end
257
-
258
- # A weird name. This creates a column for every category in a column
259
- # and marks each row by its value
260
- def j_binary_ize!(*columns)
261
- # Allows to mix a hash with the columns.
262
- options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
263
- columns.delete_if {|e| e.is_a?(Hash)}
264
95
 
265
- # Generates new columns
266
- columns.each do |col|
267
- values = render_column(col.to_underscore_sym)
268
- values.categories.each do |category|
269
- full_name = (col.to_s + "_" + category.to_s).to_sym
270
- if options[:allow_overlap]
271
- category_map = values.inject([]) do |list, e|
272
- list << values.all_categories(e)
273
- end
274
- self.append!(full_name, category_map.map{|e| e.include?(category)})
275
- else
276
- self.append!(full_name, values.category_map.map{|e| e == category})
96
# Converts a raw row into the representation requested by +as+:
#
#   Array      - the row itself, untouched
#   Hash       - { label => value } for every label
#   OpenStruct - an OpenStruct with one field per label
#   other      - as.new(*row)
#
# The OpenStruct is built through its constructor. The earlier code wrote to
# obj.table, which is a protected method of OpenStruct and raises
# NoMethodError when called from another class. Array is checked first so the
# plain case never needs the OpenStruct constant to be loaded.
def cast_row(row, as)
  return row if as == Array

  wants_struct = defined?(OpenStruct) && as == OpenStruct
  return as.new(*row) unless as == Hash || wants_struct

  # Labels without a matching value map to nil, as row[i] does past the end.
  fields = {}
  self.labels.each_with_index { |label, i| fields[label] = row[i] }
  wants_struct ? OpenStruct.new(fields) : fields
end
280
- end
281
-
282
- # Adds a unique column to the table
283
- def append!(column_name, value=nil)
284
- raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
285
- self.labels << column_name.to_underscore_sym
286
- if value.is_a?(Array)
287
- self.items.each_with_index do |item, i|
288
- item << value[i]
289
- end
290
- else
291
- self.items.each do |item|
292
- item << value
293
- end
294
- end
295
- # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
296
- self.items.taint
297
- end
298
-
299
- def filter_by_category(hash)
300
- new_data_frame = self.dup
301
- hash.each do |key, value|
302
- key = key.to_underscore_sym
303
- next unless self.labels.include?(key)
304
- value = [value] unless value.is_a?(Array) or value.is_a?(Range)
305
- new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
306
- end
307
- new_data_frame
308
- end
309
-
310
- def filter_by_category!(hash)
311
- hash.each do |key, value|
312
- key = key.to_underscore_sym
313
- next unless self.labels.include?(key)
314
- value = [value] unless value.is_a?(Array) or value.is_a?(Range)
315
- self.filter!(:hash) {|row| value.include?(row[key])}
316
- end
317
- end
318
-
319
115
  end
@@ -0,0 +1,28 @@
1
module DF #:nodoc:
  # Builds an ID3 decision tree with Ilya Grigorik's decisiontree gem.
  #
  # The gem is loaded when this file is loaded if it is installed. When it is
  # missing the module still loads, and create_id3 raises a LoadError that
  # says how to install it. The earlier code tried to shell out to
  # `sudo gem install` from a bare rescue, but a bare rescue only catches
  # StandardError and so never saw the LoadError.
  module ID3
    begin
      gem 'decisiontree'
      require 'decisiontree'
    rescue LoadError
      # Deliberately quiet: the failure is reported by create_id3 on first use.
    end

    # Builds, stores and returns an ID3 tree from the labels and training data.
    #
    # opts[:default] is passed to the tree as its default value (1 if omitted).
    # Raises LoadError when the decisiontree gem is not available.
    #
    # NOTE(review): dependent_column is accepted but not used yet; the original
    # comment says it still needs to be moved into the last column.
    # NOTE(review): relies on a training_data method that is not defined in
    # the files shown here -- confirm it exists (Training offers training_set).
    def create_id3(dependent_column, opts={})
      unless defined?(DecisionTree::ID3Tree)
        raise LoadError, "the decisiontree gem is required for create_id3; install it with `gem install decisiontree`"
      end

      default = opts.fetch(:default, 1)
      @id3 = DecisionTree::ID3Tree.new(self.labels, self.training_data, default, :discrete)
    end

    # Returns the tree built by the last create_id3 call, or nil if none was built.
    def id3
      @id3
    end
  end
end

class DataFrame
  include DF::ID3
end
@@ -0,0 +1,10 @@
1
+ module DF #:nodoc:
2
+ # Placeholder for KMeans clustering of the data set; the module defines no methods yet.
3
+ module KMeans
4
+
5
+ end
6
+ end
7
+
8
+ class DataFrame
9
+ include DF::KMeans
10
+ end
@@ -0,0 +1,48 @@
1
+ # The University of California - Irvine has a great set of machine
2
+ # learning sample data sets. Their data description pages have field
3
+ # label descriptors. This class extracts them and returns a DataFrame
4
+ # with the labels of a data set.
5
+
6
+ # Turns out, this isn't very useful. So...oh well.
7
+ # By the way, the data sets I'm talking about are found here: http://archive.ics.uci.edu/ml/
8
+ # And to use this class:
9
+ # require 'lib/data_frame/labels_from_uci'
10
+ # df = LabelsFromUCI.data_frame 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
11
+ # df.import('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data')
12
+
13
require 'open-uri'

# Extracts attribute names from a description file: every line matching
# "@attribute <name>" contributes <name> to the label list.
class LabelsFromUCI

  class << self
    # Returns the labels found at +url+ as an Array of Strings.
    def process(url)
      new(url).labels
    end

    # Returns a DataFrame whose labels are the ones found at +url+.
    def data_frame(url)
      DataFrame.new(new(url).labels)
    end
  end

  attr_reader :url, :contents, :labels

  # Reads +url+ (an http, https or ftp URL, or a local file path) and
  # extracts its labels straight away.
  def initialize(url)
    @url = url
    @contents = read_source(url)
    process_labels
  end

  protected

    # Fetches the raw text. Kernel#open is avoided on purpose: it runs any
    # string that starts with "|" as a shell command, and on Ruby 3+ it no
    # longer opens URLs.
    def read_source(source)
      if source.to_s =~ %r{\A(?:https?|ftp)://}i
        URI.parse(source.to_s).read
      else
        File.open(source) { |file| file.read }
      end
    end

    # Collects the first capture of label_re from each line that matches.
    def process_labels
      @labels = @contents.each_line.map { |line| line[label_re, 1] }.compact
    end

    # Matches "@attribute name" and captures the name.
    def label_re
      /@attribute (\w+)/
    end
end