davidrichards-data_frame 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +16 -0
 - data/VERSION.yml +1 -1
 - data/bin/plain_frame +22 -0
 - data/lib/data_frame.rb +2 -1
 - data/lib/data_frame/arff.rb +43 -36
 - data/lib/data_frame/core/column_management.rb +102 -0
 - data/lib/data_frame/core/filter.rb +48 -0
 - data/lib/data_frame/core/import.rb +112 -0
 - data/lib/data_frame/core/pre_process.rb +61 -0
 - data/lib/data_frame/core/saving.rb +29 -0
 - data/lib/data_frame/core/training.rb +36 -0
 - data/lib/data_frame/data_frame.rb +37 -241
 - data/lib/data_frame/id3.rb +28 -0
 - data/lib/data_frame/kmeans.rb +10 -0
 - data/lib/data_frame/labels_from_uci.rb +48 -0
 - data/lib/data_frame/mlp.rb +18 -0
 - data/lib/data_frame/sbn.rb +18 -0
 - data/lib/data_frame/transposable_array.rb +1 -1
 - data/lib/ext/array.rb +11 -0
 - data/spec/data_frame/arff_spec.rb +1 -0
 - data/spec/data_frame/core/column_management_spec.rb +97 -0
 - data/spec/data_frame/core/filter_spec.rb +88 -0
 - data/spec/data_frame/core/import_spec.rb +41 -0
 - data/spec/data_frame/core/pre_process_spec.rb +71 -0
 - data/spec/data_frame/core/saving_spec.rb +61 -0
 - data/spec/data_frame/core/training_spec.rb +51 -0
 - data/spec/data_frame/data_frame_spec.rb +10 -226
 - data/spec/data_frame/id3_spec.rb +22 -0
 - data/spec/ext/array_spec.rb +13 -0
 - data/spec/fixtures/discrete_testing.csv +4 -0
 - data/spec/fixtures/discrete_training.csv +21 -0
 - metadata +33 -6
 
| 
         @@ -0,0 +1,29 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module DF #:nodoc:
              module Saving #:nodoc:

                # Writes the data frame to +filename+ as CSV, overwriting the file.
                #
                # Recognized options (all optional):
                # * :include_header     - passed to to_csv; defaults to true.
                # * :only, :subset      - column name(s) handed to subset_from_columns.
                # * :filter,
                #   :filter_by_category - argument handed to filter_by_category.
                #
                # The options are applied in the order listed above (:only, :subset,
                # :filter, :filter_by_category), each one to the result of the previous.
                #
                # Examples:
                # df.save('/tmp/some_filename.csv')
                # df.save('/tmp/some_filename.csv', :include_header => false) # No header information is saved
                # df.save('/tmp/some_filename.csv', :only => [:list, :of, :columns])
                # df.save('/tmp/some_filename.csv', :subset => [:list, :of, :columns])
                # df.save('/tmp/some_filename.csv',
                #   :filter => {:column_name => :category_value,
                #     :another_column_name => (range..values)}) # Filter by category
                def save(filename, opts={})
                  frame = self

                  # Column restrictions first...
                  [:only, :subset].each do |key|
                    frame = frame.subset_from_columns(*Array(opts[key])) if opts[key]
                  end

                  # ...then row filters.
                  [:filter, :filter_by_category].each do |key|
                    frame = frame.filter_by_category(opts[key]) if opts[key]
                  end

                  with_header = opts.fetch(:include_header, true)
                  File.open(filename, "w") do |io|
                    io.write(frame.to_csv(with_header))
                  end
                end

              end
            end

            class DataFrame
              include DF::Saving
            end
         
     | 
| 
         @@ -0,0 +1,36 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Training #:nodoc:

              # Returns a random sample of rows from +items+, chosen without
              # replacement, and caches it in @training_set.
              #
              # Options:
              # * :reset      - when truthy, discard the cached set and draw again.
              # * :n          - number of rows to draw.
              # * :proportion - fraction of the rows to draw when :n is not given
              #                 (defaults to 0.8).
              #
              # The requested count is clamped to the range 0..items.size.
              #
              # Rows are drawn by distinct position rather than by value, so a
              # frame containing duplicate rows still yields the requested number
              # of rows. Comparing drawn rows by value could never reach the
              # target when it exceeded the number of distinct rows.
              def training_set(opts={})
                @training_set = nil if opts[:reset]
                return @training_set if @training_set

                rows = self.items
                items_size = rows.size
                proportion = opts.fetch(:proportion, 0.8)
                n = opts[:n] || (items_size * proportion).to_i
                n = items_size if n > items_size
                n = 0 if n < 0

                # Collect distinct random indices; n <= items_size guarantees
                # the loop terminates.
                taken = {}
                picked = []
                while n > picked.size
                  i = rand(items_size)
                  next if taken[i]
                  taken[i] = true
                  picked << i
                end

                @training_set = picked.map { |i| rows[i] }
              end

              protected
                # Returns the item at a random index below +n+.
                def random_next(n)
                  self.items[rand(n)]
                end

            end

            class DataFrame
              include Training
            end
         
     | 
| 
         @@ -4,70 +4,6 @@ 
     | 
|
| 
       4 
4 
     | 
    
         
             
            # is tainted. 
         
     | 
| 
       5 
5 
     | 
    
         
             
            class DataFrame
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
     | 
    
         
            -
              class << self
         
     | 
| 
       8 
     | 
    
         
            -
                
         
     | 
| 
       9 
     | 
    
         
            -
                # This is the neatest part of this neat gem.
         
     | 
| 
       10 
     | 
    
         
            -
                # DataFrame.from_csv can be called in a lot of ways:
         
     | 
| 
       11 
     | 
    
         
            -
                # DataFrame.from_csv(csv_contents)
         
     | 
| 
       12 
     | 
    
         
            -
                # DataFrame.from_csv(filename)
         
     | 
| 
       13 
     | 
    
         
            -
                # DataFrame.from_csv(url)
         
     | 
| 
       14 
     | 
    
         
            -
                # If you need to define converters for FasterCSV, do it before calling
         
     | 
| 
       15 
     | 
    
         
            -
                # this method: 
         
     | 
| 
       16 
     | 
    
         
            -
                # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
         
     | 
| 
       17 
     | 
    
         
            -
                # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
         
     | 
| 
       18 
     | 
    
         
            -
                # This returns bar where 'foo' was found and 'foo' everywhere else.
         
     | 
| 
       19 
     | 
    
         
            -
                def from_csv(obj, opts={})
         
     | 
| 
       20 
     | 
    
         
            -
                  labels, table = infer_csv_contents(obj, opts)
         
     | 
| 
       21 
     | 
    
         
            -
                  name = infer_name_from_contents(obj, opts)
         
     | 
| 
       22 
     | 
    
         
            -
                  return nil unless labels and table
         
     | 
| 
       23 
     | 
    
         
            -
                  df = new(*labels)
         
     | 
| 
       24 
     | 
    
         
            -
                  df.import(table)
         
     | 
| 
       25 
     | 
    
         
            -
                  df.name = name
         
     | 
| 
       26 
     | 
    
         
            -
                  df
         
     | 
| 
       27 
     | 
    
         
            -
                end
         
     | 
| 
       28 
     | 
    
         
            -
                
         
     | 
| 
       29 
     | 
    
         
            -
                protected
         
     | 
| 
       30 
     | 
    
         
            -
                
         
     | 
| 
       31 
     | 
    
         
            -
                  # Only works for named sources, urls and files
         
     | 
| 
       32 
     | 
    
         
            -
                  def infer_name_from_contents(obj, opts={})
         
     | 
| 
       33 
     | 
    
         
            -
                    begin
         
     | 
| 
       34 
     | 
    
         
            -
                      File.split(obj).last.split('.')[0..-2].join('.').titleize
         
     | 
| 
       35 
     | 
    
         
            -
                    rescue
         
     | 
| 
       36 
     | 
    
         
            -
                      nil
         
     | 
| 
       37 
     | 
    
         
            -
                    end
         
     | 
| 
       38 
     | 
    
         
            -
                  end
         
     | 
| 
       39 
     | 
    
         
            -
                  
         
     | 
| 
       40 
     | 
    
         
            -
                  def infer_csv_contents(obj, opts={})
         
     | 
| 
       41 
     | 
    
         
            -
                    contents = File.read(obj) if File.exist?(obj)
         
     | 
| 
       42 
     | 
    
         
            -
                    begin
         
     | 
| 
       43 
     | 
    
         
            -
                      open(obj) {|f| contents = f.read} unless contents
         
     | 
| 
       44 
     | 
    
         
            -
                    rescue
         
     | 
| 
       45 
     | 
    
         
            -
                      nil
         
     | 
| 
       46 
     | 
    
         
            -
                    end
         
     | 
| 
       47 
     | 
    
         
            -
                    contents ||= obj if obj.is_a?(String)
         
     | 
| 
       48 
     | 
    
         
            -
                    return nil unless contents
         
     | 
| 
       49 
     | 
    
         
            -
                    table = FCSV.parse(contents, default_csv_opts.merge(opts))
         
     | 
| 
       50 
     | 
    
         
            -
                    labels = table.shift
         
     | 
| 
       51 
     | 
    
         
            -
                    while table.last.empty?
         
     | 
| 
       52 
     | 
    
         
            -
                      table.pop
         
     | 
| 
       53 
     | 
    
         
            -
                    end
         
     | 
| 
       54 
     | 
    
         
            -
                    [labels, table]
         
     | 
| 
       55 
     | 
    
         
            -
                  end
         
     | 
| 
       56 
     | 
    
         
            -
                  
         
     | 
| 
       57 
     | 
    
         
            -
                  def default_csv_opts; {:converters => :all}; end
         
     | 
| 
       58 
     | 
    
         
            -
              end
         
     | 
| 
       59 
     | 
    
         
            -
              
         
     | 
| 
       60 
     | 
    
         
            -
              # Include the methods from arff.rb
         
     | 
| 
       61 
     | 
    
         
            -
              include ARFF
         
     | 
| 
       62 
     | 
    
         
            -
              
         
     | 
| 
       63 
     | 
    
         
            -
              # Loads a batch of rows.  Expects an array of arrays, else you don't
         
     | 
| 
       64 
     | 
    
         
            -
              # know what you have. 
         
     | 
| 
       65 
     | 
    
         
            -
              def import(rows)
         
     | 
| 
       66 
     | 
    
         
            -
                rows.each do |row|
         
     | 
| 
       67 
     | 
    
         
            -
                  self.add_item(row)
         
     | 
| 
       68 
     | 
    
         
            -
                end
         
     | 
| 
       69 
     | 
    
         
            -
              end
         
     | 
| 
       70 
     | 
    
         
            -
              
         
     | 
| 
       71 
7 
     | 
    
         
             
              def inspect
         
     | 
| 
       72 
8 
     | 
    
         
             
                "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
         
     | 
| 
       73 
9 
     | 
    
         
             
              end
         
     | 
| 
         @@ -83,15 +19,11 @@ class DataFrame 
     | 
|
| 
       83 
19 
     | 
    
         
             
              attr_accessor :name
         
     | 
| 
       84 
20 
     | 
    
         | 
| 
       85 
21 
     | 
    
         
             
              def initialize(*labels)
         
     | 
| 
      
 22 
     | 
    
         
            +
                labels = labels.first if labels.size == 1 and labels.first.is_a?(Array)
         
     | 
| 
       86 
23 
     | 
    
         
             
                @labels = labels.map {|e| e.to_underscore_sym }
         
     | 
| 
       87 
24 
     | 
    
         
             
                @items = TransposableArray.new
         
     | 
| 
       88 
25 
     | 
    
         
             
              end
         
     | 
| 
       89 
26 
     | 
    
         | 
| 
       90 
     | 
    
         
            -
              def add_item(item)
         
     | 
| 
       91 
     | 
    
         
            -
                self.items << item
         
     | 
| 
       92 
     | 
    
         
            -
              end
         
     | 
| 
       93 
     | 
    
         
            -
              alias :add :add_item
         
     | 
| 
       94 
     | 
    
         
            -
              
         
     | 
| 
       95 
27 
     | 
    
         
             
              def row_labels
         
     | 
| 
       96 
28 
     | 
    
         
             
                @row_labels ||= []
         
     | 
| 
       97 
29 
     | 
    
         
             
              end
         
     | 
| 
         @@ -101,15 +33,22 @@ class DataFrame 
     | 
|
| 
       101 
33 
     | 
    
         
             
                @row_labels = ary
         
     | 
| 
       102 
34 
     | 
    
         
             
              end
         
     | 
| 
       103 
35 
     | 
    
         | 
| 
      
 36 
     | 
    
         
            +
              # The rows as an array of arrays, an alias for items.
         
     | 
| 
      
 37 
     | 
    
         
            +
              alias :rows :items
         
     | 
| 
      
 38 
     | 
    
         
            +
              
         
     | 
| 
      
 39 
     | 
    
         
            +
              # Return the row whose row label is +sym+, or nil when no row
              # label matches.
              def render_row(sym)
                position = self.row_labels.index(sym)
                position ? @items[position] : nil
              end
         
     | 
| 
      
 44 
     | 
    
         
            +
              
         
     | 
| 
      
 45 
     | 
    
         
            +
              # Return the column, given its name
         
     | 
| 
       104 
46 
     | 
    
         
             
              def render_column(sym)
         
     | 
| 
       105 
     | 
    
         
            -
                i = @labels.index(sym)
         
     | 
| 
      
 47 
     | 
    
         
            +
                i = @labels.index(sym.to_underscore_sym)
         
     | 
| 
       106 
48 
     | 
    
         
             
                return nil unless i
         
     | 
| 
       107 
49 
     | 
    
         
             
                @items.transpose[i]
         
     | 
| 
       108 
50 
     | 
    
         
             
              end
         
     | 
| 
       109 
51 
     | 
    
         | 
| 
       110 
     | 
    
         
            -
              # The rows as an array of arrays, an alias for items.
         
     | 
| 
       111 
     | 
    
         
            -
              alias :rows :items
         
     | 
| 
       112 
     | 
    
         
            -
              
         
     | 
| 
       113 
52 
     | 
    
         
             
              # The columns as a Dictionary or Hash
         
     | 
| 
       114 
53 
     | 
    
         
             
              # This is cached, call columns(true) to reset the cache.
         
     | 
| 
       115 
54 
     | 
    
         
             
              def columns(reset=false)
         
     | 
| 
         @@ -128,12 +67,6 @@ class DataFrame 
     | 
|
| 
       128 
67 
     | 
    
         
             
              alias :to_hash :columns
         
     | 
| 
       129 
68 
     | 
    
         
             
              alias :to_dictionary :columns
         
     | 
| 
       130 
69 
     | 
    
         | 
| 
       131 
     | 
    
         
            -
              def render_row(sym)
         
     | 
| 
       132 
     | 
    
         
            -
                i = self.row_labels.index(sym)
         
     | 
| 
       133 
     | 
    
         
            -
                return nil unless i
         
     | 
| 
       134 
     | 
    
         
            -
                @items[i]
         
     | 
| 
       135 
     | 
    
         
            -
              end
         
     | 
| 
       136 
     | 
    
         
            -
              
         
     | 
| 
       137 
70 
     | 
    
         
             
              def method_missing(sym, *args, &block)
         
     | 
| 
       138 
71 
     | 
    
         
             
                if self.labels.include?(sym)
         
     | 
| 
       139 
72 
     | 
    
         
             
                  render_column(sym)
         
     | 
| 
         @@ -146,174 +79,37 @@ class DataFrame 
     | 
|
| 
       146 
79 
     | 
    
         
             
                end
         
     | 
| 
       147 
80 
     | 
    
         
             
              end
         
     | 
| 
       148 
81 
     | 
    
         | 
| 
       149 
     | 
    
         
            -
               
     | 
| 
       150 
     | 
    
         
            -
                labels.each do |label|
         
     | 
| 
       151 
     | 
    
         
            -
                  drop_one!(label)
         
     | 
| 
       152 
     | 
    
         
            -
                end
         
     | 
| 
       153 
     | 
    
         
            -
                self
         
     | 
| 
       154 
     | 
    
         
            -
              end
         
     | 
| 
       155 
     | 
    
         
            -
              
         
     | 
| 
       156 
     | 
    
         
            -
              def drop_one!(label)
         
     | 
| 
       157 
     | 
    
         
            -
                i = self.labels.index(label)
         
     | 
| 
       158 
     | 
    
         
            -
                return nil unless i
         
     | 
| 
       159 
     | 
    
         
            -
                self.items.each do |item|
         
     | 
| 
       160 
     | 
    
         
            -
                  item.delete_at(i)
         
     | 
| 
       161 
     | 
    
         
            -
                end
         
     | 
| 
       162 
     | 
    
         
            -
                self.labels.delete_at(i)
         
     | 
| 
       163 
     | 
    
         
            -
                self
         
     | 
| 
       164 
     | 
    
         
            -
              end
         
     | 
| 
       165 
     | 
    
         
            -
              protected :drop_one!
         
     | 
| 
       166 
     | 
    
         
            -
              
         
     | 
| 
       167 
     | 
    
         
            -
              def replace!(column, values=nil, &block)
         
     | 
| 
       168 
     | 
    
         
            -
                column = validate_column(column)
         
     | 
| 
       169 
     | 
    
         
            -
                if not values
         
     | 
| 
       170 
     | 
    
         
            -
                  values = self.send(column)
         
     | 
| 
       171 
     | 
    
         
            -
                  values.map! {|e| block.call(e)}
         
     | 
| 
       172 
     | 
    
         
            -
                end
         
     | 
| 
       173 
     | 
    
         
            -
                replace_column(column, values)
         
     | 
| 
       174 
     | 
    
         
            -
                self
         
     | 
| 
       175 
     | 
    
         
            -
              end
         
     | 
| 
       176 
     | 
    
         
            -
              
         
     | 
| 
       177 
     | 
    
         
            -
              def replace_column(column, values)
         
     | 
| 
       178 
     | 
    
         
            -
                column = validate_column(column)
         
     | 
| 
       179 
     | 
    
         
            -
                index = self.labels.index(column)
         
     | 
| 
       180 
     | 
    
         
            -
                list = []
         
     | 
| 
       181 
     | 
    
         
            -
                self.items.each_with_index do |item, i|
         
     | 
| 
       182 
     | 
    
         
            -
                  consolidated = item
         
     | 
| 
       183 
     | 
    
         
            -
                  consolidated[index] = values[i]
         
     | 
| 
       184 
     | 
    
         
            -
                  list << consolidated
         
     | 
| 
       185 
     | 
    
         
            -
                end
         
     | 
| 
       186 
     | 
    
         
            -
                @items = list.dup
         
     | 
| 
       187 
     | 
    
         
            -
              end
         
     | 
| 
       188 
     | 
    
         
            -
              protected :replace_column
         
     | 
| 
      
 82 
     | 
    
         
            +
              protected
         
     | 
| 
       189 
83 
     | 
    
         | 
| 
       190 
     | 
    
         
            -
             
     | 
| 
       191 
     | 
    
         
            -
             
     | 
| 
       192 
     | 
    
         
            -
             
     | 
| 
       193 
     | 
    
         
            -
             
     | 
| 
       194 
     | 
    
         
            -
              end
         
     | 
| 
       195 
     | 
    
         
            -
              protected :validate_column
         
     | 
| 
       196 
     | 
    
         
            -
              
         
     | 
| 
       197 
     | 
    
         
            -
              # Takes a block to evaluate on each row.  The row can be converted into
         
     | 
| 
       198 
     | 
    
         
            -
              # an OpenStruct or a Hash for easier filter methods. Note, don't try this
         
     | 
| 
       199 
     | 
    
         
            -
              # with a hash or open struct unless you have facets available.
         
     | 
| 
       200 
     | 
    
         
            -
              def filter!(as=Array, &block)
         
     | 
| 
       201 
     | 
    
         
            -
                as = infer_class(as)
         
     | 
| 
       202 
     | 
    
         
            -
                items = []
         
     | 
| 
       203 
     | 
    
         
            -
                self.items.each do |row|
         
     | 
| 
       204 
     | 
    
         
            -
                  value = block.call(cast_row(row, as))
         
     | 
| 
       205 
     | 
    
         
            -
                  items << row if value
         
     | 
| 
      
 84 
     | 
    
         
            +
                # Converts +column+ to a symbol and returns it if it names one of
                # this frame's labels; raises ArgumentError otherwise.
                def validate_column(column)
                  name = column.to_sym
                  return name if self.labels.include?(name)
                  raise ArgumentError, "Must provide the name of an existing column.  Provided #{name.inspect}, needed to provide one of #{self.labels.inspect}"
                end
         
     | 
| 
       207 
     | 
    
         
            -
                 
     | 
| 
       208 
     | 
    
         
            -
                 
     | 
| 
       209 
     | 
    
         
            -
             
     | 
| 
       210 
     | 
    
         
            -
             
     | 
| 
       211 
     | 
    
         
            -
              def filter(as=Array, &block)
         
     | 
| 
       212 
     | 
    
         
            -
                new_data_frame = self.clone
         
     | 
| 
       213 
     | 
    
         
            -
                new_data_frame.filter!(as, &block)
         
     | 
| 
       214 
     | 
    
         
            -
              end
         
     | 
| 
       215 
     | 
    
         
            -
              
         
     | 
| 
       216 
     | 
    
         
            -
              def infer_class(obj)
         
     | 
| 
       217 
     | 
    
         
            -
                obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
         
     | 
| 
       218 
     | 
    
         
            -
                obj = obj.classify.constantize if obj.is_a?(String)
         
     | 
| 
       219 
     | 
    
         
            -
                obj
         
     | 
| 
       220 
     | 
    
         
            -
              end
         
     | 
| 
       221 
     | 
    
         
            -
              protected :infer_class
         
     | 
| 
       222 
     | 
    
         
            -
              
         
     | 
| 
       223 
     | 
    
         
            -
              def cast_row(row, as)
         
     | 
| 
       224 
     | 
    
         
            -
                if as == Hash
         
     | 
| 
       225 
     | 
    
         
            -
                  obj = {}
         
     | 
| 
       226 
     | 
    
         
            -
                  self.labels.each_with_index do |label, i|
         
     | 
| 
       227 
     | 
    
         
            -
                    obj[label] = row[i]
         
     | 
| 
       228 
     | 
    
         
            -
                  end
         
     | 
| 
       229 
     | 
    
         
            -
                  obj
         
     | 
| 
       230 
     | 
    
         
            -
                elsif as == OpenStruct
         
     | 
| 
       231 
     | 
    
         
            -
                  obj = OpenStruct.new
         
     | 
| 
       232 
     | 
    
         
            -
                  self.labels.each_with_index do |label, i|
         
     | 
| 
       233 
     | 
    
         
            -
                    obj.table[label] = row[i]
         
     | 
| 
       234 
     | 
    
         
            -
                  end
         
     | 
| 
      
 89 
     | 
    
         
            +
                
         
     | 
| 
      
 90 
     | 
    
         
            +
                # Resolves +obj+ to a constant when it is given by name.
                #
                # A Symbol is stringified first; a String is passed through
                # +classify+ and +constantize+ (not core String methods --
                # presumably supplied by ActiveSupport; confirm against the
                # project's requires).  Anything else is returned untouched.
                def infer_class(obj)
                  resolved = obj.is_a?(Symbol) ? obj.to_s.classify.constantize : obj
                  # Checked on the already-resolved value, exactly as the two
                  # sequential statements it replaces would do.
                  resolved.is_a?(String) ? resolved.classify.constantize : resolved
                end
         
     | 
| 
       241 
     | 
    
         
            -
              end
         
     | 
| 
       242 
     | 
    
         
            -
              protected :cast_row
         
     | 
| 
       243 
     | 
    
         
            -
              
         
     | 
| 
       244 
     | 
    
         
            -
              # Creates a new data frame, only with the specified columns.
         
     | 
| 
       245 
     | 
    
         
            -
              def subset_from_columns(*cols)
         
     | 
| 
       246 
     | 
    
         
            -
                new_labels = self.labels.inject([]) do |list, label|
         
     | 
| 
       247 
     | 
    
         
            -
                  list << label if cols.include?(label)
         
     | 
| 
       248 
     | 
    
         
            -
                  list
         
     | 
| 
       249 
     | 
    
         
            -
                end
         
     | 
| 
       250 
     | 
    
         
            -
                new_data_frame = DataFrame.new(*self.labels)
         
     | 
| 
       251 
     | 
    
         
            -
                new_data_frame.import(self.items)
         
     | 
| 
       252 
     | 
    
         
            -
                self.labels.each do |label|
         
     | 
| 
       253 
     | 
    
         
            -
                  new_data_frame.drop!(label) unless new_labels.include?(label)
         
     | 
| 
       254 
     | 
    
         
            -
                end
         
     | 
| 
       255 
     | 
    
         
            -
                new_data_frame
         
     | 
| 
       256 
     | 
    
         
            -
              end
         
     | 
| 
       257 
     | 
    
         
            -
              
         
     | 
| 
       258 
     | 
    
         
            -
              # A weird name.  This creates a column for every category in a column
         
     | 
| 
       259 
     | 
    
         
            -
              # and marks each row by its value 
         
     | 
| 
       260 
     | 
    
         
            -
              def j_binary_ize!(*columns)
         
     | 
| 
       261 
     | 
    
         
            -
                # Allows to mix a hash with the columns.
         
     | 
| 
       262 
     | 
    
         
            -
                options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
         
     | 
| 
       263 
     | 
    
         
            -
                columns.delete_if {|e| e.is_a?(Hash)}
         
     | 
| 
       264 
95 
     | 
    
         | 
| 
       265 
     | 
    
         
            -
                 
     | 
| 
       266 
     | 
    
         
            -
             
     | 
| 
       267 
     | 
    
         
            -
             
     | 
| 
       268 
     | 
    
         
            -
             
     | 
| 
       269 
     | 
    
         
            -
             
     | 
| 
       270 
     | 
    
         
            -
                    if options[:allow_overlap]
         
     | 
| 
       271 
     | 
    
         
            -
                      category_map = values.inject([]) do |list, e|
         
     | 
| 
       272 
     | 
    
         
            -
                        list << values.all_categories(e)
         
     | 
| 
       273 
     | 
    
         
            -
                      end
         
     | 
| 
       274 
     | 
    
         
            -
                      self.append!(full_name, category_map.map{|e| e.include?(category)})
         
     | 
| 
       275 
     | 
    
         
            -
                    else
         
     | 
| 
       276 
     | 
    
         
            -
                      self.append!(full_name, values.category_map.map{|e| e == category})
         
     | 
| 
      
 96 
     | 
    
         
            +
                # Converts a single row into the representation named by +as+.
                #
                # row:: the row's values, positionally aligned with +labels+.
                # as::  the target class:
                #       * Hash       -> a hash keyed by label
                #       * OpenStruct -> a struct with one attribute per label
                #       * Array      -> the row itself, unchanged
                #       * otherwise  -> <tt>as.new(*row)</tt>
                #
                # Values missing from a short row come back as nil; values beyond
                # the last label are ignored for the Hash and OpenStruct forms.
                def cast_row(row, as)
                  if as == Hash
                    Hash[self.labels.zip(row)]
                  elsif as == OpenStruct
                    # Built through the public constructor: OpenStruct#table is
                    # protected in the stock library, so writing to it from here
                    # only works if OpenStruct has been patched elsewhere.
                    OpenStruct.new(Hash[self.labels.zip(row)])
                  elsif as == Array
                    row
                  else
                    as.new(*row)
                  end
                end
         
     | 
| 
       280 
     | 
    
         
            -
              end
         
     | 
| 
       281 
     | 
    
         
            -
              
         
     | 
| 
       282 
     | 
    
         
            -
              # Adds a unique column to the table
         
     | 
| 
       283 
     | 
    
         
            -
              def append!(column_name, value=nil)
         
     | 
| 
       284 
     | 
    
         
            -
                raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
         
     | 
| 
       285 
     | 
    
         
            -
                self.labels << column_name.to_underscore_sym
         
     | 
| 
       286 
     | 
    
         
            -
                if value.is_a?(Array)
         
     | 
| 
       287 
     | 
    
         
            -
                  self.items.each_with_index do |item, i|
         
     | 
| 
       288 
     | 
    
         
            -
                    item << value[i]
         
     | 
| 
       289 
     | 
    
         
            -
                  end
         
     | 
| 
       290 
     | 
    
         
            -
                else
         
     | 
| 
       291 
     | 
    
         
            -
                  self.items.each do |item|
         
     | 
| 
       292 
     | 
    
         
            -
                    item << value
         
     | 
| 
       293 
     | 
    
         
            -
                  end
         
     | 
| 
       294 
     | 
    
         
            -
                end
         
     | 
| 
       295 
     | 
    
         
            -
                # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
         
     | 
| 
       296 
     | 
    
         
            -
                self.items.taint
         
     | 
| 
       297 
     | 
    
         
            -
              end
         
     | 
| 
       298 
     | 
    
         
            -
              
         
     | 
| 
       299 
     | 
    
         
            -
              def filter_by_category(hash)
         
     | 
| 
       300 
     | 
    
         
            -
                new_data_frame = self.dup
         
     | 
| 
       301 
     | 
    
         
            -
                hash.each do |key, value|
         
     | 
| 
       302 
     | 
    
         
            -
                  key = key.to_underscore_sym
         
     | 
| 
       303 
     | 
    
         
            -
                  next unless self.labels.include?(key)
         
     | 
| 
       304 
     | 
    
         
            -
                  value = [value] unless value.is_a?(Array) or value.is_a?(Range)
         
     | 
| 
       305 
     | 
    
         
            -
                  new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
         
     | 
| 
       306 
     | 
    
         
            -
                end
         
     | 
| 
       307 
     | 
    
         
            -
                new_data_frame
         
     | 
| 
       308 
     | 
    
         
            -
              end
         
     | 
| 
       309 
     | 
    
         
            -
             
     | 
| 
       310 
     | 
    
         
            -
              def filter_by_category!(hash)
         
     | 
| 
       311 
     | 
    
         
            -
                hash.each do |key, value|
         
     | 
| 
       312 
     | 
    
         
            -
                  key = key.to_underscore_sym
         
     | 
| 
       313 
     | 
    
         
            -
                  next unless self.labels.include?(key)
         
     | 
| 
       314 
     | 
    
         
            -
                  value = [value] unless value.is_a?(Array) or value.is_a?(Range)
         
     | 
| 
       315 
     | 
    
         
            -
                  self.filter!(:hash) {|row| value.include?(row[key])}
         
     | 
| 
       316 
     | 
    
         
            -
                end
         
     | 
| 
       317 
     | 
    
         
            -
              end
         
     | 
| 
       318 
     | 
    
         
            -
                
         
     | 
| 
       319 
115 
     | 
    
         
             
            end
         
     | 
| 
         @@ -0,0 +1,28 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module DF #:nodoc:
              # Decision-tree support built on Ilya Grigorik's decisiontree gem.
              # Loading this module activates and requires the gem; if that fails
              # with a LoadError it tries to install the gem and loads it again.
              module ID3
                begin
                  gem 'decisiontree'
                  require 'decisiontree'
                rescue LoadError
                  # LoadError (and Gem::LoadError, its subclass) is a ScriptError,
                  # not a StandardError, so a bare `rescue` never reached this
                  # fallback.
                  # NOTE(review): this shells out to sudo at require time; consider
                  # replacing it with a plain error that tells the user what to
                  # install.
                  `sudo gem install decisiontree`
                  gem 'decisiontree'
                  require 'decisiontree'
                end

                # Builds an ID3 tree from this frame's labels and training data,
                # stores it and returns it.
                #
                # dependent_column:: currently unused.  TODO: the dependent column
                #                    needs to be moved to the last column, and other
                #                    pre-processing is probably required too.
                # opts::             +:default+ is the default value handed to the
                #                    tree (1 when omitted).
                def create_id3(dependent_column, opts={})
                  default = opts.fetch(:default, 1)
                  @id3 = DecisionTree::ID3Tree.new(self.labels, self.training_data, default, :discrete)
                  # ...
                end

                # The tree built by the last call to create_id3, or nil when none
                # has been built yet.
                def id3
                  @id3
                end
              end
            end

            # Mixes the ID3 helpers into every DataFrame.
            class DataFrame
              include DF::ID3
            end
         
     | 
| 
         @@ -0,0 +1,48 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # The University of California - Irvine has a great set of machine
         
     | 
| 
      
 2 
     | 
    
         
            +
            # learning sample data sets.  Their data description pages have field
         
     | 
| 
      
 3 
     | 
    
         
            +
            # label descriptors.  This class extracts them and returns a DataFrame
         
     | 
| 
      
 4 
     | 
    
         
            +
            # with the labels of a data set. 
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            # Turns out, this isn't very useful.  So...oh well.
         
     | 
| 
      
 7 
     | 
    
         
            +
            # By the way, the data sets I'm talking about are found here: http://archive.ics.uci.edu/ml/
         
     | 
| 
      
 8 
     | 
    
         
            +
            # And to use this class:
         
     | 
| 
      
 9 
     | 
    
         
            +
            # require 'lib/data_frame/labels_from_uci'
         
     | 
| 
      
 10 
     | 
    
         
            +
            # df = LabelsFromUCI.data_frame 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
         
     | 
| 
      
 11 
     | 
    
         
            +
            # df.import('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data')
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            require 'open-uri'

            # Reads a UCI-style description document and collects the name that
            # follows every "@attribute " marker, one per line, in document order.
            class LabelsFromUCI

              class << self
                # Returns the array of label names found in the document at +url+.
                def process(url)
                  new(url).labels
                end

                # Returns a new DataFrame created from the labels found at +url+.
                def data_frame(url)
                  DataFrame.new(new(url).labels)
                end
              end

              # url::      the location that was read
              # contents:: the raw text read from +url+
              # labels::   the attribute names extracted from +contents+
              attr_reader :url, :contents, :labels

              # Reads +url+ (a URL, or anything else the opener accepts, such as
              # a local path) and extracts its labels immediately.
              def initialize(url)
                @url = url
                @contents = read_source(url)
                process_labels
              end

              protected
                # Returns the full text behind +source+.  URI.open is preferred
                # because Kernel#open stopped fetching URLs in Ruby 3.0; rubies
                # whose open-uri does not define URI.open fall back to the
                # open-uri-patched Kernel#open.
                def read_source(source)
                  if URI.respond_to?(:open)
                    URI.open(source) { |io| io.read }
                  else
                    open(source) { |io| io.read }
                  end
                end

                # Collects the first label on each line of the contents; lines
                # without a label contribute nothing.
                def process_labels
                  @labels = @contents.each_line.map { |line| line[label_re, 1] }.compact
                end

                # Matches "@attribute name" and captures the name.
                def label_re
                  /@attribute (\w+)/
                end
            end
         
     |