RubyGems - davidrichards-data_frame - Versions diffs - 0.0.18 → 0.0.19 - Mend

davidrichards-data_frame 0.0.18 → 0.0.19

Files changed (32) hide show

data/README.rdoc +16 -0
data/VERSION.yml +1 -1
data/bin/plain_frame +22 -0
data/lib/data_frame.rb +2 -1
data/lib/data_frame/arff.rb +43 -36
data/lib/data_frame/core/column_management.rb +102 -0
data/lib/data_frame/core/filter.rb +48 -0
data/lib/data_frame/core/import.rb +112 -0
data/lib/data_frame/core/pre_process.rb +61 -0
data/lib/data_frame/core/saving.rb +29 -0
data/lib/data_frame/core/training.rb +36 -0
data/lib/data_frame/data_frame.rb +37 -241
data/lib/data_frame/id3.rb +28 -0
data/lib/data_frame/kmeans.rb +10 -0
data/lib/data_frame/labels_from_uci.rb +48 -0
data/lib/data_frame/mlp.rb +18 -0
data/lib/data_frame/sbn.rb +18 -0
data/lib/data_frame/transposable_array.rb +1 -1
data/lib/ext/array.rb +11 -0
data/spec/data_frame/arff_spec.rb +1 -0
data/spec/data_frame/core/column_management_spec.rb +97 -0
data/spec/data_frame/core/filter_spec.rb +88 -0
data/spec/data_frame/core/import_spec.rb +41 -0
data/spec/data_frame/core/pre_process_spec.rb +71 -0
data/spec/data_frame/core/saving_spec.rb +61 -0
data/spec/data_frame/core/training_spec.rb +51 -0
data/spec/data_frame/data_frame_spec.rb +10 -226
data/spec/data_frame/id3_spec.rb +22 -0
data/spec/ext/array_spec.rb +13 -0
data/spec/fixtures/discrete_testing.csv +4 -0
data/spec/fixtures/discrete_training.csv +21 -0
metadata +33 -6

data/README.rdoc CHANGED Viewed

@@ -91,6 +91,22 @@ Data Frame can now create sub-models:
 	>> df.models
 	=> #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
+== Utilities
+I use Data Frame for a lot of things, and I've added some utilities to this gem in case you would like to do the same.  For instance, here is how I take the data in a data frame and load it into a neural network:
+  # Show mlp.  Will probably need to add a row classifier for training and test data.  Also, will probably want to
+== CLI
+There are some really interesting things that have good command-line shortcuts:
+* Make
+* A
+* List
+	# Now add some demos
 ==Installation
 sudo gem install davidrichards-data_frame

data/VERSION.yml CHANGED Viewed

@@ -1,4 +1,4 @@
 ---
 :major: 0
 :minor: 0
-:patch: 18
+:patch: 20

data/bin/plain_frame ADDED Viewed

@@ -0,0 +1,22 @@
+#!/usr/bin/env ruby -wKU
+require 'yaml'
+version_hash = YAML.load_file(File.join(File.dirname(__FILE__), %w(.. VERSION.yml)))
+version = [version_hash[:major].to_s, version_hash[:minor].to_s, version_hash[:patch].to_s].join(".")
+df_file = File.join(File.dirname(__FILE__), %w(.. lib data_frame))
+irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
+require 'optparse'
+options = { :irb => irb, :without_stored_procedures => false }
+OptionParser.new do |opt|
+  opt.banner = "Usage: console [environment] [options]"
+  opt.on("--irb=[#{irb}]", 'Invoke a different irb.') { |v| options[:irb] = v }
+  opt.parse!(ARGV)
+end
+libs =  " -r irb/completion -r #{df_file}"
+puts "Loading Data Frame version: #{version}"
+exec "#{options[:irb]} #{libs} --simple-prompt"

data/lib/data_frame.rb CHANGED Viewed

@@ -20,6 +20,7 @@ $:.unshift(File.dirname(__FILE__))
 require 'data_frame/callback_array'
 require 'data_frame/transposable_array'
 require 'data_frame/parameter_capture'
-require 'data_frame/arff'
 require 'data_frame/data_frame'
 require 'data_frame/model'
+Dir.glob("#{File.dirname(__FILE__)}/data_frame/core/*.rb").each { |file| require file }

data/lib/data_frame/arff.rb CHANGED Viewed

@@ -1,45 +1,52 @@
-# Turns a data frame into ARFF-formatted content.
-module ARFF
-  # Used in arff, but generally useful.
-  def to_csv(include_header=true)
-    value = include_header ? self.labels.map{|e| e.to_s}.join(',') + "\n" : ''
-    self.items.inject(value) do |list, e|
-      list << e.map {|cell| cell.to_s}.join(',') + "\n"
-    end
-  end
+module DF #:nodoc:
+  # Turns a data frame into ARFF-formatted content.
+  module ARFF
-  def to_arff
-    arff_header + to_csv(false)
-  end
-  protected
-    def arff_attributes
-      container = defined?(Dictionary) ? Dictionary.new : Hash.new
-      self.labels.inject(container) do |list, e|
-        list[e] = self.render_column(e).categories
-      end
-    end
-    def arff_formatted_attributes
-      self.labels.inject('') do |str, e|
-        val = "{" + self.render_column(e).categories.map{|x| x.to_s}.join(',') + "}"
-        str << "@attribute #{e} #{val}\n"
+    # Used in arff, but generally useful.
+    def to_csv(include_header=true)
+      value = include_header ? self.labels.map{|e| e.to_s}.join(',') + "\n" : ''
+      self.items.inject(value) do |list, e|
+        list << e.map {|cell| cell.to_s}.join(',') + "\n"
       end
     end
-    def arff_relation
-      self.name ? self.name.to_underscore_sym.to_s : 'unamed_relation'
+    def to_arff
+      arff_header + to_csv(false)
     end
-    def arff_header
-      %[@relation #{arff_relation}
+    protected
+      def arff_attributes
+        container = defined?(Dictionary) ? Dictionary.new : Hash.new
+        self.labels.inject(container) do |list, e|
+          list[e] = self.render_column(e).categories
+        end
+      end
+      def arff_formatted_attributes
+        self.labels.inject('') do |str, e|
+          val = "{" + self.render_column(e).categories.map{|x| x.to_s}.join(',') + "}"
+          str << "@attribute #{e} #{val}\n"
+        end
+      end
+      def arff_relation
+        self.name ? self.name.to_underscore_sym.to_s : 'unamed_relation'
+      end
+      def arff_header
+        %[@relation #{arff_relation}
 #{arff_formatted_attributes}
 @data
 ]
-    end
-    alias :arff_items :to_csv
+      end
+      alias :arff_items :to_csv
+  end
+end
+class DataFrame
+  include DF::ARFF
 end

data/lib/data_frame/core/column_management.rb ADDED Viewed

@@ -0,0 +1,102 @@
+module DF #:nodoc:
+  module ColumnManagement #:nodoc:
+    def move_to_last!(orig_name)
+      raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
+      new_name = (orig_name.to_s + "_a_unique_name").to_sym
+      self.append!(new_name, self.render_column(orig_name))
+      self.drop!(orig_name)
+      self.rename!(orig_name, new_name)
+    end
+    # Argument order follows Ruby's alias keyword: new_name first, then orig_name.
+    def rename!(new_name, orig_name)
+      new_name = new_name.to_underscore_sym
+      orig_name = orig_name.to_underscore_sym
+      raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
+      raise ArgumentError, "Cannot name #{orig_name} to #{new_name}, that column already exists." if self.labels.include?(new_name)
+      i = self.labels.index(orig_name)
+      self.labels[i] = new_name
+    end
+    # Adds a unique column to the table
+    def append!(column_name, value=nil)
+      raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
+      self.labels << column_name.to_underscore_sym
+      if value.is_a?(Array)
+        self.items.each_with_index do |item, i|
+          item << value[i]
+        end
+      else
+        self.items.each do |item|
+          item << value
+        end
+      end
+      # Because we are modifying the sub-arrays in place, the TaintableArray doesn't know it has been changed, so taint it explicitly.
+      self.items.taint
+    end
+    def replace!(column, values=nil, &block)
+      column = validate_column(column)
+      if not values
+        values = self.send(column)
+        values.map! {|e| block.call(e)}
+      end
+      replace_column!(column, values)
+      self
+    end
+    # Replace a single column with an array of values.
+    # It is helpful to have the values the same size as the rest of the data
+    # frame.
+    def replace_column!(column, values)
+      column = validate_column(column)
+      index = self.labels.index(column)
+      list = []
+      self.items.each_with_index do |item, i|
+        consolidated = item
+        consolidated[index] = values[i]
+        list << consolidated
+      end
+      @items = list.dup
+    end
+    # Drop one or more columns
+    def drop!(*labels)
+      labels.each do |label|
+        drop_one!(label)
+      end
+      self
+    end
+    # Drop a single column
+    def drop_one!(label)
+      i = self.labels.index(label)
+      return nil unless i
+      self.items.each do |item|
+        item.delete_at(i)
+      end
+      self.labels.delete_at(i)
+      self
+    end
+    # Creates a new data frame, only with the specified columns.
+    def subset_from_columns(*cols)
+      new_labels = self.labels.inject([]) do |list, label|
+        list << label if cols.include?(label)
+        list
+      end
+      new_data_frame = DataFrame.new(*self.labels)
+      new_data_frame.import(self.items)
+      self.labels.each do |label|
+        new_data_frame.drop!(label) unless new_labels.include?(label)
+      end
+      new_data_frame
+    end
+  end
+end
+class DataFrame
+  include DF::ColumnManagement
+end

data/lib/data_frame/core/filter.rb ADDED Viewed

@@ -0,0 +1,48 @@
+module DF #:nodoc:
+  module Filter #:nodoc:
+    # Takes a block to evaluate on each row.  The row can be converted into
+    # an OpenStruct or a Hash for easier filter methods. Note, don't try this
+    # with a hash or open struct unless you have facets available.
+    def filter!(as=Array, &block)
+      as = infer_class(as)
+      items = []
+      self.items.each do |row|
+        value = block.call(cast_row(row, as))
+        items << row if value
+      end
+      @items = items.dup
+      self
+    end
+    def filter(as=Array, &block)
+      new_data_frame = self.clone
+      new_data_frame.filter!(as, &block)
+    end
+    def filter_by_category(hash)
+      new_data_frame = self.dup
+      hash.each do |key, value|
+        key = key.to_underscore_sym
+        next unless self.labels.include?(key)
+        value = [value] unless value.is_a?(Array) or value.is_a?(Range)
+        new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
+      end
+      new_data_frame
+    end
+    def filter_by_category!(hash)
+      hash.each do |key, value|
+        key = key.to_underscore_sym
+        next unless self.labels.include?(key)
+        value = [value] unless value.is_a?(Array) or value.is_a?(Range)
+        self.filter!(:hash) {|row| value.include?(row[key])}
+      end
+    end
+  end
+end
+class DataFrame
+  include DF::Filter
+end

data/lib/data_frame/core/import.rb ADDED Viewed

@@ -0,0 +1,112 @@
+module DF #:nodoc:
+  module Import #:nodoc:
+    module InferCSV #:nodoc:
+      protected
+        def default_csv_opts; {:converters => :all}; end
+        def infer_csv_contents(obj, opts={})
+          contents = File.read(obj) if File.exist?(obj)
+          begin
+            open(obj) {|f| contents = f.read} unless contents
+          rescue
+            nil
+          end
+          contents ||= obj if obj.is_a?(String)
+          return nil unless contents
+          table = FCSV.parse(contents, default_csv_opts.merge(opts))
+          labels = opts.fetch(:headers, true) ? table.shift : []
+          while table.last.empty?
+            table.pop
+          end
+          [labels, table]
+        end
+    end # InferCSV
+    module ClassMethods #:nodoc:
+      include InferCSV
+      # This is the neatest part of this neat gem.
+      # DataFrame.from_csv can be called in a lot of ways:
+      # DataFrame.from_csv(csv_contents)
+      # DataFrame.from_csv(filename)
+      # DataFrame.from_csv(url)
+      # If you need to define converters for FasterCSV, do it before calling
+      # this method:
+      # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
+      # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
+      # This returns bar where 'foo' was found and 'foo' everywhere else.
+      def from_csv(obj, opts={})
+        labels, table = infer_csv_contents(obj, opts)
+        name = infer_name_from_contents(obj, opts)
+        return nil unless labels and table
+        df = new(*labels)
+        df.import(table)
+        df.name = name
+        df
+      end
+      protected
+        # Only works for named sources (URLs and files)
+        def infer_name_from_contents(obj, opts={})
+          begin
+            File.split(obj).last.split('.')[0..-2].join('.').titleize
+          rescue
+            nil
+          end
+        end
+    end # Class Methods
+    module InstanceMethods #:nodoc:
+      include InferCSV
+      def add_item(item)
+        self.items << item
+      end
+      alias :add :add_item
+      # Loads a batch of rows.  Expects an array of arrays, or a String (a file
+      # name, URL, or raw CSV content, read without a header row).  Anything
+      # else raises an ArgumentError.
+      def import(rows)
+        case rows
+        when Array
+          import_array(rows)
+        when String
+          labels, table = infer_csv_contents(rows, :headers => false)
+          import(table)
+        else
+          raise ArgumentError, "Don't know how to import data from #{rows.class}"
+        end
+        true
+      end
+      protected
+        # Imports a table as an array of arrays.
+        # If the array is one-dimensional and there is more than one label, it
+        # imports only one row.
+        def import_array(rows)
+          raise ArgumentError, "Can only work with arrays" unless rows.is_a?(Array)
+          if self.labels.size > 1 and rows.dimensions == 1
+            self.add_item(rows)
+          else
+            rows.each do |row|
+              self.add_item(row)
+            end
+          end
+        end
+    end # Instance Methods
+  end
+end
+class DataFrame
+  include DF::Import::InstanceMethods
+  extend DF::Import::ClassMethods
+end

data/lib/data_frame/core/pre_process.rb ADDED Viewed

@@ -0,0 +1,61 @@
+module DF #:nodoc:
+  module PreProcess #:nodoc:
+    # A weird name.  This creates a new true/false column for every category
+    # in a column and marks each row by whether its value falls in that category.
+    def j_binary_ize!(*columns)
+      # Allows an options hash to be mixed in with the column names.
+      options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
+      columns.delete_if {|e| e.is_a?(Hash)}
+      # Generates new columns
+      columns.each do |col|
+        values = render_column(col.to_underscore_sym)
+        values.categories.each do |category|
+          full_name = (col.to_s + "_" + category.to_s).to_sym
+          if options[:allow_overlap]
+            category_map = values.inject([]) do |list, e|
+              list << values.all_categories(e)
+            end
+            self.append!(full_name, category_map.map{|e| e.include?(category)})
+          else
+            self.append!(full_name, values.category_map.map{|e| e == category})
+          end
+        end
+      end
+    end
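+    # Adds a column named "numerical <column name>" (converted with
+    # to_underscore_sym) that re-encodes each nominal value as an array of 0s
+    # with a single 1 at the index assigned to that value's category.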
+    def numericize!(*columns)
+      columns.each do |col|
+        values = render_column(col.to_underscore_sym)
+        categories = values.categories
+        value_categories = values.map {|v| values.category(v)}
+        i = 0
+        category_map = value_categories.uniq.inject({}) do |h, c|
+          h[c] = i
+          i += 1
+          h
+        end
+        blank = Array.new(category_map.size, 0)
+        reverse_category_map = category_map.inject({}) {|h, e| h[e.last] = e.first; h}
+        new_values = values.inject([]) do |list, val|
+          a = blank.dup
+          a[category_map[values.category(val)]] = 1
+          list << a
+        end
+        new_name = "numerical #{col.to_s}".to_underscore_sym
+        self.append!(new_name, new_values)
+      end
+    end
+  end
+end
+class DataFrame
+  include DF::PreProcess
+end