RubyGems - davidrichards-data_frame - Versions diffs - 0.0.18 → 0.0.19 - Mend

davidrichards-data_frame 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

data/README.rdoc +16 -0
data/VERSION.yml +1 -1
data/bin/plain_frame +22 -0
data/lib/data_frame.rb +2 -1
data/lib/data_frame/arff.rb +43 -36
data/lib/data_frame/core/column_management.rb +102 -0
data/lib/data_frame/core/filter.rb +48 -0
data/lib/data_frame/core/import.rb +112 -0
data/lib/data_frame/core/pre_process.rb +61 -0
data/lib/data_frame/core/saving.rb +29 -0
data/lib/data_frame/core/training.rb +36 -0
data/lib/data_frame/data_frame.rb +37 -241
data/lib/data_frame/id3.rb +28 -0
data/lib/data_frame/kmeans.rb +10 -0
data/lib/data_frame/labels_from_uci.rb +48 -0
data/lib/data_frame/mlp.rb +18 -0
data/lib/data_frame/sbn.rb +18 -0
data/lib/data_frame/transposable_array.rb +1 -1
data/lib/ext/array.rb +11 -0
data/spec/data_frame/arff_spec.rb +1 -0
data/spec/data_frame/core/column_management_spec.rb +97 -0
data/spec/data_frame/core/filter_spec.rb +88 -0
data/spec/data_frame/core/import_spec.rb +41 -0
data/spec/data_frame/core/pre_process_spec.rb +71 -0
data/spec/data_frame/core/saving_spec.rb +61 -0
data/spec/data_frame/core/training_spec.rb +51 -0
data/spec/data_frame/data_frame_spec.rb +10 -226
data/spec/data_frame/id3_spec.rb +22 -0
data/spec/ext/array_spec.rb +13 -0
data/spec/fixtures/discrete_testing.csv +4 -0
data/spec/fixtures/discrete_training.csv +21 -0
metadata +33 -6

data/lib/data_frame/core/saving.rb ADDED Viewed

@@ -0,0 +1,29 @@
+module DF #:nodoc:
+  module Saving #:nodoc:
+    # Saves a data frame as CSV.
+    # Examples:
+    # df.save('/tmp/some_filename.csv')
+    # df.save('/tmp/some_filename.csv', :include_header => false) # No header information is saved
+    # df.save('/tmp/some_filename.csv', :only => [:list, :of, :columns])
+    # df.save('/tmp/some_filename.csv', :subset => [:list, :of, :columns])
+    # df.save('/tmp/some_filename.csv',
+    #   :filter => {:column_name => :category_value,
+    #     :another_column_name => (range..values)}) # Filter by category
+    def save(filename, opts={})
+      df = self
+      df = df.subset_from_columns(*Array(opts[:only])) if opts[:only]
+      df = df.subset_from_columns(*Array(opts[:subset])) if opts[:subset]
+      df = df.filter_by_category(opts[:filter]) if opts[:filter]
+      df = df.filter_by_category(opts[:filter_by_category]) if opts[:filter_by_category]
+      File.open(filename, "w") { |f| f.write df.to_csv(opts.fetch(:include_header, true)) }
+    end
+  end
+end
+class DataFrame
+  include DF::Saving # adds the save method to every DataFrame
+end

data/lib/data_frame/core/training.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module Training #:nodoc:
+  # Remove the training set if reset
+  # Return cached training_set, if there is one
+  # Get the proportion or 80%
+  # Get the number of items to choose, n, or a proportion of the items
+  # Store and return n random items
+  def training_set(opts={})
+    @training_set = nil if opts[:reset]
+    return @training_set if @training_set
+    items_size = self.items.size
+    proportion = opts.fetch(:proportion, 0.8)
+    n = opts[:n]
+    n ||= (items_size * proportion).to_i
+    n = self.items.size if n > items_size
+    n = 0 if n < 0
+    @training_set = []
+    while n > @training_set.size
+      @training_set << random_next(items_size) while n > @training_set.size
+      @training_set.uniq!
+    end
+    @training_set
+  end
+  protected
+    def random_next(n)
+      self.items[rand(n)]
+    end
+end
+class DataFrame
+  include Training # adds training_set and the protected random_next helper
+end

data/lib/data_frame/data_frame.rb CHANGED Viewed

@@ -4,70 +4,6 @@
 # is tainted.
 class DataFrame
-  class << self
-    # This is the neatest part of this neat gem.
-    # DataFrame.from_csv can be called in a lot of ways:
-    # DataFrame.from_csv(csv_contents)
-    # DataFrame.from_csv(filename)
-    # DataFrame.from_csv(url)
-    # If you need to define converters for FasterCSV, do it before calling
-    # this method:
-    # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
-    # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
-    # This returns bar where 'foo' was found and 'foo' everywhere else.
-    def from_csv(obj, opts={})
-      labels, table = infer_csv_contents(obj, opts)
-      name = infer_name_from_contents(obj, opts)
-      return nil unless labels and table
-      df = new(*labels)
-      df.import(table)
-      df.name = name
-      df
-    end
-    protected
-      # Only works for names sources, urls and files
-      def infer_name_from_contents(obj, opts={})
-        begin
-          File.split(obj).last.split('.')[0..-2].join('.').titleize
-        rescue
-          nil
-        end
-      end
-      def infer_csv_contents(obj, opts={})
-        contents = File.read(obj) if File.exist?(obj)
-        begin
-          open(obj) {|f| contents = f.read} unless contents
-        rescue
-          nil
-        end
-        contents ||= obj if obj.is_a?(String)
-        return nil unless contents
-        table = FCSV.parse(contents, default_csv_opts.merge(opts))
-        labels = table.shift
-        while table.last.empty?
-          table.pop
-        end
-        [labels, table]
-      end
-      def default_csv_opts; {:converters => :all}; end
-  end
-  # Include the methods from arff.rb
-  include ARFF
-  # Loads a batch of rows.  Expects an array of arrays, else you don't
-  # know what you have.
-  def import(rows)
-    rows.each do |row|
-      self.add_item(row)
-    end
-  end
   def inspect
     "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
   end
@@ -83,15 +19,11 @@ class DataFrame
   attr_accessor :name
   def initialize(*labels)
+    labels = labels.first if labels.size == 1 and labels.first.is_a?(Array)
     @labels = labels.map {|e| e.to_underscore_sym }
     @items = TransposableArray.new
   end
-  def add_item(item)
-    self.items << item
-  end
-  alias :add :add_item
   def row_labels
     @row_labels ||= []
   end
@@ -101,15 +33,22 @@ class DataFrame
     @row_labels = ary
   end
+  # The rows as an array of arrays, an alias for items.
+  alias :rows :items
+  def render_row(sym)
+    i = self.row_labels.index(sym)
+    return nil unless i
+    @items[i]
+  end
+  # Return the column, given its name
   def render_column(sym)
-    i = @labels.index(sym)
+    i = @labels.index(sym.to_underscore_sym)
     return nil unless i
     @items.transpose[i]
   end
-  # The rows as an array of arrays, an alias for items.
-  alias :rows :items
   # The columns as a Dictionary or Hash
   # This is cached, call columns(true) to reset the cache.
   def columns(reset=false)
@@ -128,12 +67,6 @@ class DataFrame
   alias :to_hash :columns
   alias :to_dictionary :columns
-  def render_row(sym)
-    i = self.row_labels.index(sym)
-    return nil unless i
-    @items[i]
-  end
   def method_missing(sym, *args, &block)
     if self.labels.include?(sym)
       render_column(sym)
@@ -146,174 +79,37 @@ class DataFrame
     end
   end
-  def drop!(*labels)
-    labels.each do |label|
-      drop_one!(label)
-    end
-    self
-  end
-  def drop_one!(label)
-    i = self.labels.index(label)
-    return nil unless i
-    self.items.each do |item|
-      item.delete_at(i)
-    end
-    self.labels.delete_at(i)
-    self
-  end
-  protected :drop_one!
-  def replace!(column, values=nil, &block)
-    column = validate_column(column)
-    if not values
-      values = self.send(column)
-      values.map! {|e| block.call(e)}
-    end
-    replace_column(column, values)
-    self
-  end
-  def replace_column(column, values)
-    column = validate_column(column)
-    index = self.labels.index(column)
-    list = []
-    self.items.each_with_index do |item, i|
-      consolidated = item
-      consolidated[index] = values[i]
-      list << consolidated
-    end
-    @items = list.dup
-  end
-  protected :replace_column
+  protected
-  def validate_column(column)
-    column = column.to_sym
-    raise ArgumentError, "Must provide the name of an existing column.  Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
-    column
-  end
-  protected :validate_column
-  # Takes a block to evaluate on each row.  The row can be converted into
-  # an OpenStruct or a Hash for easier filter methods. Note, don't try this
-  # with a hash or open struct unless you have facets available.
-  def filter!(as=Array, &block)
-    as = infer_class(as)
-    items = []
-    self.items.each do |row|
-      value = block.call(cast_row(row, as))
-      items << row if value
+    def validate_column(column)
+      column = column.to_sym
+      raise ArgumentError, "Must provide the name of an existing column.  Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
+      column
     end
-    @items = items.dup
-    self
-  end
-  def filter(as=Array, &block)
-    new_data_frame = self.clone
-    new_data_frame.filter!(as, &block)
-  end
-  def infer_class(obj)
-    obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
-    obj = obj.classify.constantize if obj.is_a?(String)
-    obj
-  end
-  protected :infer_class
-  def cast_row(row, as)
-    if as == Hash
-      obj = {}
-      self.labels.each_with_index do |label, i|
-        obj[label] = row[i]
-      end
-      obj
-    elsif as == OpenStruct
-      obj = OpenStruct.new
-      self.labels.each_with_index do |label, i|
-        obj.table[label] = row[i]
-      end
+    def infer_class(obj)
+      obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
+      obj = obj.classify.constantize if obj.is_a?(String)
       obj
-    elsif as == Array
-      row
-    else
-      as.new(*row)
     end
-  end
-  protected :cast_row
-  # Creates a new data frame, only with the specified columns.
-  def subset_from_columns(*cols)
-    new_labels = self.labels.inject([]) do |list, label|
-      list << label if cols.include?(label)
-      list
-    end
-    new_data_frame = DataFrame.new(*self.labels)
-    new_data_frame.import(self.items)
-    self.labels.each do |label|
-      new_data_frame.drop!(label) unless new_labels.include?(label)
-    end
-    new_data_frame
-  end
-  # A weird name.  This creates a column for every category in a column
-  # and marks each row by its value
-  def j_binary_ize!(*columns)
-    # Allows to mix a hash with the columns.
-    options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
-    columns.delete_if {|e| e.is_a?(Hash)}
-    # Generates new columns
-    columns.each do |col|
-      values = render_column(col.to_underscore_sym)
-      values.categories.each do |category|
-        full_name = (col.to_s + "_" + category.to_s).to_sym
-        if options[:allow_overlap]
-          category_map = values.inject([]) do |list, e|
-            list << values.all_categories(e)
-          end
-          self.append!(full_name, category_map.map{|e| e.include?(category)})
-        else
-          self.append!(full_name, values.category_map.map{|e| e == category})
+    def cast_row(row, as)
+      if as == Hash
+        obj = {}
+        self.labels.each_with_index do |label, i|
+          obj[label] = row[i]
         end
+        obj
+      elsif as == OpenStruct
+        obj = OpenStruct.new
+        self.labels.each_with_index do |label, i|
+          obj.table[label] = row[i]
+        end
+        obj
+      elsif as == Array
+        row
+      else
+        as.new(*row)
       end
     end
-  end
-  # Adds a unique column to the table
-  def append!(column_name, value=nil)
-    raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
-    self.labels << column_name.to_underscore_sym
-    if value.is_a?(Array)
-      self.items.each_with_index do |item, i|
-        item << value[i]
-      end
-    else
-      self.items.each do |item|
-        item << value
-      end
-    end
-    # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
-    self.items.taint
-  end
-  def filter_by_category(hash)
-    new_data_frame = self.dup
-    hash.each do |key, value|
-      key = key.to_underscore_sym
-      next unless self.labels.include?(key)
-      value = [value] unless value.is_a?(Array) or value.is_a?(Range)
-      new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
-    end
-    new_data_frame
-  end
-  def filter_by_category!(hash)
-    hash.each do |key, value|
-      key = key.to_underscore_sym
-      next unless self.labels.include?(key)
-      value = [value] unless value.is_a?(Array) or value.is_a?(Range)
-      self.filter!(:hash) {|row| value.include?(row[key])}
-    end
-  end
 end

data/lib/data_frame/id3.rb ADDED Viewed

@@ -0,0 +1,28 @@
+module DF #:nodoc:
+  # Uses Ilya Grigorik's ID3 decision_tree gem.  Installs it if you don't have it.
+  module ID3
+    begin
+      gem 'decisiontree'
+      require 'decisiontree'
+    rescue
+      `sudo gem install decisiontree`
+      gem 'decisiontree'
+      require 'decisiontree'
+    end
+    def create_id3(dependent_column, opts={})
+      # Need to put the dependent column in the last column
+      # Probably have other pre processing as well.
+      default = opts.fetch(:default, 1)
+      @id3 = DecisionTree::ID3Tree.new(self.labels, self.training_data, default, :discrete)
+      # ...
+    end
+    def id3
+    end
+  end
+end
+class DataFrame
+  include DF::ID3 # adds create_id3 and id3 to every DataFrame
+end

data/lib/data_frame/kmeans.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module DF #:nodoc:
+  # Placeholder for a KMeans classifier to cluster the data set; no methods are defined yet.
+  module KMeans
+  end
+end
+class DataFrame
+  include DF::KMeans # currently adds no methods; the module body is empty
+end

data/lib/data_frame/labels_from_uci.rb ADDED Viewed

@@ -0,0 +1,48 @@
+# The University of California - Irvine has a great set of machine
+# learning sample data sets.  Their data description pages have field
+# label descriptors.  This class extracts them and returns a DataFrame
+# with the labels of a data set.
+# Turns out, this isn't very useful.  So...oh well.
+# By the way, the data sets I'm talking about are found here: http://archive.ics.uci.edu/ml/
+# And to use this class:
+# require 'lib/data_frame/labels_from_uci'
+# df = LabelsFromUCI.data_frame 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
+# df.import('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data')
+class LabelsFromUCI
+  class << self
+    def process(url)
+      lfu = new(url)
+      lfu.labels
+    end
+    def data_frame(url)
+      lfu = new(url)
+      DataFrame.new(lfu.labels)
+    end
+  end
+  attr_reader :url, :contents, :labels
+  def initialize(url)
+    @url = url
+    open(url) { |f| @contents = f.read }
+    process_labels
+  end
+  protected
+    def process_labels
+      @labels = []
+      @contents.each_line do |line|
+        if line =~ label_re
+          @labels << $1
+        end
+      end
+    end
+    def label_re
+      /@attribute (\w+)/
+    end
+end