RubyGems - data_frame - Versions diffs - 0.1.8 - Mend

data_frame 0.1.8

Files changed (45) hide show

data/README.rdoc +122 -0
data/VERSION.yml +4 -0
data/bin/plain_frame +22 -0
data/lib/data_frame.rb +26 -0
data/lib/data_frame/arff.rb +52 -0
data/lib/data_frame/callback_array.rb +152 -0
data/lib/data_frame/core/column_management.rb +147 -0
data/lib/data_frame/core/filter.rb +48 -0
data/lib/data_frame/core/import.rb +113 -0
data/lib/data_frame/core/pre_process.rb +69 -0
data/lib/data_frame/core/saving.rb +29 -0
data/lib/data_frame/core/training.rb +46 -0
data/lib/data_frame/data_frame.rb +115 -0
data/lib/data_frame/id3.rb +28 -0
data/lib/data_frame/kmeans.rb +10 -0
data/lib/data_frame/labels_from_uci.rb +48 -0
data/lib/data_frame/mlp.rb +18 -0
data/lib/data_frame/model.rb +22 -0
data/lib/data_frame/parameter_capture.rb +50 -0
data/lib/data_frame/sbn.rb +18 -0
data/lib/data_frame/transposable_array.rb +23 -0
data/lib/ext/array.rb +11 -0
data/lib/ext/open_struct.rb +5 -0
data/lib/ext/string.rb +5 -0
data/lib/ext/symbol.rb +5 -0
data/spec/data_frame/arff_spec.rb +48 -0
data/spec/data_frame/callback_array_spec.rb +148 -0
data/spec/data_frame/core/column_management_spec.rb +128 -0
data/spec/data_frame/core/filter_spec.rb +88 -0
data/spec/data_frame/core/import_spec.rb +41 -0
data/spec/data_frame/core/pre_process_spec.rb +103 -0
data/spec/data_frame/core/saving_spec.rb +61 -0
data/spec/data_frame/core/training_spec.rb +72 -0
data/spec/data_frame/data_frame_spec.rb +141 -0
data/spec/data_frame/id3_spec.rb +22 -0
data/spec/data_frame/model_spec.rb +36 -0
data/spec/data_frame/parameter_capture_spec.rb +32 -0
data/spec/data_frame/transposable_array_spec.rb +138 -0
data/spec/data_frame_spec.rb +29 -0
data/spec/ext/array_spec.rb +13 -0
data/spec/fixtures/basic.csv +3 -0
data/spec/fixtures/discrete_testing.csv +4 -0
data/spec/fixtures/discrete_training.csv +21 -0
data/spec/spec_helper.rb +8 -0
metadata +128 -0

@@ -0,0 +1,10 @@
+module DF #:nodoc:
+  # Uses a KMeans classifier to cluster the data set.
+  # NOTE(review): the module body is empty in this version, so mixing it
+  # in adds no methods yet -- presumably a placeholder; confirm before
+  # relying on any clustering behaviour.
+  module KMeans
+  end
+end
+# Mixes the (currently empty) KMeans module into DataFrame.
+class DataFrame
+  include DF::KMeans
+end

data/lib/data_frame/labels_from_uci.rb ADDED

@@ -0,0 +1,48 @@
+require 'open-uri'
+# The University of California - Irvine has a great set of machine
+# learning sample data sets.  Their data description pages have field
+# label descriptors.  This class extracts them and returns a DataFrame
+# with the labels of a data set.
+# Turns out, this isn't very useful.  So...oh well.
+# By the way, the data sets I'm talking about are found here: http://archive.ics.uci.edu/ml/
+# And to use this class:
+# require 'lib/data_frame/labels_from_uci'
+# df = LabelsFromUCI.data_frame 'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
+# df.import('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data')
+class LabelsFromUCI
+  class << self
+    # Reads the descriptor at +url+ and returns its labels as an Array
+    # of Strings.
+    def process(url)
+      new(url).labels
+    end
+    # Reads the descriptor at +url+ and returns a DataFrame built from
+    # the labels found there.
+    def data_frame(url)
+      DataFrame.new(process(url))
+    end
+  end
+  attr_reader :url, :contents, :labels
+  # Reads the whole descriptor at +url+ into +contents+ and extracts the
+  # labels right away.
+  def initialize(url)
+    @url = url
+    @contents = read_source(url)
+    process_labels
+  end
+  protected
+    # Collects the first "@attribute <name>" capture of every line, in
+    # the order the lines appear.
+    def process_labels
+      @labels = []
+      @contents.each_line do |line|
+        @labels << $1 if line =~ label_re
+      end
+    end
+    # Pattern for one attribute descriptor; the capture is the label.
+    def label_re
+      /@attribute (\w+)/
+    end
+    # Kernel#open stopped opening URLs in Ruby 3.0, so prefer URI.open
+    # where open-uri provides it and fall back to Kernel#open on Rubies
+    # that predate URI.open.
+    def read_source(url)
+      if URI.respond_to?(:open)
+        URI.open(url) { |f| f.read }
+      else
+        open(url) { |f| f.read }
+      end
+    end
+end

data/lib/data_frame/mlp.rb ADDED

@@ -0,0 +1,18 @@
+module DF #:nodoc:
+  # Turns Data Frame into a feeder for Red Davis' MLP classifier.
+  # Will install it if you don't have it.
+  module MLP
+    begin
+      gem 'reddavis-mlp'
+      require 'mlp'
+    # A missing gem or file raises LoadError, which is a ScriptError and
+    # therefore not caught by a bare rescue (StandardError only).
+    rescue LoadError
+      # NOTE(review): shelling out to sudo while a library loads is risky;
+      # consider declaring the gem as a regular dependency instead.
+      `sudo gem install reddavis-mlp`
+      # Let this process see the gem that was just installed.
+      Gem.refresh
+      gem 'reddavis-mlp'
+      require 'mlp'
+    end
+  end
+end
+# Mixes the MLP feeder module into DataFrame.
+class DataFrame
+  include DF::MLP
+end

data/lib/data_frame/model.rb ADDED

@@ -0,0 +1,22 @@
+# Adds the model methods to the data frame.
+class DataFrame
+  # Returns a model if defined
+  # Defines a model with a block, if given and not defined
+  # Stores the model in the models container, which gives us access like:
+  # df.models.new_model_name...
+  # Returns false when the model is unknown and no block is given.
+  def model(name=nil, &block)
+    # Read from the same table the definition below writes to.
+    # OpenStruct#[] is missing on old Rubies and converts the name to a
+    # Symbol on newer ones, so a model stored under a String name could
+    # never be read back through it.
+    return self.models.table[name] if self.models.table.key?(name)
+    return false unless block
+    # The block is captured as parameters; the model is the filtered
+    # result of rows (cast to Hash) that satisfy them.
+    @pc = ParameterCapture.new(&block)
+    self.models.table[name] = self.filter(Hash) { |row| @pc.filter(row) }
+  end
+  # Container holding every model defined so far.
+  def models
+    @models ||= OpenStruct.new
+  end
+end

data/lib/data_frame/parameter_capture.rb ADDED

@@ -0,0 +1,50 @@
+# Captures the intent of a model definition in a block.  Usage:
+# pc = ParameterCapture.new do |p|
+#   p.whatever :some_value
+#   p.another :one
+#   p.or_list [1, 2]
+#   p.or_range (1..2)
+# end
+# pc.parameters.table
+# => {:whatever => :some_value, :another => :one, :or_list => [1,2], :or_range => (1..2)}
+class ParameterCapture
+  # Evaluates the block against this instance, so each unknown method
+  # called inside it is recorded as a parameter.  instance_eval also
+  # yields the instance, which is why the |p| form above works.
+  # Without a block the capture starts out empty.
+  def initialize(&block)
+    self.instance_eval(&block) if block
+  end
+  # The OpenStruct holding the captured parameters.
+  def parameters
+    @parameters ||= OpenStruct.new
+  end
+  # Exposes the set keys
+  def keys
+    self.parameters.table.keys
+  end
+  # can be used in a data_frame filter.
+  # @pc.filter(row) Using a Hash as a cast type for the filter.
+  # Returns true only when every captured parameter accepts row[key]:
+  # Arrays and Ranges accept any value they include, anything else is
+  # compared with ===.  With no parameters every row passes.
+  def filter(row)
+    self.parameters.table.all? do |key, value|
+      case value
+      when Array, Range
+        value.include?(row[key])
+      else
+        value === row[key]
+      end
+    end
+  end
+  # Records +key+ the first time it is called: a single argument is
+  # stored as-is, any other number of arguments as an Array.  Later
+  # calls return the stored value and ignore their arguments.
+  def method_missing(key, *values, &block)
+    table = self.parameters.table
+    # Use the table itself rather than OpenStruct#send, which answers
+    # with OpenStruct's own method for keys such as :to_h.
+    return table[key] if table.key?(key)
+    table[key] = values.size == 1 ? values.first : values
+  end
+  # Only captured keys are advertised, so implicit conversions (to_ary,
+  # to_str, ...) are never mistaken for parameters.
+  def respond_to_missing?(key, include_private = false)
+    self.parameters.table.key?(key) || super
+  end
+end

data/lib/data_frame/sbn.rb ADDED

@@ -0,0 +1,18 @@
+module DF #:nodoc:
+  # Turns Data Frame into a feeder for Carl Youngblood's Simple Bayesian classifier.
+  # Will install it if you don't have it.
+  module SBN
+    begin
+      gem 'sbn'
+      require 'sbn'
+    # A missing gem or file raises LoadError, which is a ScriptError and
+    # therefore not caught by a bare rescue (StandardError only).
+    rescue LoadError
+      # NOTE(review): shelling out to sudo while a library loads is risky;
+      # consider declaring the gem as a regular dependency instead.
+      `sudo gem install sbn`
+      # Let this process see the gem that was just installed.
+      Gem.refresh
+      gem 'sbn'
+      require 'sbn'
+    end
+  end
+end
+# Mixes the SBN feeder module into DataFrame.
+class DataFrame
+  include DF::SBN
+end

data/lib/data_frame/transposable_array.rb ADDED

@@ -0,0 +1,23 @@
+# The only trick in this array is that its transpose is memoized until
+# it is tainted.  This should reduce computations elegantly.
+class TransposableArray < CallbackArray
+  # Drop the memoized transpose whenever the array is tainted.
+  # (after_taint is presumably the hook macro supplied by CallbackArray.)
+  after_taint :clear_cache
+  # Keep a handle on the inherited transpose so the override below can
+  # still reach it.
+  orig_transpose = instance_method(:transpose)
+  # define_method (instead of def) lets the block close over the
+  # orig_transpose local.
+  define_method(:transpose) {
+    self.untaint
+    @transpose ||= orig_transpose.bind(self).call
+  }
+  # For debugging and testing purposes, it just feels dirty to always ask
+  # for @ta.send(:instance_variable_get, :@transpose)
+  # Returns the memoized transpose, or nil when nothing is cached.
+  def cache
+    @transpose
+  end
+  # Forgets the memoized transpose so the next call recomputes it.
+  def clear_cache
+    @transpose = nil
+  end
+  protected :clear_cache
+end

data/lib/ext/array.rb ADDED

@@ -0,0 +1,11 @@
+class Array
+  # Counts how deeply the array is nested by following the first element
+  # at each level (other elements are never inspected):
+  # [1,2,3] is 1-dimensional
+  # [[1,2,3], [1,2,3]] is 2-dimensional
+  # [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3], [[1,2,3], [1,2,3]]]] is 3-dimensional
+  # So [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3], [[1,2,3], [1,2,3]]]].dimensions == 3
+  # +n+ is the depth already counted above this array; it defaults to 0.
+  def dimensions(n=0)
+    depth = n + 1
+    head = first
+    while head.is_a?(Array)
+      depth += 1
+      head = head.first
+    end
+    depth
+  end
+end

data/lib/ext/open_struct.rb ADDED

@@ -0,0 +1,5 @@
+class OpenStruct
+  # Public reader for the struct's internal @table.
+  attr_reader :table
+end

data/lib/ext/string.rb ADDED

@@ -0,0 +1,5 @@
+class String # :nodoc:
+  # Titleizes the string, strips all whitespace, underscores the result
+  # and returns it as a Symbol.  titleize and underscore are not core
+  # String methods -- presumably ActiveSupport's inflections; confirm.
+  def to_underscore_sym
+    compacted = self.titleize.gsub(/\s+/, '')
+    compacted.underscore.to_sym
+  end
+end

data/lib/ext/symbol.rb ADDED

@@ -0,0 +1,5 @@
+class Symbol # :nodoc:
+  # Same normalisation as the String version, applied to the symbol's
+  # name: titleize, strip whitespace, underscore, back to a Symbol.
+  # titleize and underscore are presumably ActiveSupport's; confirm.
+  def to_underscore_sym
+    compacted = self.to_s.titleize.gsub(/\s+/, '')
+    compacted.underscore.to_sym
+  end
+end

data/spec/data_frame/arff_spec.rb ADDED

@@ -0,0 +1,48 @@
+require File.join(File.dirname(__FILE__), "/../spec_helper")
+require 'data_frame/arff'
+# Exercises the ARFF and CSV exports against the two-row basic.csv fixture.
+describe "ARFF" do
+  before do
+    # Fresh frame per example, loaded from spec/fixtures/basic.csv.
+    @df = DataFrame.from_csv(File.expand_path(File.join(File.dirname(__FILE__), '..', 'fixtures', 'basic.csv')))
+  end
+  it "should allow a data frame to be expressed as an arff-formatted file" do
+    # basic_arff is the expected-output helper defined below the describe block.
+    @df.to_arff.should eql(basic_arff)
+  end
+  it "should add a to_csv method" do
+    # Default export: header row followed by the data rows.
+    @df.to_csv.should eql(%{x,y,month,day,ffmc,dmc,dc,isi,temp,rh,wind,rain,area
+7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
+7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
+})
+  end
+  it "should allow a non-header export for to_csv" do
+    # Passing false drops the header row.
+    @df.to_csv(false).should eql(%{7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
+7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
+})
+  end
+end
+# Expected ARFF text for the basic.csv fixture: relation name, one
+# @attribute line per column, then the @data rows.
+def basic_arff
+  <<ARFF
+@relation basic
+@attribute x {7}
+@attribute y {4,5}
+@attribute month {mar,oct}
+@attribute day {fri,tue}
+@attribute ffmc {86.2,90.6}
+@attribute dmc {26.2,35.4}
+@attribute dc {94.3,669.1}
+@attribute isi {5.1,6.7}
+@attribute temp {8.2,18}
+@attribute rh {33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51}
+@attribute wind {0.9,6.7}
+@attribute rain {0}
+@attribute area {0}
+@data
+7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
+7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
+ARFF
+end

data/spec/data_frame/callback_array_spec.rb ADDED

@@ -0,0 +1,148 @@
+require File.join(File.dirname(__FILE__), "/../spec_helper")
+# TransposableArray is a thorough test on the after_taint method.  Here
+# I only test the other callbacks.
+# Process-wide tally of how often each named callback has fired.
+class Register
+  # Bumps the counter for +meth+ (starting from 0) and returns the new count.
+  def self.next(meth)
+    tally = (@@count ||= {})
+    tally[meth] = tally.fetch(meth, 0) + 1
+  end
+  # Returns the current count for +meth+, or nil if it never fired.
+  def self.for(meth)
+    (@@count ||= {})[meth]
+  end
+end
+# CallbackArray subclass whose before_taint, before_untaint and
+# after_untaint hooks each bump the matching Register counter, so the
+# examples below can observe how often every hook fired.
+class A < CallbackArray
+  before_taint :register_before_taint
+  def register_before_taint
+    Register.next(:before_taint)
+  end
+  before_untaint :register_before_untaint
+  def register_before_untaint
+    Register.next(:before_untaint)
+  end
+  after_untaint :register_after_untaint
+  def register_after_untaint
+    Register.next(:after_untaint)
+  end
+end
+# Checks that the taint/untaint hooks fire for mutating Array methods
+# and stay quiet for read-only ones.  The Register counters are shared
+# by every example, so each example compares against a snapshot taken
+# beforehand instead of assuming an absolute value.
+describe CallbackArray do
+  before do
+    @a = A.new [1,2,3]
+  end
+  context "before_taint" do
+    before do
+      # Snapshot of the counter; 0 when the hook has never fired.
+      @c = Register.for(:before_taint) || 0
+    end
+    after do
+      # Every example in this context must fire the hook exactly once
+      # and leave the array tainted.
+      Register.for(:before_taint).should eql(@c + 1)
+      @a.should be_tainted
+    end
+    it "should callback before taint" do
+      @a.taint
+    end
+    it "should callback before :[]=" do
+      @a[0] = 2
+    end
+    it "should callback before :<<" do
+      @a << 3
+    end
+    it "should callback before :delete" do
+      @a.delete(2)
+    end
+    it "should callback before :push" do
+      @a.push(5)
+    end
+    it "should callback before :pop" do
+      @a.pop
+    end
+    it "should callback before :shift" do
+      @a.shift
+    end
+    it "should callback before :unshift" do
+      @a.unshift(6)
+    end
+    it "should callback before :map!" do
+      @a.map! {|e| e}
+    end
+    it "should callback before :sort!" do
+      @a.sort!
+    end
+    it "should callback before :reverse!" do
+      @a.reverse!
+    end
+    it "should callback before :collect!" do
+      @a.collect! {|e| e}
+    end
+    it "should callback before :compact!" do
+      @a.compact!
+    end
+    it "should callback before :reject!" do
+      @a.reject! {|e| not e}
+    end
+    it "should callback before :slice!" do
+      @a.slice!(1,2)
+    end
+    it "should callback before :flatten!" do
+      @a.flatten!
+    end
+    it "should callback before :uniq!" do
+      @a.uniq!
+    end
+    it "should callback before :clear" do
+      @a.clear
+    end
+  end
+  it "should not adjust the array in other methods" do
+    # Compare against a snapshot: the counter is nil only if no
+    # before_taint example has run yet, which depends on run order.
+    c = Register.for(:before_taint)
+    @a.at(0)
+    @a.sort
+    @a.uniq
+    @a.find{|e| e}
+    Register.for(:before_taint).should eql(c)
+    @a.should_not be_tainted
+  end
+  it "should callback before untaint" do
+    c = Register.for(:before_untaint) || 0
+    @a.taint
+    @a.untaint
+    Register.for(:before_untaint).should eql(c + 1)
+  end
+  it "should callback after untaint" do
+    c = Register.for(:after_untaint) || 0
+    @a.taint
+    @a.untaint
+    Register.for(:after_untaint).should eql(c + 1)
+  end
+end

data/spec/data_frame/core/column_management_spec.rb ADDED

@@ -0,0 +1,128 @@
+require File.join(File.dirname(__FILE__), "/../../spec_helper")
+# Specs for the column-level operations of DataFrame: append!,
+# move_to_last!, rename!, drop!, replace!, subset_from_columns and
+# duplicate!.
+describe "Column Management" do
+  before do
+    # Two rows under four labels, so every column holds two values
+    # (e.g. :these => [1, 5]).
+    @labels = [:these, :are, :the, :labels]
+    @df = DataFrame.new(*@labels)
+    @df.add [1,2,3,4]
+    @df.add [5, 6, 7, 8]
+  end
+  context "append!" do
+    it "should be able to append an array of values to the data frame" do
+      @df.append!(:new_column, [5,5])
+      @df.new_column.should eql([5,5])
+    end
+    it "should be able to append a default value to the data frame" do
+      # A non-array value is repeated once per existing row.
+      @df.append!(:new_column, :value)
+      @df.new_column.should eql([:value, :value])
+    end
+    it "should use nil as the default value" do
+      @df.append!(:new_column)
+      @df.new_column.should eql([nil, nil])
+    end
+  end
+  context "move_to_last!" do
+    it "should be able to move a column to the end of the data frame, useful for dependency models" do
+      @df.labels.should eql(@labels)
+      @df.move_to_last!(:these)
+      @df.labels.should eql([:are, :the, :labels, :these])
+      # The values travel with the label.
+      @df.these.should eql([1,5])
+    end
+  end
+  context "rename!" do
+    # Argument order is (new_name, old_name).
+    it "should be able to rename a column" do
+      @df.rename!(:new_name, :these)
+      @df.labels.should eql([:new_name, :are, :the, :labels])
+    end
+    it "should be able to use the new column name with dot notation" do
+      v = @df.these.dup
+      @df.rename!(:new_name, :these)
+      @df.new_name.should eql(v)
+    end
+  end
+  context "drop!" do
+    it "should be able to remove a column" do
+      @df = DataFrame.new :twos, :threes, :fours
+      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
+      @df.drop!(:twos)
+      # all? is used only to visit each row; a failing should raises.
+      @df.items.all? {|i| i.should eql([3,4])}
+      @df.labels.should eql([:threes, :fours])
+    end
+    it "should be able to remove more than one column at a time" do
+      @df = DataFrame.new :twos, :threes, :fours
+      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
+      @df.drop!(:twos, :fours)
+      @df.items.all? {|i| i.should eql([3])}
+      @df.labels.should eql([:threes])
+    end
+  end
+  context "replace!" do
+    before do
+      @doubler = lambda{|e| e * 2}
+    end
+    it "should only replace columns that actually exist" do
+      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
+        ArgumentError, /Must provide the name of an existing column./)
+      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
+    end
+    it "should be able to replace a column with a block" do
+      # The block is applied to each value of the column.
+      @df.replace!(:these) {|e| e * 2}
+      @df.these.should eql([2,10])
+    end
+    it "should be able to replace a column with an array" do
+      @a = [5,9]
+      @df.replace!(:these, @a)
+      @df.these.should eql(@a)
+    end
+  end
+  context "subset_from_columns" do
+    it "should be able to create a subset of columns" do
+      # The subset is a new frame holding only the requested columns.
+      new_data_frame = @df.subset_from_columns(:these, :labels)
+      new_data_frame.should_not eql(@df)
+      new_data_frame.labels.should eql([:these, :labels])
+      new_data_frame.items.should eql([[1,4],[5,8]])
+      new_data_frame.these.should eql([1,5])
+    end
+  end
+  context "duplicate!" do
+    it "should be able to duplicate a column" do
+      # The copy is named after the source with a numeric suffix.
+      @df.duplicate!(:these)
+      @df.these1.should eql(@df.these)
+    end
+    it "should use unique names for the duplicate column" do
+      @df.duplicate!(:these)
+      @df.duplicate!(:these)
+      @df.duplicate!(:these)
+      @df.these3.should eql(@df.these2)
+      @df.these2.should eql(@df.these1)
+      @df.these1.should eql(@df.these)
+    end
+    it "should reset the labels list when a column is duplicated" do
+      @df.duplicate!(:these)
+      @df.labels.should be_include(:these1)
+    end
+    it "should return true, rather than the whole data set" do
+      @df.duplicate!(:these).should eql(true)
+    end
+  end
+end