RubyGems - davidrichards-data_frame - Versions diffs - 0.0.18 → 0.0.19 - Mend

davidrichards-data_frame 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

data/README.rdoc +16 -0
data/VERSION.yml +1 -1
data/bin/plain_frame +22 -0
data/lib/data_frame.rb +2 -1
data/lib/data_frame/arff.rb +43 -36
data/lib/data_frame/core/column_management.rb +102 -0
data/lib/data_frame/core/filter.rb +48 -0
data/lib/data_frame/core/import.rb +112 -0
data/lib/data_frame/core/pre_process.rb +61 -0
data/lib/data_frame/core/saving.rb +29 -0
data/lib/data_frame/core/training.rb +36 -0
data/lib/data_frame/data_frame.rb +37 -241
data/lib/data_frame/id3.rb +28 -0
data/lib/data_frame/kmeans.rb +10 -0
data/lib/data_frame/labels_from_uci.rb +48 -0
data/lib/data_frame/mlp.rb +18 -0
data/lib/data_frame/sbn.rb +18 -0
data/lib/data_frame/transposable_array.rb +1 -1
data/lib/ext/array.rb +11 -0
data/spec/data_frame/arff_spec.rb +1 -0
data/spec/data_frame/core/column_management_spec.rb +97 -0
data/spec/data_frame/core/filter_spec.rb +88 -0
data/spec/data_frame/core/import_spec.rb +41 -0
data/spec/data_frame/core/pre_process_spec.rb +71 -0
data/spec/data_frame/core/saving_spec.rb +61 -0
data/spec/data_frame/core/training_spec.rb +51 -0
data/spec/data_frame/data_frame_spec.rb +10 -226
data/spec/data_frame/id3_spec.rb +22 -0
data/spec/ext/array_spec.rb +13 -0
data/spec/fixtures/discrete_testing.csv +4 -0
data/spec/fixtures/discrete_training.csv +21 -0
metadata +33 -6

data/lib/data_frame/mlp.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module DF #:nodoc:
+  # Turns Data Frame into a feeder for Red Davis' MLP classifier.
+  # Will install it if you don't have it.
+  module MLP
+    begin
+      gem 'reddavis-mlp'
+      require 'mlp'
+    rescue
+      `sudo gem install reddavis-mlp`
+      gem 'reddavis-mlp'
+      require 'mlp'
+    end
+  end
+end
+class DataFrame
+  include DF::MLP
+end

data/lib/data_frame/sbn.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module DF #:nodoc:
+  # Turns Data Frame into a feeder for Carl Youngblood's Simple Bayesian classifier.
+  # Will install it if you don't have it.
+  module SBN
+    begin
+      gem 'sbn'
+      require 'sbn'
+    rescue
+      `sudo gem install sbn`
+      gem 'sbn'
+      require 'sbn'
+    end
+  end
+end
+class DataFrame
+  include DF::SBN
+end

data/lib/data_frame/transposable_array.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # The only trick in this array is that its transpose is memoized until
-# it is tainted.  This will reduce computations elegantly.
+# it is tainted.  This should reduce computations elegantly.
 class TransposableArray < CallbackArray
   after_taint :clear_cache

data/lib/ext/array.rb ADDED Viewed

@@ -0,0 +1,11 @@
+class Array
+  # Defines the number of dimensions:
+  # [1,2,3] is 1-dimensional
+  # [[1,2,3], [1,2,3]] is 2-dimensional
+  # [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3], [[1,2,3], [1,2,3]]]] is 3-dimensional
+  # So [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3], [[1,2,3], [1,2,3]]]].dimensions == 3
+  def dimensions(n=0)
+    n += 1
+    self.first.is_a?(Array) ? self.first.dimensions(n) : n
+  end
+end

data/spec/data_frame/arff_spec.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 require File.join(File.dirname(__FILE__), "/../spec_helper")
+require 'data_frame/arff'
 describe "ARFF" do
   before do

data/spec/data_frame/core/column_management_spec.rb ADDED Viewed

@@ -0,0 +1,97 @@
+require File.join(File.dirname(__FILE__), "/../../spec_helper")
+describe "Column Management" do # specs for append!, move_to_last!, rename!, drop!, replace! and subset_from_columns
+  before do # each example starts from a frame with four labels and the rows [1,2,3,4] and [5,6,7,8]
+    @labels = [:these, :are, :the, :labels]
+    @df = DataFrame.new(*@labels)
+    @df.add [1,2,3,4]
+    @df.add [5, 6, 7, 8]
+  end
+  context "append!" do # adds a new column, read back through an accessor named after it
+    it "should be able to append an array of values to the data frame" do
+      @df.append!(:new_column, [5,5]) # one value per existing row
+      @df.new_column.should eql([5,5])
+    end
+    it "should be able to append a default value to the data frame" do
+      @df.append!(:new_column, :value) # a single value is expected to be repeated for every row
+      @df.new_column.should eql([:value, :value])
+    end
+    it "should use nil as the default value" do
+      @df.append!(:new_column)
+      @df.new_column.should eql([nil, nil])
+    end
+  end
+  context "move_to_last!" do
+    it "should be able to move a column to the end of the data frame, useful for dependency models" do
+      @df.labels.should eql(@labels) # sanity check of the starting label order
+      @df.move_to_last!(:these)
+      @df.labels.should eql([:are, :the, :labels, :these])
+      @df.these.should eql([1,5]) # the column's values are expected to travel with its label
+    end
+  end
+  context "rename!" do
+    it "should be able to rename a column" do
+      @df.rename!(:new_name, :these) # argument order as used here: new name first, existing name second
+      @df.labels.should eql([:new_name, :are, :the, :labels])
+    end
+  end
+  context "drop!" do # these examples replace @df with a three-column frame of identical rows
+    it "should be able to remove a column" do
+      @df = DataFrame.new :twos, :threes, :fours
+      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
+      @df.drop!(:twos)
+      @df.items.all? {|i| i.should eql([3,4])} # the expectation is applied to each row inside the block
+      @df.labels.should eql([:threes, :fours])
+    end
+    it "should be able to remove more than one column at a time" do
+      @df = DataFrame.new :twos, :threes, :fours
+      @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
+      @df.drop!(:twos, :fours)
+      @df.items.all? {|i| i.should eql([3])} # the expectation is applied to each row inside the block
+      @df.labels.should eql([:threes])
+    end
+  end
+  context "replace!" do # accepts either a block or an array of replacement values
+    before do
+      @doubler = lambda{|e| e * 2} # passed to replace! as a block via &@doubler below
+    end
+    it "should only replace columns that actually exist" do
+      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
+        ArgumentError, /Must provide the name of an existing column./)
+      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
+    end
+    it "should be able to replace a column with a block" do
+      @df.replace!(:these) {|e| e * 2} # the column starts as [1,5]
+      @df.these.should eql([2,10])
+    end
+    it "should be able to replace a column with an array" do
+      @a = [5,9]
+      @df.replace!(:these, @a)
+      @df.these.should eql(@a)
+    end
+  end
+  context "subset_from_columns" do
+    it "should be able to create a subset of columns" do
+      new_data_frame = @df.subset_from_columns(:these, :labels)
+      new_data_frame.should_not eql(@df) # the subset is expected to be a different frame than the source
+      new_data_frame.labels.should eql([:these, :labels])
+      new_data_frame.items.should eql([[1,4],[5,8]]) # first and last values of each original row
+      new_data_frame.these.should eql([1,5])
+    end
+  end
+end

data/spec/data_frame/core/filter_spec.rb ADDED Viewed

@@ -0,0 +1,88 @@
+require File.join(File.dirname(__FILE__), "/../../spec_helper")
+describe "Filter" do # specs for filter! and filter, using different row representations inside the block
+  before do # two rows; every example below keeps only the second one
+    @labels = [:these, :are, :the, :labels]
+    @df = DataFrame.new(*@labels)
+    @df.add [1,2,3,4]
+    @df.add [5, 6, 7, 8]
+  end
+  it "should be able to filter a data frame with a block using an OpenStruct for each row" do
+    @df.filter!(:open_struct) {|row| row.these == 5} # row values are read through label-named methods
+    @df.items.should eql([[5, 6, 7, 8]])
+  end
+  it "should be able to filter a data frame with a block using a Hash for each row" do
+    @df.filter!(:hash) {|row| row[:these] == 5} # row values are read by label key
+    @df.items.should eql([[5, 6, 7, 8]])
+  end
+  S4 = Struct.new(:one, :two, :three, :four) # four-member Struct handed to filter! as the row class in the next example
+  it "should be able to filter a data frame with a block using another class that uses the row as input" do
+    @df.filter!(S4) {|row| row.one == 5}
+    @df.items.should eql([[5, 6, 7, 8]])
+  end
+  it "should be able to filter a data frame with a block using an array for each row" do
+    @df.filter! {|row| row.first == 5} # no argument: the block is used with array methods on the row
+    @df.items.should eql([[5, 6, 7, 8]])
+  end
+  it "should be able to do fancy things with the row as the filter" do
+    @df.filter! {|row| row.sum > 10} # the rows sum to 10 and 26, so only the second passes
+    @df.items.should eql([[5, 6, 7, 8]])
+  end
+  it "should be able to generate a new data frame with filter" do
+    new_df = @df.filter(:open_struct) {|row| row.these == 5} # non-bang form returns the filtered frame
+    new_df.items.should eql([[5, 6, 7, 8]])
+    @df.items.should eql([[1, 2, 3, 4], [5, 6, 7, 8]]) # the original frame is expected to be untouched
+  end
+end
+context "filter_by_category" do
+  before do
+    @df = DataFrame.new(:weather, :date)
+    (1..31).each do |i|
+      @df.add [(i % 3 == 1) ? :fair : :good, Date.parse("07/#{i}/2009")]
+    end
+    @d1 = Date.parse("07/15/2009")
+    @d2 = Date.parse("07/31/2009")
+  end
+  it "should be able to filter by category" do
+    filtered = @df.filter_by_category(:weather => :good)
+    filtered.weather.uniq.should eql([:good])
+    @df.weather.uniq.should be_include(:fair)
+  end
+  it "should be able to manage ranges for filter values" do
+    filtered = @df.filter_by_category(:date => (@d1..@d2))
+    filtered.date.should_not be_include(Date.parse("07/01/2009"))
+    filtered.date.should_not be_include(Date.parse("07/14/2009"))
+    filtered.date.should be_include(Date.parse("07/15/2009"))
+    filtered.date.should be_include(Date.parse("07/31/2009"))
+    @df.date.should be_include(Date.parse("07/01/2009"))
+  end
+  it "should be able to take an array of values to filter with" do
+    filtered = @df.filter_by_category(:date => [@d1, @d2])
+    filtered.date.should_not be_include(Date.parse("07/01/2009"))
+    filtered.date.should be_include(Date.parse("07/15/2009"))
+    filtered.date.should be_include(Date.parse("07/31/2009"))
+  end
+  it "should have a destructive version" do
+    @df.filter_by_category!(:date => [@d1, @d2])
+    @df.date.should_not be_include(Date.parse("07/01/2009"))
+    @df.date.should be_include(Date.parse("07/15/2009"))
+    @df.date.should be_include(Date.parse("07/31/2009"))
+  end
+end

data/spec/data_frame/core/import_spec.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require File.join(File.dirname(__FILE__), "/../../spec_helper")
+describe "Import" do # specs for add_item and import (nested arrays, a single row, CSV text)
+  before do # an empty frame with four labels
+    @labels = [:these, :are, :the, :labels]
+    @df = DataFrame.new(*@labels)
+  end
+  it "should be able to add an item" do
+    item = [1,2,3,4]
+    @df.add_item(item)
+    @df.items.should eql([item])
+  end
+  it "should be able to import more than one row at a time" do
+    @df.import([[2,2,2,2],[3,3,3,3],[4,4,4,4]])
+    @df.row_labels = [:twos, :threes, :fours] # once rows are named, each row is read back by its label below
+    @df.twos.should eql([2,2,2,2])
+    @df.threes.should eql([3,3,3,3])
+    @df.fours.should eql([4,4,4,4])
+  end
+  it "should be able to import only one row" do
+    @df.import([2,2,2,2]) # a flat array with one value per label is expected to become a single row
+    @df.these.should eql([2])
+  end
+  it "should be able to import a reference to csv" do
+    contents = %{7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
+7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
+}
+    @labels = [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area] # thirteen labels, one per CSV field
+    @df = DataFrame.new(@labels) # NOTE(review): labels are passed as one array here, but splatted in the before block -- confirm both forms are supported
+    @df.import(contents)
+    @df.x.should eql([7,7]) # values are expected back as Integers, not Strings
+    @df.area.should eql([0,0])
+  end
+end

data/spec/data_frame/core/pre_process_spec.rb ADDED Viewed

@@ -0,0 +1,71 @@
+require File.join(File.dirname(__FILE__), "/../../spec_helper")
+describe "PreProcess" do # specs for j_binary_ize! (one boolean column per category) and numericize!
+  it "should be able to j_binary_ize! a column, taking its categories and creating a column for each" do
+    df = DataFrame.new(:observations)
+    df.add [:many]
+    df.add [:fine]
+    df.add [:things]
+    df.add [:are]
+    df.add [:available]
+    df.j_binary_ize!(:observations) # no categories were defined, so each distinct value gets its own column below
+    df.observations_many.should eql([true, false, false, false, false])
+    df.observations_fine.should eql([false, true, false, false, false])
+    df.observations_things.should eql([false, false, true, false, false])
+    df.observations_are.should eql([false, false, false, true, false])
+    df.observations_available.should eql([false, false, false, false, true])
+    df.observations.should eql([:many, :fine, :things, :are, :available]) # the source column is expected to be left in place
+  end
+  it "should be able to j_binary_ize! a more normal column" do
+    df = DataFrame.new(:observations)
+    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)}) # each value wrapped in its own one-element row
+    df.observations.add_category(:small) {|e| e <= 3}
+    df.observations.add_category(:large) {|e| e >= 3} # overlaps :small at the value 3
+    df.j_binary_ize!(:observations)
+    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
+    df.observations_large.should eql([false, false, false, true, true, true, false, false, false]) # 3 is expected to count as :small only
+  end
+  it "should be able to j_binary_ize with non-adjacent sets (sets that allow a value to have more than one category)" do
+    df = DataFrame.new(:observations)
+    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
+    df.observations.add_category(:small) {|e| e <= 3}
+    df.observations.add_category(:large) {|e| e >= 3}
+    df.j_binary_ize!(:observations, :allow_overlap => true)
+    df.observations_small.should eql([true, true, true, false, false, false, true, true, true])
+    df.observations_large.should eql([false, false, true, true, true, true, true, false, false]) # with overlap allowed, 3 is in both categories
+  end
+  it "should be able to hold multiple ideas of a columns categories by resetting the category and re-running j_binary_ize" do
+    df = DataFrame.new(:observations)
+    df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
+    df.observations.add_category(:small) {|e| e <= 3}
+    df.observations.add_category(:large) {|e| e >= 3}
+    df.j_binary_ize!(:observations, :allow_overlap => true)
+    df.observations.set_categories(:odd => lambda{|e| e.odd?}, :even => lambda{|e| e.even?}) # swap in a second category scheme
+    df.j_binary_ize!(:observations)
+    df.observations_small.should eql([true, true, true, false, false, false, true, true, true]) # columns from the first run are expected to survive
+    df.observations_large.should eql([false, false, true, true, true, true, true, false, false])
+    df.observations.should eql([1,2,3,4,5,4,3,2,1])
+    df.observations_even.should eql([false, true, false, true, false, true, false, true, false])
+    df.observations_odd.should eql([true, false, true, false, true, false, true, false, true])
+  end
+  context "numericize!" do
+    before do # here the two categories do not overlap: <= 3 versus > 3
+      @df = DataFrame.new(:observations)
+      @df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
+      @df.observations.add_category(:small) {|e| e <= 3}
+      @df.observations.add_category(:large) {|e| e > 3}
+    end
+    it "should be able to numericize nominal data" do
+      @df.numericize!(:observations)
+      @df.numerical_observations.should eql([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[1,0],[1,0],[1,0]]) # one [small, large] pair of 0/1 flags per row
+    end
+  end
+end

data/spec/data_frame/core/saving_spec.rb ADDED Viewed

@@ -0,0 +1,61 @@
+require File.join(File.dirname(__FILE__), "/../../spec_helper")
+describe "Saving" do # specs for save: writes CSV to a file, with options for header, column subset and row filter
+  before do # a one-column frame with :small (<= 3) and :large (> 3) categories
+    @df = DataFrame.new(:observations)
+    @df.import([1,2,3,4,5,4,3,2,1].map{|e| Array(e)})
+    @df.observations.add_category(:small) {|e| e <= 3}
+    @df.observations.add_category(:large) {|e| e > 3}
+    @filename = "/tmp/numericized_observations" # fixed path shared by every example
+  end
+  after do # remove the output file so examples do not see each other's results
+    `rm -rf #{@filename}` # NOTE(review): shells out to rm, so this assumes a Unix-like environment
+  end
+  it "should be able to save the data frame" do
+    @df.numericize!(:observations)
+    @df.save(@filename)
+    File.read(@filename).should eql(@df.to_csv) # the file content is compared with to_csv output
+  end
+  it "should be able to save the data frame without the header" do
+    @df.save(@filename, :include_header => false)
+    File.read(@filename).should eql(@df.to_csv(false)) # to_csv(false) is used as the header-less reference
+  end
+  it "should be able to save off a subset" do
+    @df = DataFrame.new(:observations, :junk) # two identical columns; only :observations should be written
+    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
+    @df.save(@filename, :subset => :observations)
+    File.read(@filename).should eql(@df.subset_from_columns(:observations).to_csv)
+  end
+  it "should be able to filter the rows" do
+    @df = DataFrame.new(:observations, :junk)
+    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
+    @df.save(@filename, :subset => :observations) # NOTE(review): this save is overwritten by the one below before anything is read
+    @df.observations.add_category(:small) {|e| e <= 3}
+    @df.observations.add_category(:large) {|e| e > 3}
+    @df.save(@filename, :filter_by_category => {:observations => :small})
+    File.read(@filename).should eql(@df.filter_by_category(:observations => :small).to_csv)
+  end
+  it "should have a shortcut for subset, only" do
+    @df = DataFrame.new(:observations, :junk)
+    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
+    @df.save(@filename, :only => :observations) # :only is expected to behave like :subset
+    File.read(@filename).should eql(@df.subset_from_columns(:observations).to_csv)
+  end
+  it "should have a shortcut for filter_by_category, filter" do
+    @df = DataFrame.new(:observations, :junk)
+    @df.import( [1,2,3,4,5,4,3,2,1].map{ |e| [e,e] } )
+    @df.save(@filename, :subset => :observations) # NOTE(review): this save is overwritten by the one below before anything is read
+    @df.observations.add_category(:small) {|e| e <= 3}
+    @df.observations.add_category(:large) {|e| e > 3}
+    @df.save(@filename, :filter => {:observations => :small}) # :filter is expected to behave like :filter_by_category
+    File.read(@filename).should eql(@df.filter_by_category(:observations => :small).to_csv)
+  end
+end

data/spec/data_frame/core/training_spec.rb ADDED Viewed

@@ -0,0 +1,51 @@
+require File.join(File.dirname(__FILE__), "/../../spec_helper")
+describe "Training" do # specs for training_set: sizing by :n or :proportion, caching, and :reset
+  before do
+    @df = DataFrame.new(:one)
+    @df.import((0...100).to_a) # 100 values, 0 through 99; later examples expect 100 items
+  end
+  it "should be able to create a proportional training set from a data frame" do
+    @df.training_set(:n => 3) # builds the set; the argument-less calls below read it back
+    @df.training_set.size.should eql(3)
+    @df.training_set.all? {|e| @df.items.should be_include(e)} # every member must come from the frame's items
+  end
+  it "should use the same training set unless reset is passed to it" do
+    @df.training_set(:n => 5)
+    @df.training_set.should eql(@df.training_set)
+    old = @df.training_set
+    @df.training_set(:reset => true, :n => 5)
+    @df.training_set.should_not eql(old) # NOTE(review): presumably sampled at random, so an identical redraw would fail this -- confirm
+  end
+  it "should be able to create a proportional training set" do
+    @df.training_set(:proportion => 0.6)
+    @df.training_set.size.should eql(60)
+    @df.training_set(:proportion => 0.42, :reset => true) # :reset is passed on every later call to replace the existing set
+    @df.training_set.size.should eql(42)
+    @df.training_set(:proportion => 0, :reset => true)
+    @df.training_set.size.should eql(0)
+    @df.training_set(:proportion => 1, :reset => true)
+    @df.training_set.size.should eql(100)
+  end
+  it "should not have a set size exceeding the items size" do
+    @df.training_set(:proportion => 2) # oversized requests are expected to be capped at the item count
+    @df.training_set.size.should eql(100)
+    @df.training_set(:n => 200, :reset => true)
+    @df.training_set.size.should eql(100)
+  end
+  it "should not have any items when the proportion is calculated below 0" do
+    @df.training_set(:proportion => -2) # negative requests are expected to yield an empty set
+    @df.training_set.size.should eql(0)
+    @df.training_set(:n => -2, :reset => true)
+    @df.training_set.size.should eql(0)
+  end
+  it "should have a default proportion of 80%" do
+    @df.training_set.size.should eql(80) # no options given: 80 of the 100 items
+  end
+end