RubyGems - davidrichards-data_frame - Versions diffs - 0.0.12 → 0.0.13 - Mend

davidrichards-data_frame 0.0.12 → 0.0.13

Files changed (6) hide show

data/README.rdoc CHANGED

@@ -48,6 +48,33 @@ To get your feet wet, you may want to play with data sets found here:
   http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html
+== Transformations
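+Much of the work done with a data frame is transforming the underlying table.  You may need to drop columns, filter rows, replace the values in a column, or create a new data frame from some of the existing columns.  The methods ending in ! change the data frame in place and return it; subset_from_columns returns a new data frame.  Here's how to do each of those: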
+  > df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
+  > df.drop!(:ffmc)
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
+  > df.drop!(:dmc, :dc, :isi, :rh)
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
+  > df.x
+  # => [7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6,...]
+  > df.replace!(:x) {|e| e * 3}
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
+  > df.x
+  # => [21, 21, 21, 24, 24, 24, 24, 24, 24, 21, 21, 21, 18, 18, 18,...]
+  > df.filter!(:open_struct) {|row| row.x == 24}
+  # => DataFrame rows: 61 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
+  > df.x
+  # => [24, 24, 24, 24, 24, 24, 24, 24, 24,...]
+  > new_data_frame = df.subset_from_columns(:x, :y)
+  # => DataFrame rows: 61 labels: [:x, :y]
+  > new_data_frame.items
+  # => [[24, 6], [24, 6], [24, 6], [24, 6], ...]
+Note: most of these transformations are not optimized.  I'll use the library for a while before I try to optimize it.  That said, I've used it with some fairly large data sets (thousands of rows) and it has worked fine so far.
 ==Installation

data/VERSION.yml CHANGED

@@ -1,4 +1,4 @@
 ---
 :major: 0
 :minor: 0
-:patch: 12
+:patch: 13

data/lib/data_frame.rb CHANGED

@@ -3,6 +3,7 @@ require 'activesupport'
 require 'just_enumerable_stats'
 require 'open-uri'
 require 'fastercsv'
+require 'ostruct'
 # Use a Dictionary if available
 begin
@@ -57,6 +58,9 @@ class DataFrame
         return nil unless contents
         table = FCSV.parse(contents, default_csv_opts.merge(opts))
         labels = table.shift
+        while table.last.empty?
+          table.pop
+        end
         [labels, table]
       end
@@ -71,6 +75,10 @@ class DataFrame
     end
   end
+  def inspect
+    "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
+  end
   # The labels of the data items
   attr_reader :labels
   alias :variables :labels
@@ -142,14 +150,108 @@ class DataFrame
     end
   end
-  def drop!(label)
+  def drop!(*labels)
+    labels.each do |label|
+      drop_one!(label)
+    end
+    self
+  end
+  def drop_one!(label)
     i = self.labels.index(label)
     return nil unless i
     self.items.each do |item|
       item.delete_at(i)
     end
     self.labels.delete_at(i)
-    true
+    self
+  end
+  protected :drop_one!
+  def replace!(column, values=nil, &block)
+    column = validate_column(column)
+    if not values
+      values = self.send(column)
+      values.map! {|e| block.call(e)}
+    end
+    replace_column(column, values)
+    self
+  end
+  def replace_column(column, values)
+    column = validate_column(column)
+    index = self.labels.index(column)
+    list = []
+    self.items.each_with_index do |item, i|
+      consolidated = item
+      consolidated[index] = values[i]
+      list << consolidated
+    end
+    @items = list.dup
+  end
+  protected :replace_column
+  def validate_column(column)
+    column = column.to_sym
+    raise ArgumentError, "Must provide the name of an existing column.  Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
+    column
+  end
+  protected :validate_column
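+  # Evaluates the block on each row and keeps only the rows for which it
+  # returns a true value, changing this data frame in place.  Each row is
+  # passed to the block as an Array by default; it can instead be converted
+  # into an OpenStruct or a Hash for easier filter methods by passing the
+  # class, or a symbol or string naming it (e.g. :open_struct).  Note, don't
+  # try this with a hash or open struct unless you have facets available.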
+  def filter!(as=Array, &block)
+    as = infer_class(as)
+    items = []
+    self.items.each do |row|
+      value = block.call(cast_row(row, as))
+      items << row if value
+    end
+    @items = items.dup
+    self
+  end
+  def infer_class(obj)
+    obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
+    obj = obj.classify.constantize if obj.is_a?(String)
+    obj
+  end
+  protected :infer_class
+  def cast_row(row, as)
+    if as == Hash
+      obj = {}
+      self.labels.each_with_index do |label, i|
+        obj[label] = row[i]
+      end
+      obj
+    elsif as == OpenStruct
+      obj = OpenStruct.new
+      self.labels.each_with_index do |label, i|
+        obj.table[label] = row[i]
+      end
+      obj
+    elsif as == Array
+      row
+    else
+      as.new(row)
+    end
+  end
+  protected :cast_row
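+  # Creates a new data frame containing only the specified columns.  The columns keep the order they have in this data frame, regardless of the order in which they are listed.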
+  def subset_from_columns(*cols)
+    new_labels = self.labels.inject([]) do |list, label|
+      list << label if cols.include?(label)
+      list
+    end
+    new_data_frame = DataFrame.new(*self.labels)
+    new_data_frame.import(self.items)
+    self.labels.each do |label|
+      new_data_frame.drop!(label) unless new_labels.include?(label)
+    end
+    new_data_frame
   end
 end

data/lib/ext/open_struct.rb ADDED

@@ -0,0 +1,5 @@
+class OpenStruct
+  def table
+    @table
+  end
+end

data/spec/data_frame_spec.rb CHANGED

@@ -101,6 +101,14 @@ describe DataFrame do
     @df.labels.should eql([:threes, :fours])
   end
+  it "should be able to remove more than one column at a time" do
+    @df = DataFrame.new :twos, :threes, :fours
+    @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
+    @df.drop!(:twos, :fours)
+    @df.items.all? {|i| i.should eql([3])}
+    @df.labels.should eql([:threes])
+  end
   it "should offer a hash-like structure of columns" do
     @df.add [1,2,3,4]
     @df.add [5, 6, 7, 8]
@@ -131,4 +139,56 @@ describe DataFrame do
     @df.labels.should eql(@labels)
     @df.variables.should eql(@labels)
   end
+  context "replace!" do
+    before do
+      @df.add [1,2,3,4]
+      @df.add [5, 6, 7, 8]
+      @doubler = lambda{|e| e * 2}
+    end
+    it "should only replace columns that actually exist" do
+      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
+        ArgumentError, /Must provide the name of an existing column./)
+      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
+    end
+    it "should be able to replace a column with a block" do
+      @df.replace!(:these) {|e| e * 2}
+      @df.these.should eql([2,10])
+    end
+    it "should be able to replace a column with an array" do
+      @a = [5,9]
+      @df.replace!(:these, @a)
+      @df.these.should eql(@a)
+    end
+  end
+  context "filter!" do
+    before do
+      @df.add [1,2,3,4]
+      @df.add [5, 6, 7, 8]
+    end
+    it "should be able to filter a data frame with a block" do
+      @df.filter!(:open_struct) {|row| row.these == 5}
+      @df.items.should eql([[5, 6, 7, 8]])
+    end
+  end
+  context "subset_from_columns" do
+    before do
+      @df.add [1,2,3,4]
+      @df.add [5, 6, 7, 8]
+    end
+    it "should be able to create a subset of columns" do
+      new_data_frame = @df.subset_from_columns(:these, :labels)
+      new_data_frame.should_not eql(@df)
+      new_data_frame.labels.should eql([:these, :labels])
+      new_data_frame.items.should eql([[1,4],[5,8]])
+      new_data_frame.these.should eql([1,5])
+    end
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: davidrichards-data_frame
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
 platform: ruby
 authors:
 - David Richards
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-08-11 00:00:00 -07:00
+date: 2009-08-16 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -58,6 +58,7 @@ files:
 - lib/data_frame/transposable_array.rb
 - lib/data_frame.rb
 - lib/ext
+- lib/ext/open_struct.rb
 - lib/ext/string.rb
 - lib/ext/symbol.rb
 - spec/data_frame