RubyGems - davidrichards-data_frame - Versions diffs - 0.0.12 → 0.0.13 - Mend

davidrichards-data_frame 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/README.rdoc CHANGED

@@ -48,6 +48,33 @@ To get your feet wet, you may want to play with data sets found here:
   http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html
+== Transformations
+Much of the work with a data frame is transforming the underlying table.  You may need to drop columns, filter rows, replace the values in a column, or create a new data frame from a subset of the existing columns.  Here's how to do each of those:
+  > df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
+  > df.drop!(:ffmc)
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
+  > df.drop!(:dmc, :dc, :isi, :rh)
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
+  > df.x
+  # => [7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6,...]
+  > df.replace!(:x) {|e| e * 3}
+  # => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
+  > df.x
+  # => [21, 21, 21, 24, 24, 24, 24, 24, 24, 21, 21, 21, 18, 18, 18,...]
+  > df.filter!(:open_struct) {|row| row.x == 24}
+  # => DataFrame rows: 61 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
+  > df.x
+  # => [24, 24, 24, 24, 24, 24, 24, 24, 24,...]
+  > new_data_frame = df.subset_from_columns(:x, :y)
+  # => DataFrame rows: 61 labels: [:x, :y]
+  > new_data_frame.items
+  # => [[24, 6], [24, 6], [24, 6], [24, 6], ...]
+Note: most of these transformations are not optimized.  I plan to use the library for a while before I try to optimize it.  That said, I have used it with some fairly large data sets (thousands of rows) without problems so far.
 ==Installation

data/VERSION.yml CHANGED

@@ -1,4 +1,4 @@
 ---
 :major: 0
 :minor: 0
-:patch: 12
+:patch: 13

data/lib/data_frame.rb CHANGED

@@ -3,6 +3,7 @@ require 'activesupport'
 require 'just_enumerable_stats'
 require 'open-uri'
 require 'fastercsv'
+require 'ostruct'
 # Use a Dictionary if available
 begin
@@ -57,6 +58,9 @@ class DataFrame
         return nil unless contents
         table = FCSV.parse(contents, default_csv_opts.merge(opts))
         labels = table.shift
+        while table.last.empty?
+          table.pop
+        end
         [labels, table]
       end
@@ -71,6 +75,10 @@ class DataFrame
     end
   end
+  def inspect
+    "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
+  end
   # The labels of the data items
   attr_reader :labels
   alias :variables :labels
@@ -142,14 +150,108 @@ class DataFrame
     end
   end
-  def drop!(label)
+  def drop!(*labels)
+    labels.each do |label|
+      drop_one!(label)
+    end
+    self
+  end
+  def drop_one!(label)
     i = self.labels.index(label)
     return nil unless i
     self.items.each do |item|
       item.delete_at(i)
     end
     self.labels.delete_at(i)
-    true
+    self
+  end
+  protected :drop_one!
+  def replace!(column, values=nil, &block)
+    column = validate_column(column)
+    if not values
+      values = self.send(column)
+      values.map! {|e| block.call(e)}
+    end
+    replace_column(column, values)
+    self
+  end
+  def replace_column(column, values)
+    column = validate_column(column)
+    index = self.labels.index(column)
+    list = []
+    self.items.each_with_index do |item, i|
+      consolidated = item
+      consolidated[index] = values[i]
+      list << consolidated
+    end
+    @items = list.dup
+  end
+  protected :replace_column
+  def validate_column(column)
+    column = column.to_sym
+    raise ArgumentError, "Must provide the name of an existing column.  Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
+    column
+  end
+  protected :validate_column
+  # Keeps only the rows for which the block returns a true value.  Each row is
+  # passed to the block as an Array (default), a Hash or an OpenStruct keyed by
+  # label.  Note: don't use a hash or open struct unless you have facets available.
+  def filter!(as=Array, &block)
+    as = infer_class(as)
+    items = []
+    self.items.each do |row|
+      value = block.call(cast_row(row, as))
+      items << row if value
+    end
+    @items = items.dup
+    self
+  end
+  def infer_class(obj)
+    obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
+    obj = obj.classify.constantize if obj.is_a?(String)
+    obj
+  end
+  protected :infer_class
+  def cast_row(row, as)
+    if as == Hash
+      obj = {}
+      self.labels.each_with_index do |label, i|
+        obj[label] = row[i]
+      end
+      obj
+    elsif as == OpenStruct
+      obj = OpenStruct.new
+      self.labels.each_with_index do |label, i|
+        obj.table[label] = row[i]
+      end
+      obj
+    elsif as == Array
+      row
+    else
+      as.new(row)
+    end
+  end
+  protected :cast_row
+  # Creates a new data frame containing only the specified columns (kept in their original order).
+  def subset_from_columns(*cols)
+    new_labels = self.labels.inject([]) do |list, label|
+      list << label if cols.include?(label)
+      list
+    end
+    new_data_frame = DataFrame.new(*self.labels)
+    new_data_frame.import(self.items)
+    self.labels.each do |label|
+      new_data_frame.drop!(label) unless new_labels.include?(label)
+    end
+    new_data_frame
   end
 end

data/lib/ext/open_struct.rb ADDED

@@ -0,0 +1,5 @@
+class OpenStruct
+  def table
+    @table
+  end
+end

data/spec/data_frame_spec.rb CHANGED

@@ -101,6 +101,14 @@ describe DataFrame do
     @df.labels.should eql([:threes, :fours])
   end
+  it "should be able to remove more than one column at a time" do
+    @df = DataFrame.new :twos, :threes, :fours
+    @df.import([[2,3,4], [2,3,4], [2,3,4], [2,3,4]])
+    @df.drop!(:twos, :fours)
+    @df.items.all? {|i| i.should eql([3])}
+    @df.labels.should eql([:threes])
+  end
   it "should offer a hash-like structure of columns" do
     @df.add [1,2,3,4]
     @df.add [5, 6, 7, 8]
@@ -131,4 +139,56 @@ describe DataFrame do
     @df.labels.should eql(@labels)
     @df.variables.should eql(@labels)
   end
+  context "replace!" do
+    before do
+      @df.add [1,2,3,4]
+      @df.add [5, 6, 7, 8]
+      @doubler = lambda{|e| e * 2}
+    end
+    it "should only replace columns that actually exist" do
+      lambda{@df.replace!(:not_a_column, &@doubler)}.should raise_error(
+        ArgumentError, /Must provide the name of an existing column./)
+      lambda{@df.replace!(:these, &@doubler)}.should_not raise_error
+    end
+    it "should be able to replace a column with a block" do
+      @df.replace!(:these) {|e| e * 2}
+      @df.these.should eql([2,10])
+    end
+    it "should be able to replace a column with an array" do
+      @a = [5,9]
+      @df.replace!(:these, @a)
+      @df.these.should eql(@a)
+    end
+  end
+  context "filter!" do
+    before do
+      @df.add [1,2,3,4]
+      @df.add [5, 6, 7, 8]
+    end
+    it "should be able to filter a data frame with a block" do
+      @df.filter!(:open_struct) {|row| row.these == 5}
+      @df.items.should eql([[5, 6, 7, 8]])
+    end
+  end
+  context "subset_from_columns" do
+    before do
+      @df.add [1,2,3,4]
+      @df.add [5, 6, 7, 8]
+    end
+    it "should be able to create a subset of columns" do
+      new_data_frame = @df.subset_from_columns(:these, :labels)
+      new_data_frame.should_not eql(@df)
+      new_data_frame.labels.should eql([:these, :labels])
+      new_data_frame.items.should eql([[1,4],[5,8]])
+      new_data_frame.these.should eql([1,5])
+    end
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: davidrichards-data_frame
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
 platform: ruby
 authors:
 - David Richards
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-08-11 00:00:00 -07:00
+date: 2009-08-16 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -58,6 +58,7 @@ files:
 - lib/data_frame/transposable_array.rb
 - lib/data_frame.rb
 - lib/ext
+- lib/ext/open_struct.rb
 - lib/ext/string.rb
 - lib/ext/symbol.rb
 - spec/data_frame