RubyGems - davidrichards-data_frame - Versions diffs - 0.0.15 → 0.0.17 - Mend

davidrichards-data_frame 0.0.15 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

data/README.rdoc +15 -0
data/VERSION.yml +1 -1
data/lib/data_frame.rb +3 -302
data/lib/data_frame/data_frame.rb +301 -0
data/lib/data_frame/model.rb +22 -0
data/lib/data_frame/parameter_capture.rb +50 -0
data/spec/data_frame/data_frame_spec.rb +341 -0
data/spec/data_frame/model_spec.rb +36 -0
data/spec/data_frame/parameter_capture_spec.rb +32 -0
data/spec/data_frame_spec.rb +14 -326
metadata +9 -4

data/README.rdoc CHANGED

@@ -75,6 +75,21 @@ A lot of the work in the data frame is to transform the actual table.  You may n
 Note: most of these transformations are not optimized.  I'll work with things for a while before I try to optimize this library.  However, I should say that I've used some fairly large data sets (thousands of rows) and have been fine with things so far.
+== Models
+Data Frame can now create sub-models:
+	>> df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
+	=> DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
+	>> df.model(:weekend) do |m|
+	?> m.day %w(sat sun)
+	>> end
+	=> DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
+	>> df.models.weekend.day.uniq
+	=> ["sat", "sun"]
+	>> df.models
+	=> #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
 ==Installation

data/VERSION.yml CHANGED

@@ -1,4 +1,4 @@
 ---
 :major: 0
 :minor: 0
-:patch: 15
+:patch: 17

data/lib/data_frame.rb CHANGED

@@ -19,305 +19,6 @@ $:.unshift(File.dirname(__FILE__))
 require 'data_frame/callback_array'
 require 'data_frame/transposable_array'
-# This allows me to have named columns and optionally named rows in a
-# data frame, to work calculations (usually on the columns), to
-# transpose the matrix and store the transposed matrix until the object
-# is tainted.
-class DataFrame
-  class << self
-    # This is the neatest part of this neat gem.
-    # DataFrame.from_csv can be called in a lot of ways:
-    # DataFrame.from_csv(csv_contents)
-    # DataFrame.from_csv(filename)
-    # DataFrame.from_csv(url)
-    # If you need to define converters for FasterCSV, do it before calling
-    # this method:
-    # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
-    # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
-    # This returns bar where 'foo' was found and 'foo' everywhere else.
-    def from_csv(obj, opts={})
-      labels, table = infer_csv_contents(obj, opts)
-      return nil unless labels and table
-      df = new(*labels)
-      df.import(table)
-      df
-    end
-    protected
-      def infer_csv_contents(obj, opts={})
-        contents = File.read(obj) if File.exist?(obj)
-        begin
-          open(obj) {|f| contents = f.read} unless contents
-        rescue
-          nil
-        end
-        contents ||= obj if obj.is_a?(String)
-        return nil unless contents
-        table = FCSV.parse(contents, default_csv_opts.merge(opts))
-        labels = table.shift
-        while table.last.empty?
-          table.pop
-        end
-        [labels, table]
-      end
-      def default_csv_opts; {:converters => :all}; end
-  end
-  # Loads a batch of rows.  Expects an array of arrays, else you don't
-  # know what you have.
-  def import(rows)
-    rows.each do |row|
-      self.add_item(row)
-    end
-  end
-  def inspect
-    "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
-  end
-  # The labels of the data items
-  attr_reader :labels
-  alias :variables :labels
-  # The items stored in the frame
-  attr_reader :items
-  def initialize(*labels)
-    @labels = labels.map {|e| e.to_underscore_sym }
-    @items = TransposableArray.new
-  end
-  def add_item(item)
-    self.items << item
-  end
-  alias :add :add_item
-  def row_labels
-    @row_labels ||= []
-  end
-  def row_labels=(ary)
-    raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
-    @row_labels = ary
-  end
-  def render_column(sym)
-    i = @labels.index(sym)
-    return nil unless i
-    @items.transpose[i]
-  end
-  # The rows as an array of arrays, an alias for items.
-  alias :rows :items
-  # The columns as a Dictionary or Hash
-  # This is cached, call columns(true) to reset the cache.
-  def columns(reset=false)
-    @columns = nil if reset
-    return @columns if @columns
-    container = defined?(Dictionary) ? Dictionary.new : Hash.new
-    i = 0
-    @columns = @items.transpose.inject(container) do |cont, col|
-      cont[@labels[i]] = col
-      i += 1
-      cont
-    end
-  end
-  alias :to_hash :columns
-  alias :to_dictionary :columns
-  def render_row(sym)
-    i = self.row_labels.index(sym)
-    return nil unless i
-    @items[i]
-  end
-  def method_missing(sym, *args, &block)
-    if self.labels.include?(sym)
-      render_column(sym)
-    elsif self.row_labels.include?(sym)
-      render_row(sym)
-    elsif @items.respond_to?(sym)
-      @items.send(sym, *args, &block)
-    else
-      super
-    end
-  end
-  def drop!(*labels)
-    labels.each do |label|
-      drop_one!(label)
-    end
-    self
-  end
-  def drop_one!(label)
-    i = self.labels.index(label)
-    return nil unless i
-    self.items.each do |item|
-      item.delete_at(i)
-    end
-    self.labels.delete_at(i)
-    self
-  end
-  protected :drop_one!
-  def replace!(column, values=nil, &block)
-    column = validate_column(column)
-    if not values
-      values = self.send(column)
-      values.map! {|e| block.call(e)}
-    end
-    replace_column(column, values)
-    self
-  end
-  def replace_column(column, values)
-    column = validate_column(column)
-    index = self.labels.index(column)
-    list = []
-    self.items.each_with_index do |item, i|
-      consolidated = item
-      consolidated[index] = values[i]
-      list << consolidated
-    end
-    @items = list.dup
-  end
-  protected :replace_column
-  def validate_column(column)
-    column = column.to_sym
-    raise ArgumentError, "Must provide the name of an existing column.  Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
-    column
-  end
-  protected :validate_column
-  # Takes a block to evaluate on each row.  The row can be converted into
-  # an OpenStruct or a Hash for easier filter methods. Note, don't try this
-  # with a hash or open struct unless you have facets available.
-  def filter!(as=Array, &block)
-    as = infer_class(as)
-    items = []
-    self.items.each do |row|
-      value = block.call(cast_row(row, as))
-      items << row if value
-    end
-    @items = items.dup
-    self
-  end
-  def filter(as=Array, &block)
-    new_data_frame = self.clone
-    new_data_frame.filter!(as, &block)
-  end
-  def infer_class(obj)
-    obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
-    obj = obj.classify.constantize if obj.is_a?(String)
-    obj
-  end
-  protected :infer_class
-  def cast_row(row, as)
-    if as == Hash
-      obj = {}
-      self.labels.each_with_index do |label, i|
-        obj[label] = row[i]
-      end
-      obj
-    elsif as == OpenStruct
-      obj = OpenStruct.new
-      self.labels.each_with_index do |label, i|
-        obj.table[label] = row[i]
-      end
-      obj
-    elsif as == Array
-      row
-    else
-      as.new(*row)
-    end
-  end
-  protected :cast_row
-  # Creates a new data frame, only with the specified columns.
-  def subset_from_columns(*cols)
-    new_labels = self.labels.inject([]) do |list, label|
-      list << label if cols.include?(label)
-      list
-    end
-    new_data_frame = DataFrame.new(*self.labels)
-    new_data_frame.import(self.items)
-    self.labels.each do |label|
-      new_data_frame.drop!(label) unless new_labels.include?(label)
-    end
-    new_data_frame
-  end
-  # A weird name.  This creates a column for every category in a column
-  # and marks each row by its value
-  def j_binary_ize!(*columns)
-    # Allows to mix a hash with the columns.
-    options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
-    columns.delete_if {|e| e.is_a?(Hash)}
-    # Generates new columns
-    columns.each do |col|
-      values = render_column(col.to_underscore_sym)
-      values.categories.each do |category|
-        full_name = (col.to_s + "_" + category.to_s).to_sym
-        if options[:allow_overlap]
-          category_map = values.inject([]) do |list, e|
-            list << values.all_categories(e)
-          end
-          self.append!(full_name, category_map.map{|e| e.include?(category)})
-        else
-          self.append!(full_name, values.category_map.map{|e| e == category})
-        end
-      end
-    end
-  end
-  # Adds a unique column to the table
-  def append!(column_name, value=nil)
-    raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
-    self.labels << column_name.to_underscore_sym
-    if value.is_a?(Array)
-      self.items.each_with_index do |item, i|
-        item << value[i]
-      end
-    else
-      self.items.each do |item|
-        item << value
-      end
-    end
-    # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
-    self.items.taint
-  end
-  def filter_by_category(hash)
-    new_data_frame = self.dup
-    hash.each do |key, value|
-      key = key.to_underscore_sym
-      next unless self.labels.include?(key)
-      value = [value] unless value.is_a?(Array) or value.is_a?(Range)
-      new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
-    end
-    new_data_frame
-  end
-  def filter_by_category!(hash)
-    hash.each do |key, value|
-      key = key.to_underscore_sym
-      next unless self.labels.include?(key)
-      value = [value] unless value.is_a?(Array) or value.is_a?(Range)
-      self.filter!(:hash) {|row| value.include?(row[key])}
-    end
-  end
-end
+require 'data_frame/parameter_capture'
+require 'data_frame/data_frame'
+require 'data_frame/model'

data/lib/data_frame/data_frame.rb ADDED

@@ -0,0 +1,301 @@
+# This allows me to have named columns and optionally named rows in a
+# data frame, to work calculations (usually on the columns), to
+# transpose the matrix and store the transposed matrix until the object
+# is tainted.
+class DataFrame
+  class << self
+    # This is the neatest part of this neat gem.
+    # DataFrame.from_csv can be called in a lot of ways:
+    # DataFrame.from_csv(csv_contents)
+    # DataFrame.from_csv(filename)
+    # DataFrame.from_csv(url)
+    # If you need to define converters for FasterCSV, do it before calling
+    # this method:
+    # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
+    # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
+    # This returns bar where 'foo' was found and 'foo' everywhere else.
+    def from_csv(obj, opts={})
+      labels, table = infer_csv_contents(obj, opts)
+      return nil unless labels and table
+      df = new(*labels)
+      df.import(table)
+      df
+    end
+    protected
+      def infer_csv_contents(obj, opts={})
+        contents = File.read(obj) if File.exist?(obj)
+        begin
+          open(obj) {|f| contents = f.read} unless contents
+        rescue
+          nil
+        end
+        contents ||= obj if obj.is_a?(String)
+        return nil unless contents
+        table = FCSV.parse(contents, default_csv_opts.merge(opts))
+        labels = table.shift
+        while table.last.empty?
+          table.pop
+        end
+        [labels, table]
+      end
+      def default_csv_opts; {:converters => :all}; end
+  end
+  # Loads a batch of rows.  Expects an array of arrays, else you don't
+  # know what you have.
+  def import(rows)
+    rows.each do |row|
+      self.add_item(row)
+    end
+  end
+  def inspect
+    "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
+  end
+  # The labels of the data items
+  attr_reader :labels
+  alias :variables :labels
+  # The items stored in the frame
+  attr_reader :items
+  def initialize(*labels)
+    @labels = labels.map {|e| e.to_underscore_sym }
+    @items = TransposableArray.new
+  end
+  def add_item(item)
+    self.items << item
+  end
+  alias :add :add_item
+  def row_labels
+    @row_labels ||= []
+  end
+  def row_labels=(ary)
+    raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
+    @row_labels = ary
+  end
+  def render_column(sym)
+    i = @labels.index(sym)
+    return nil unless i
+    @items.transpose[i]
+  end
+  # The rows as an array of arrays, an alias for items.
+  alias :rows :items
+  # The columns as a Dictionary or Hash
+  # This is cached, call columns(true) to reset the cache.
+  def columns(reset=false)
+    @columns = nil if reset
+    return @columns if @columns
+    container = defined?(Dictionary) ? Dictionary.new : Hash.new
+    i = 0
+    @columns = @items.transpose.inject(container) do |cont, col|
+      cont[@labels[i]] = col
+      i += 1
+      cont
+    end
+  end
+  alias :to_hash :columns
+  alias :to_dictionary :columns
+  def render_row(sym)
+    i = self.row_labels.index(sym)
+    return nil unless i
+    @items[i]
+  end
+  def method_missing(sym, *args, &block)
+    if self.labels.include?(sym)
+      render_column(sym)
+    elsif self.row_labels.include?(sym)
+      render_row(sym)
+    elsif @items.respond_to?(sym)
+      @items.send(sym, *args, &block)
+    else
+      super
+    end
+  end
+  def drop!(*labels)
+    labels.each do |label|
+      drop_one!(label)
+    end
+    self
+  end
+  def drop_one!(label)
+    i = self.labels.index(label)
+    return nil unless i
+    self.items.each do |item|
+      item.delete_at(i)
+    end
+    self.labels.delete_at(i)
+    self
+  end
+  protected :drop_one!
+  def replace!(column, values=nil, &block)
+    column = validate_column(column)
+    if not values
+      values = self.send(column)
+      values.map! {|e| block.call(e)}
+    end
+    replace_column(column, values)
+    self
+  end
+  def replace_column(column, values)
+    column = validate_column(column)
+    index = self.labels.index(column)
+    list = []
+    self.items.each_with_index do |item, i|
+      consolidated = item
+      consolidated[index] = values[i]
+      list << consolidated
+    end
+    @items = list.dup
+  end
+  protected :replace_column
+  def validate_column(column)
+    column = column.to_sym
+    raise ArgumentError, "Must provide the name of an existing column.  Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
+    column
+  end
+  protected :validate_column
+  # Takes a block to evaluate on each row.  The row can be converted into
+  # an OpenStruct or a Hash for easier filter methods. Note, don't try this
+  # with a hash or open struct unless you have facets available.
+  def filter!(as=Array, &block)
+    as = infer_class(as)
+    items = []
+    self.items.each do |row|
+      value = block.call(cast_row(row, as))
+      items << row if value
+    end
+    @items = items.dup
+    self
+  end
+  def filter(as=Array, &block)
+    new_data_frame = self.clone
+    new_data_frame.filter!(as, &block)
+  end
+  def infer_class(obj)
+    obj = obj.to_s.classify.constantize if obj.is_a?(Symbol)
+    obj = obj.classify.constantize if obj.is_a?(String)
+    obj
+  end
+  protected :infer_class
+  def cast_row(row, as)
+    if as == Hash
+      obj = {}
+      self.labels.each_with_index do |label, i|
+        obj[label] = row[i]
+      end
+      obj
+    elsif as == OpenStruct
+      obj = OpenStruct.new
+      self.labels.each_with_index do |label, i|
+        obj.table[label] = row[i]
+      end
+      obj
+    elsif as == Array
+      row
+    else
+      as.new(*row)
+    end
+  end
+  protected :cast_row
+  # Creates a new data frame, only with the specified columns.
+  def subset_from_columns(*cols)
+    new_labels = self.labels.inject([]) do |list, label|
+      list << label if cols.include?(label)
+      list
+    end
+    new_data_frame = DataFrame.new(*self.labels)
+    new_data_frame.import(self.items)
+    self.labels.each do |label|
+      new_data_frame.drop!(label) unless new_labels.include?(label)
+    end
+    new_data_frame
+  end
+  # A weird name.  This creates a column for every category in a column
+  # and marks each row by its value
+  def j_binary_ize!(*columns)
+    # Allows to mix a hash with the columns.
+    options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
+    columns.delete_if {|e| e.is_a?(Hash)}
+    # Generates new columns
+    columns.each do |col|
+      values = render_column(col.to_underscore_sym)
+      values.categories.each do |category|
+        full_name = (col.to_s + "_" + category.to_s).to_sym
+        if options[:allow_overlap]
+          category_map = values.inject([]) do |list, e|
+            list << values.all_categories(e)
+          end
+          self.append!(full_name, category_map.map{|e| e.include?(category)})
+        else
+          self.append!(full_name, values.category_map.map{|e| e == category})
+        end
+      end
+    end
+  end
+  # Adds a unique column to the table
+  def append!(column_name, value=nil)
+    raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
+    self.labels << column_name.to_underscore_sym
+    if value.is_a?(Array)
+      self.items.each_with_index do |item, i|
+        item << value[i]
+      end
+    else
+      self.items.each do |item|
+        item << value
+      end
+    end
+    # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
+    self.items.taint
+  end
+  def filter_by_category(hash)
+    new_data_frame = self.dup
+    hash.each do |key, value|
+      key = key.to_underscore_sym
+      next unless self.labels.include?(key)
+      value = [value] unless value.is_a?(Array) or value.is_a?(Range)
+      new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
+    end
+    new_data_frame
+  end
+  def filter_by_category!(hash)
+    hash.each do |key, value|
+      key = key.to_underscore_sym
+      next unless self.labels.include?(key)
+      value = [value] unless value.is_a?(Array) or value.is_a?(Range)
+      self.filter!(:hash) {|row| value.include?(row[key])}
+    end
+  end
+end