RubyGems - daru_lite - Versions diffs - 0.1 → 0.1.2 - Mend

daru_lite 0.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73) hide show

checksums.yaml +4 -4
data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
data/.github/workflows/ci.yml +20 -0
data/.rubocop_todo.yml +35 -33
data/README.md +19 -115
data/daru_lite.gemspec +1 -0
data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
data/lib/daru_lite/data_frame/calculatable.rb +140 -0
data/lib/daru_lite/data_frame/convertible.rb +107 -0
data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
data/lib/daru_lite/data_frame/fetchable.rb +301 -0
data/lib/daru_lite/data_frame/filterable.rb +144 -0
data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
data/lib/daru_lite/data_frame/indexable.rb +168 -0
data/lib/daru_lite/data_frame/iterable.rb +339 -0
data/lib/daru_lite/data_frame/joinable.rb +152 -0
data/lib/daru_lite/data_frame/missable.rb +75 -0
data/lib/daru_lite/data_frame/pivotable.rb +108 -0
data/lib/daru_lite/data_frame/queryable.rb +67 -0
data/lib/daru_lite/data_frame/setable.rb +109 -0
data/lib/daru_lite/data_frame/sortable.rb +241 -0
data/lib/daru_lite/dataframe.rb +142 -2355
data/lib/daru_lite/index/index.rb +13 -0
data/lib/daru_lite/maths/statistics/vector.rb +1 -1
data/lib/daru_lite/vector/aggregatable.rb +9 -0
data/lib/daru_lite/vector/calculatable.rb +78 -0
data/lib/daru_lite/vector/convertible.rb +77 -0
data/lib/daru_lite/vector/duplicatable.rb +17 -0
data/lib/daru_lite/vector/fetchable.rb +175 -0
data/lib/daru_lite/vector/filterable.rb +128 -0
data/lib/daru_lite/vector/indexable.rb +77 -0
data/lib/daru_lite/vector/iterable.rb +95 -0
data/lib/daru_lite/vector/joinable.rb +17 -0
data/lib/daru_lite/vector/missable.rb +124 -0
data/lib/daru_lite/vector/queryable.rb +45 -0
data/lib/daru_lite/vector/setable.rb +47 -0
data/lib/daru_lite/vector/sortable.rb +113 -0
data/lib/daru_lite/vector.rb +36 -932
data/lib/daru_lite/version.rb +1 -1
data/spec/data_frame/aggregatable_example.rb +65 -0
data/spec/data_frame/buildable_example.rb +109 -0
data/spec/data_frame/calculatable_example.rb +135 -0
data/spec/data_frame/convertible_example.rb +180 -0
data/spec/data_frame/duplicatable_example.rb +111 -0
data/spec/data_frame/fetchable_example.rb +476 -0
data/spec/data_frame/filterable_example.rb +250 -0
data/spec/data_frame/indexable_example.rb +221 -0
data/spec/data_frame/iterable_example.rb +465 -0
data/spec/data_frame/joinable_example.rb +106 -0
data/spec/data_frame/missable_example.rb +47 -0
data/spec/data_frame/pivotable_example.rb +297 -0
data/spec/data_frame/queryable_example.rb +92 -0
data/spec/data_frame/setable_example.rb +482 -0
data/spec/data_frame/sortable_example.rb +350 -0
data/spec/dataframe_spec.rb +181 -3243
data/spec/index/index_spec.rb +8 -0
data/spec/vector/aggregatable_example.rb +27 -0
data/spec/vector/calculatable_example.rb +82 -0
data/spec/vector/convertible_example.rb +126 -0
data/spec/vector/duplicatable_example.rb +48 -0
data/spec/vector/fetchable_example.rb +463 -0
data/spec/vector/filterable_example.rb +165 -0
data/spec/vector/indexable_example.rb +201 -0
data/spec/vector/iterable_example.rb +111 -0
data/spec/vector/joinable_example.rb +25 -0
data/spec/vector/missable_example.rb +88 -0
data/spec/vector/queryable_example.rb +91 -0
data/spec/vector/setable_example.rb +300 -0
data/spec/vector/sortable_example.rb +242 -0
data/spec/vector_spec.rb +111 -1805
metadata +102 -3
data/.github/ISSUE_TEMPLATE.md +0 -18

data/lib/daru_lite/data_frame/filterable.rb ADDED Viewed

@@ -0,0 +1,144 @@
+module DaruLite
+  class DataFrame
+    module Filterable
+      # Return unique rows by vector specified or all vectors
+      #
+      # @param vtrs [String, Symbol] vector name(s) that should be considered
+      #
+      # @example
+      #
+      #    => #<DaruLite::DataFrame(6x2)>
+      #         a   b
+      #     0   1   a
+      #     1   2   b
+      #     2   3   c
+      #     3   4   d
+      #     2   3   c
+      #     3   4   f
+      #
+      #    2.3.3 :> df.uniq
+      #    => #<DaruLite::DataFrame(5x2)>
+      #         a   b
+      #     0   1   a
+      #     1   2   b
+      #     2   3   c
+      #     3   4   d
+      #     3   4   f
+      #
+      #    2.3.3 :> df.uniq(:a)
+      #    => #<DaruLite::DataFrame(4x2)>
+      #         a   b
+      #     0   1   a
+      #     1   2   b
+      #     2   3   c
+      #     3   4   d
+      #
+      def uniq(*vtrs)
+        vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
+        grouped = group_by(vecs)
+        indexes = grouped.groups.values.map { |v| v[0] }.sort
+        row[*indexes]
+      end
+      # Retain vectors or rows if the block returns a truthy value.
+      #
+      # == Description
+      #
+      # For filtering out certain rows/vectors based on their values,
+      # use the #filter method. By default it iterates over vectors and
+      # keeps those vectors for which the block returns true. It accepts
+      # an optional axis argument which lets you specify whether you want
+      # to iterate over vectors or rows.
+      #
+      # == Arguments
+      #
+      # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
+      # Default to :vector.
+      #
+      # == Usage
+      #
+      #   # Filter vectors
+      #
+      #   df.filter do |vector|
+      #     vector.type == :numeric and vector.median < 50
+      #   end
+      #
+      #   # Filter rows
+      #
+      #   df.filter(:row) do |row|
+      #     row[:a] + row[:d] < 100
+      #   end
+      def filter(axis = :vector, &block)
+        dispatch_to_axis_pl axis, :filter, &block
+      end
+      # Returns a dataframe in which rows with any of the mentioned values
+      # are ignored.
+      # @param [Array] values to reject to form the new dataframe
+      # @return [DaruLite::DataFrame] Data Frame with only rows which don't
+      #   contain the mentioned values
+      # @example
+      #   df = DaruLite::DataFrame.new({
+      #     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
+      #     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
+      #     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
+      #   }, index: 11..18)
+      #   df.reject_values nil, Float::NAN
+      #   # => #<DaruLite::DataFrame(2x3)>
+      #   #       a   b   c
+      #   #   11   1   a   a
+      #   #   18   7   8   7
+      def reject_values(*values)
+        positions =
+          size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
+        # Handle the case when positions size is 1 and #row_at wouldn't return a df
+        if positions.size == 1
+          pos = positions.first
+          row_at(pos..pos)
+        else
+          row_at(*positions)
+        end
+      end
+      def keep_row_if
+        @index.size.times
+              .reject { |position| yield(row_at(position)) }
+              .reverse_each { |position| delete_at_position(position) }
+      end
+      def keep_vector_if
+        @vectors.each do |vector|
+          delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
+        end
+      end
+      # Creates a new vector from the values of the given field, taken from the rows for which the block returns true.
+      def filter_vector(vec, &block)
+        DaruLite::Vector.new(each_row.select(&block).map { |row| row[vec] })
+      end
+      # Iterates over each row and retains it in a new DataFrame if the block returns
+      # true for that row.
+      def filter_rows
+        return to_enum(:filter_rows) unless block_given?
+        keep_rows = @index.map { |index| yield access_row(index) }
+        where keep_rows
+      end
+      # Iterates over each vector and retains it in a new DataFrame if the block returns
+      # true for that vector.
+      def filter_vectors(&block)
+        return to_enum(:filter_vectors) unless block
+        dup.tap { |df| df.keep_vector_if(&block) }
+      end
+      # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
+      def where(bool_array)
+        DaruLite::Core::Query.df_where self, bool_array
+      end
+    end
+  end
+end

data/lib/daru_lite/data_frame/i_o_able.rb ADDED Viewed

@@ -0,0 +1,179 @@
+module DaruLite
+  class DataFrame
+    module IOAble
+      module ClassMethods
+        # Load data from a CSV file. Specify an optional block to grab the CSV
+        # object and pre-condition it (for example use the `convert` or
+        # `header_convert` methods).
+        #
+        # == Arguments
+        #
+        # * path - Local path / Remote URL of the file to load specified as a String.
+        #
+        # == Options
+        #
+        # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
+        # and uses those to eventually construct the resulting DataFrame.
+        #
+        # == Verbose Description
+        #
+        # You can specify all the options to the `.from_csv` function that you
+        # do to the Ruby `CSV.read()` function, since this is what is used internally.
+        #
+        # For example, if the columns in your CSV file are separated by something
+        # other than commas, you can use the `:col_sep` option. If you want to
+        # convert numeric values to numbers and not keep them as strings, you can
+        # use the `:converters` option and set it to `:numeric`.
+        #
+        # The `.from_csv` function uses the following defaults for reading CSV files
+        # (that are passed into the `CSV.read()` function):
+        #
+        #   {
+        #     :col_sep           => ',',
+        #     :converters        => :numeric
+        #   }
+        def from_csv(path, opts = {}, &block)
+          DaruLite::IO.from_csv path, opts, &block
+        end
+        # Read data from an Excel file into a DataFrame.
+        #
+        # == Arguments
+        #
+        # * path - Path of the file to be read.
+        #
+        # == Options
+        #
+        # * :worksheet_id - ID of the worksheet that is to be read.
+        def from_excel(path, opts = {}, &block)
+          DaruLite::IO.from_excel path, opts, &block
+        end
+        # Execute a database query and return the result as a DataFrame.
+        #
+        # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
+        # @param query [String] The query to be executed
+        #
+        # @return A dataframe containing the data resulting from the query
+        #
+        # USE:
+        #
+        #  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
+        #  DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
+        #
+        #  #Alternatively
+        #
+        #  require 'dbi'
+        #  DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
+        def from_sql(dbh, query)
+          DaruLite::IO.from_sql dbh, query
+        end
+        # Read a dataframe from AR::Relation
+        #
+        # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
+        # @param fields [Array] Field names to be loaded (optional)
+        #
+        # @return A dataframe containing the data loaded from the relation
+        #
+        # USE:
+        #
+        #   # When Post model is defined as:
+        #   class Post < ActiveRecord::Base
+        #     scope :active, -> { where.not(published_at: nil) }
+        #   end
+        #
+        #   # You can load active posts into a dataframe by:
+        #   DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
+        def from_activerecord(relation, *fields)
+          DaruLite::IO.from_activerecord relation, *fields
+        end
+        # Read the database from a plaintext file. For this method to work,
+        # the data should be present in a plain text file in columns. See
+        # spec/fixtures/bank2.dat for an example.
+        #
+        # == Arguments
+        #
+        # * path - Path of the file to be read.
+        # * fields - Vector names of the resulting database.
+        #
+        # == Usage
+        #
+        #   df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
+        def from_plaintext(path, fields)
+          DaruLite::IO.from_plaintext path, fields
+        end
+        def _load(data)
+          h = Marshal.load data
+          DaruLite::DataFrame.new(
+            h[:data],
+            index: h[:index],
+            order: h[:order],
+            name: h[:name]
+          )
+        end
+      end
+      def self.included(base)
+        base.extend ClassMethods
+      end
+      # Write this DataFrame to a CSV file.
+      #
+      # == Arguments
+      #
+      # * filename - Path of CSV file where the DataFrame is to be saved.
+      #
+      # == Options
+      #
+      # * convert_comma - If set to *true*, will convert any commas in any
+      # of the data to full stops ('.').
+      # All the options accepted by CSV.read() can also be passed into this
+      # function.
+      def write_csv(filename, opts = {})
+        DaruLite::IO.dataframe_write_csv self, filename, opts
+      end
+      # Write this dataframe to an Excel Spreadsheet
+      #
+      # == Arguments
+      #
+      # * filename - The path of the file where the DataFrame should be written.
+      def write_excel(filename, opts = {})
+        DaruLite::IO.dataframe_write_excel self, filename, opts
+      end
+      # Insert each case of the Dataset on the selected table
+      #
+      # == Arguments
+      #
+      # * dbh - DBI database connection object.
+      # * table - Name of the table into which the rows are inserted.
+      #
+      # == Usage
+      #
+      #  ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
+      #  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
+      #  ds.write_sql(dbh,"test")
+      def write_sql(dbh, table)
+        DaruLite::IO.dataframe_write_sql self, dbh, table
+      end
+      # Use marshalling to save dataframe to a file.
+      def save(filename)
+        DaruLite::IO.save self, filename
+      end
+      def _dump(_depth)
+        Marshal.dump(
+          data: @data,
+          index: @index.to_a,
+          order: @vectors.to_a,
+          name: @name
+        )
+      end
+    end
+  end
+end

data/lib/daru_lite/data_frame/indexable.rb ADDED Viewed

@@ -0,0 +1,168 @@
+module DaruLite
+  class DataFrame
+    module Indexable
+      module SetSingleIndexStrategy
+        def self.uniq_size(df, col)
+          df[col].uniq.size
+        end
+        def self.new_index(df, col)
+          DaruLite::Index.new(df[col].to_a)
+        end
+        def self.delete_vector(df, col)
+          df.delete_vector(col)
+        end
+      end
+      module SetCategoricalIndexStrategy
+        def self.new_index(df, col)
+          DaruLite::CategoricalIndex.new(df[col].to_a)
+        end
+        def self.delete_vector(df, col)
+          df.delete_vector(col)
+        end
+      end
+      module SetMultiIndexStrategy
+        def self.uniq_size(df, cols)
+          df[*cols].uniq.size
+        end
+        def self.new_index(df, cols)
+          DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
+            mi.name = cols
+          end
+        end
+        def self.delete_vector(df, cols)
+          df.delete_vectors(*cols)
+        end
+      end
+      # Set a particular column (or several columns) as the new index of the DataFrame.
+      def set_index(new_index_col, keep: false, categorical: false)
+        if categorical
+          strategy = SetCategoricalIndexStrategy
+        elsif new_index_col.respond_to?(:to_a)
+          strategy = SetMultiIndexStrategy
+          new_index_col = new_index_col.to_a
+        else
+          strategy = SetSingleIndexStrategy
+        end
+        unless categorical
+          uniq_size = strategy.uniq_size(self, new_index_col)
+          raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
+        end
+        self.index = strategy.new_index(self, new_index_col)
+        strategy.delete_vector(self, new_index_col) unless keep
+        self
+      end
+      # Change the index of the DataFrame and preserve the labels of the previous
+      # indexing. New index can be DaruLite::Index or any of its subclasses.
+      #
+      # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
+      # @example Reindexing DataFrame
+      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
+      #     index: ['a','b','c','d'])
+      #   #=>
+      #   ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
+      #   #                    a          b
+      #   #         a          1         11
+      #   #         b          2         22
+      #   #         c          3         33
+      #   #         d          4         44
+      #   df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
+      #   #=>
+      #   ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
+      #   #                    a          b
+      #   #         b          2         22
+      #   #         0        nil        nil
+      #   #         a          1         11
+      #   #         g        nil        nil
+      def reindex(new_index)
+        unless new_index.is_a?(DaruLite::Index)
+          raise ArgumentError, 'Must pass the new index of type Index or its ' \
+                               "subclasses, not #{new_index.class}"
+        end
+        cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
+        new_index.each_with_object(cl) do |idx, memo|
+          memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
+        end
+      end
+      def reset_index
+        index_df = index.to_df
+        names = index.name
+        names = [names] unless names.instance_of?(Array)
+        new_vectors = names + vectors.to_a
+        self.index = index_df.index
+        names.each do |name|
+          self[name] = index_df[name]
+        end
+        self.order = new_vectors
+        self
+      end
+      # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
+      #
+      # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
+      #   are to be indexed.
+      # @example Reassigning index of a DataFrame
+      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
+      #   df.index.to_a #=> [0,1,2,3]
+      #
+      #   df.index = DaruLite::Index.new(['a','b','c','d'])
+      #   df.index.to_a #=> ['a','b','c','d']
+      #   df.row['a'].to_a #=> [1,11]
+      def index=(idx)
+        @index = Index.coerce idx
+        @data.each { |vec| vec.index = @index }
+        self
+      end
+      def reindex_vectors(new_vectors)
+        unless new_vectors.is_a?(DaruLite::Index)
+          raise ArgumentError, 'Must pass the new index of type Index or its ' \
+                               "subclasses, not #{new_vectors.class}"
+        end
+        cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
+        new_vectors.each_with_object(cl) do |vec, memo|
+          memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
+        end
+      end
+      # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
+      #
+      # @param new_index [DaruLite::Index] The new index object on which the vectors are to
+      #   be indexed. Must be of the same size as ncols.
+      # @example Reassigning vectors of a DataFrame
+      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
+      #   df.vectors.to_a #=> [:a, :b, :c]
+      #
+      #   df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
+      #   df.vectors.to_a #=> [:foo, :bar, :baz]
+      def vectors=(new_index)
+        raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)
+        if new_index.size != ncols
+          raise ArgumentError, "Specified index length #{new_index.size} not equal to" \
+                              "dataframe size #{ncols}"
+        end
+        @vectors = new_index
+        @data.zip(new_index.to_a).each do |vect, name|
+          vect.name = name
+        end
+        self
+      end
+    end
+  end
+end