RubyGems - rust - Versions diffs - 0.4 → 0.10 - Mend

rust 0.4 → 0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

checksums.yaml +4 -4
data/bin/ruby-rust +3 -0
data/lib/{rust-csv.rb → rust/core/csv.rb} +35 -4
data/lib/rust/core/rust.rb +221 -0
data/lib/rust/core/types/all.rb +4 -0
data/lib/{rust-core.rb → rust/core/types/dataframe.rb} +324 -244
data/lib/rust/core/types/datatype.rb +195 -0
data/lib/rust/core/types/factor.rb +158 -0
data/lib/rust/core/types/language.rb +199 -0
data/lib/rust/core/types/list.rb +97 -0
data/lib/rust/core/types/matrix.rb +155 -0
data/lib/rust/core/types/s4class.rb +78 -0
data/lib/rust/core/types/utils.rb +122 -0
data/lib/rust/core.rb +7 -0
data/lib/rust/models/all.rb +4 -0
data/lib/rust/models/anova.rb +77 -0
data/lib/rust/models/regression.rb +258 -0
data/lib/rust/plots/all.rb +4 -0
data/lib/rust/plots/basic-plots.rb +143 -0
data/lib/{rust-plots.rb → rust/plots/core.rb} +98 -107
data/lib/rust/plots/distribution-plots.rb +75 -0
data/lib/rust/stats/all.rb +4 -0
data/lib/{rust-basics.rb → rust/stats/correlation.rb} +46 -3
data/lib/rust/stats/descriptive.rb +157 -0
data/lib/{rust-effsize.rb → rust/stats/effsize.rb} +44 -21
data/lib/rust/stats/probabilities.rb +356 -0
data/lib/rust/stats/tests.rb +384 -0
data/lib/rust.rb +4 -8
metadata +31 -12
data/lib/rust-calls.rb +0 -69
data/lib/rust-descriptive.rb +0 -67
data/lib/rust-tests.rb +0 -165

data/lib/{rust-core.rb → rust/core/types/dataframe.rb} RENAMED

@@ -1,127 +1,34 @@
-require 'code-assertions'
-require 'stringio'
-require 'rinruby'
-require 'csv'
+require_relative 'datatype'
 module Rust
-    CLIENT_MUTEX = Mutex.new
-    R_MUTEX      = Mutex.new
-    R_ENGINE     = RinRuby.new(echo: false)
+    ##
+    # Mirror of the data-frame type in R.
-    private_constant    :R_ENGINE
-    private_constant    :R_MUTEX
-    private_constant    :CLIENT_MUTEX
-    @@debugging = false
-    @@in_client_mutex = false
-    def self.debug
-        @@debugging = true
-    end
-    def self.exclusive
-        result = nil
-        CLIENT_MUTEX.synchronize do
-            @@in_client_mutex = true
-            result = yield
-            @@in_client_mutex = false
-        end
-        return result
-    end
-    def self.[]=(variable, value)
-        if value.is_a?(RustDatatype)
-            value.load_in_r_as(variable.to_s)
-        elsif value.is_a?(String) || value.is_a?(Numeric) || value.is_a?(Array)
-            R_ENGINE.assign(variable, value)
-        else
-            raise "Given #{value.class}, expected RustDatatype, String, Numeric, or Array"
+    class DataFrame < RustDatatype
+        def self.can_pull?(type, klass)
+            return [klass].flatten.include?("data.frame")
         end
-    end
-    def self.[](variable, type=RustDatatype)
-        return type.pull_variable(variable)
-    end
-    def self._eval_big(r_command, return_warnings = false)
-        r_command = r_command.join("\n") if r_command.is_a?(Array)
-        self._rexec(r_command, return_warnings) do |cmd|
-            result = true
-            instructions = cmd.lines
-            while instructions.size > 0
-                current_command = ""
-                while (instructions.size > 0) && (current_command.length + instructions.first.length < 10000)
-                    current_command << instructions.shift
-                end
-                result &= R_ENGINE.eval(current_command)
-            end
-            result
-        end
-    end
-    def self._pull(r_command, return_warnings = false)
-        self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.pull(cmd) }
-    end
-    def self._eval(r_command, return_warnings = false)
-        self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.eval(cmd) }
-    end
-    def self._rexec(r_command, return_warnings = false)
-        puts "Calling _rexec with command: #{r_command}" if @@debugging
-        R_MUTEX.synchronize do
-            assert("This command must be executed in an exclusive block") { @@in_client_mutex }
-            result = nil
-            begin
-                $stdout = StringIO.new
-                if return_warnings
-                    R_ENGINE.echo(true, true)
-                else
-                    R_ENGINE.echo(false, false)
-                end
-                result = yield(r_command)
-            ensure
-                R_ENGINE.echo(false, false)
-                warnings = $stdout.string
-                $stdout = STDOUT
-            end
-            if return_warnings
-                return result, warnings.lines.map { |w| w.strip.chomp }
-            else
-                return result
-            end
-        end
-    end
-    class RustDatatype
-        def self.pull_variable(variable)
-            return Rust._pull(variable)
+        def self.pull_priority
+            1
         end
-        def load_in_r_as(r_instance, variable_name)
-            raise "Not implemented"
-        end
-    end
-    class DataFrame < RustDatatype
-        def self.pull_variable(variable)
+        def self.pull_variable(variable, type, klass)
             hash = {}
-            colnames = Rust._pull("colnames(#{variable})")
+            colnames = Rust["colnames(#{variable})"]
             colnames.each do |col|
-                hash[col] = Rust._pull("#{variable}$#{col}")
+                hash[col] = Rust["#{variable}$\"#{col}\""]
             end
             return DataFrame.new(hash)
         end
+        ##
+        # Creates a new data-frame.
+        # +labels_or_data+ can be either:
+        # - an Array of column names (creates an empty data-frame)
+        # - a Hash with column names as keys and values as values
         def initialize(labels_or_data)
             @data = {}
@@ -130,10 +37,16 @@ module Rust
                 @labels.each { |label| @data[label] = [] }
             elsif labels_or_data.is_a? Hash
                 @labels = labels_or_data.keys.map { |l| l.to_s }
-                @data = labels_or_data.clone
+                labels_or_data.each do |key, value|
+                    @data[key.to_s] = value.clone
+                end
             end
         end
+        ##
+        # Returns the +i+-th row of the data-frame
         def row(i)
             if i < 0 || i >= self.rows
                 return nil
@@ -142,6 +55,20 @@ module Rust
             end
         end
+        ##
+        # Returns the +i+-th row of the data-frame as an Array of values (in column order), or nil if +i+ is out of range. Faster (but harder to interpret) alternative to #row.
+        def fast_row(i)
+            if i < 0 || i >= self.rows
+                return nil
+            else
+                return @labels.map { |label| @data[label][i] }
+            end
+        end
+        ##
+        # Shuffles the rows in the data-frame. The arguments are passed to the Array#shuffle method.
         def shuffle(*args)
             result = DataFrame.new(@labels)
@@ -156,6 +83,10 @@ module Rust
             return result
         end
+        ##
+        # Returns a copy of the data-frame containing only the specified +rows+ and/or +cols+. If +rows+ or +cols+
+        # is nil, all the rows/columns are returned; an error is raised if both are nil.
         def [](rows, cols=nil)
             raise "You must specify either rows or columns to select" if !rows && !cols
             result = self
@@ -171,9 +102,16 @@ module Rust
             return result
         end
+        ##
+        # Returns the column named +name+.
         def column(name)
             return @data[name]
         end
+        alias :| :column
+        ##
+        # Renames the column named +old_name+ to +new_name+.
         def rename_column!(old_name, new_name)
             raise "This DataFrame does not contain a column named #{old_name}" unless @labels.include?(old_name)
@@ -183,10 +121,24 @@ module Rust
             @labels[@labels.index(old_name)] = new_name
         end
+        ##
+        # Functionally transforms the column named +column+ by applying the function given as a block.
+        # Example:
+        # df = Rust::DataFrame.new({a: [1,2,3], b: [3,4,5]})
+        # df.transform_column!("a") { |v| v + 1 }
+        # df|"a" # => [2, 3, 4]
         def transform_column!(column)
             @data[column].map! { |e| yield e }
         end
+        ##
+        # Returns a copy of the data-frame with only the rows for which the function given in the block returns true.
+        # Example:
+        # df = Rust::DataFrame.new({a: [1,2,3], b: ['a','b','c']})
+        # df2 = df.select_rows { |r| r['a'].even? }
+        # df2|"b" # => ['b']
         def select_rows
             result = DataFrame.new(self.column_names)
             self.each_with_index do |row, i|
@@ -195,6 +147,20 @@ module Rust
             return result
         end
+        ##
+        # Returns true if the function given in the block returns true for any of the rows in this data-frame.
+        def has_row?
+            self.each_with_index do |row, i|
+                return true if yield row, i
+            end
+            return false
+        end
+        ##
+        # Returns a copy of the data-frame with only the columns in +cols+. As an alternative, a block can be used
+        # (only the columns for which the function returns true are kept).
         def select_columns(cols=nil)
             raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?
@@ -210,24 +176,84 @@ module Rust
         end
         alias :select_cols :select_columns
+        ##
+        # Deletes the column named +column+.
         def delete_column(column)
             @labels.delete(column)
             @data.delete(column)
         end
+        ##
+        # Deletes the +i+-th row.
+        def delete_row(i)
+            @data.each do |label, column|
+                column.delete_at(i)
+            end
+        end
+        ##
+        # Returns a data-frame in which the rows are unique in terms of all the given columns named +by+.
+        def uniq_by(by)
+            result = self.clone
+            result.uniq_by!(by)
+            return result
+        end
+        ##
+        # Makes sure that in this data-frame the rows are unique in terms of all the given columns named +by+.
+        def uniq_by!(by)
+            my_keys = {}
+            to_delete = []
+            self.each_with_index do |row, i|
+                key = []
+                by.each do |colname|
+                    key << row[colname]
+                end
+                unless my_keys[key]
+                    my_keys[key] = i
+                else
+                    to_delete << (i-to_delete.size)
+                end
+            end
+            to_delete.each do |i|
+                self.delete_row(i)
+            end
+            return self
+        end
+        ##
+        # Returns the names of the columns.
         def column_names
             return @labels.map { |k| k.to_s }
         end
         alias :colnames :column_names
+        ##
+        # Returns the number of rows.
         def rows
             @data.values[0].size
         end
+        ##
+        # Returns the number of columns
         def columns
             @labels.size
         end
+        ##
+        # Adds the given +row+ to the data-frame. +row+ can be either:
+        # - An Array of values for all the columns (in the order of #column_names);
+        # - A Hash containing associations between column names and value to be set.
         def add_row(row)
             if row.is_a?(Array)
                 raise "Expected an array of size #{@data.size}" unless row.size == @data.size
@@ -243,7 +269,7 @@ module Rust
                 row.each do |key, value|
                     @data[key.to_s] << value
                 end
-#
                 return true
             else
                 raise TypeError, "Expected an Array or a Hash"
@@ -251,6 +277,11 @@ module Rust
         end
         alias :<< :add_row
+        ##
+        # Adds a column named +name+ with the given +values+ (array). The size of +values+ must match the number of
+        # rows of this data-frame. As an alternative, it can be passed a block which returns, for a given row, the
+        # value to assign for the new column.
         def add_column(name, values=nil)
             raise "Column already exists" if @labels.include?(name)
             raise "Values or block required" if !values && !block_given?
@@ -267,6 +298,9 @@ module Rust
             end
         end
+        ##
+        # Yields each row as a Hash containing column names as keys and values as values.
         def each
             self.each_with_index do |element, i|
                 yield element
@@ -275,6 +309,21 @@ module Rust
             return self
         end
+        ##
+        # Yields each row as an Array of values (in the order of #column_names). Faster alternative to
+        # #each, which yields a Hash instead.
+        def fast_each
+            self.fast_each_with_index do |element, i|
+                yield element
+            end
+            return self
+        end
+        ##
+        # Yields each row as a Hash containing column names as keys and values as values and the row index.
         def each_with_index
             for i in 0...self.rows
                 element = {}
@@ -288,6 +337,23 @@ module Rust
             return self
         end
+        ##
+        # Yields each row as an Array of values (in the order of #column_names) and the row index. Faster
+        # alternative to #each_with_index.
+        def fast_each_with_index
+            for i in 0...self.rows
+                element = []
+                @labels.each do |label|
+                    element << @data[label][i]
+                end
+                yield element, i
+            end
+            return self
+        end
         def load_in_r_as(variable_name)
             command = []
@@ -299,6 +365,14 @@ module Rust
                 row_index += 1
             end
+            self.column_names.each do |name|
+                column = self.column(name)
+                if column.is_a?(Factor)
+                    command << "#{variable_name}[,#{name.to_R}] <- factor(#{variable_name}[,#{name.to_R}], labels=#{column.levels.to_R})"
+                end
+            end
             Rust._eval_big(command)
         end
@@ -323,6 +397,9 @@ module Rust
             return result
         end
+        ##
+        # Returns a copy of the data-frame containing only the first +n+ rows.
         def head(n=10)
             result = DataFrame.new(self.column_names)
             self.each_with_index do |row, i|
@@ -331,6 +408,11 @@ module Rust
             return result
         end
+        ##
+        # Merges this data-frame with +other+ in terms of the +by+ columns (Array of Strings).
+        # +first_alias+ and +second_alias+ allow specifying the prefix that should be used for the columns not in +by+
+        # for this and the +other+ data-frame, respectively.
         def merge(other, by, first_alias = "x", second_alias = "y")
             raise TypeError, "Expected Rust::DataFrame" unless other.is_a?(DataFrame)
             raise TypeError, "Expected list of strings" if !by.is_a?(Array) || !by.all? { |e| e.is_a?(String) }
@@ -397,6 +479,94 @@ module Rust
             return result
         end
+        ##
+        # Aggregates the values in groups depending on the +by+ column (String).
+        # A block must be passed to specify how to aggregate the columns. Aggregators for specific columns can be
+        # specified as optional arguments in which the name of the argument represents the column name and the value
+        # contains a block for aggregating the specific column.
+        # Both the default and the specialized blocks must take as argument an array of values and must return a
+        # scalar value.
+        def aggregate(by, **aggregators)
+            raise TypeError, "Expected a string" unless by.is_a?(String)
+            raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
+            raise "Expected a block for default aggregator" unless block_given?
+            aggregators = aggregators.map { |label, callable| [label.to_s, callable] }.to_h
+            sorted = self.sort_by(by)
+            current_value = nil
+            partials = []
+            partial = nil
+            sorted.column(by).each_with_index do |value, index|
+                if current_value != value
+                    current_value = value
+                    partials << partial if partial
+                    partial = Rust::DataFrame.new(self.column_names)
+                end
+                partial << sorted.fast_row(index)
+            end
+            partials << partial
+            result = Rust::DataFrame.new(self.column_names)
+            partials.each do |partial|
+                aggregated_row = {}
+                aggregated_row[by] = partial.column(by)[0]
+                (self.column_names - [by]).each do |column|
+                    if aggregators[column]
+                        aggregated_row[column] = aggregators[column].call(partial.column(column))
+                    else
+                        aggregated_row[column] = yield partial.column(column)
+                    end
+                end
+                result << aggregated_row
+            end
+            return result
+        end
+        ##
+        # Returns a copy of this data-frame in which the rows are sorted by the values of the column named +column+.
+        def sort_by(column)
+            result = self.clone
+            result.sort_by!(column)
+            return result
+        end
+        ##
+        # Sorts the rows of this data-frame by the values of the +by+ column.
+        def sort_by!(by)
+            copy = @data[by].clone
+            copy.sort!
+            indices = []
+            @data[by].each_with_index do |value, i|
+                index = copy.index(value)
+                indices << index
+                copy[index] = NilClass
+            end
+            (self.column_names - [by]).each do |column_name|
+                sorted = []
+                column = self.column(column_name)
+                column_i = 0
+                indices.each do |i|
+                    sorted[i] = column[column_i]
+                    column_i += 1
+                end
+                @data[column_name] = sorted
+            end
+            @data[by].sort!
+        end
+        ##
+        # Adds all the rows in +dataframe+ to this data-frame. The column names must match.
         def bind_rows!(dataframe)
             raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
             raise "The columns are not compatible: #{self.column_names - dataframe.column_names} - #{dataframe.column_names - self.column_names}" unless (self.column_names & dataframe.column_names).size == self.columns
@@ -409,6 +579,9 @@ module Rust
         end
         alias :rbind! :bind_rows!
+        ##
+        # Adds all the columns in +dataframe+ to this data-frame. The number of rows must match.
         def bind_columns!(dataframe)
             raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
             raise "The number of rows are not compatible" if self.rows != dataframe.rows
@@ -422,6 +595,9 @@ module Rust
         end
         alias :cbind! :bind_columns!
+        ##
+        # Returns a copy of this dataframe and adds all the rows in +dataframe+ to it. The column names must match.
         def bind_rows(dataframe)
             result = self.clone
             result.bind_rows!(dataframe)
@@ -429,6 +605,9 @@ module Rust
         end
         alias :rbind :bind_rows
+        ##
+        # Returns a copy of this dataframe and adds all the columns in +dataframe+ to it. The number of rows must match.
         def bind_columns(dataframe)
             result = self.clone
             result.bind_columns!(dataframe)
@@ -436,152 +615,53 @@ module Rust
         end
         alias :cbind :bind_columns
+        ##
+        # Returns a copy of this data-frame.
         def clone
             DataFrame.new(@data)
         end
     end
-    class Matrix < RustDatatype
-        def self.pull_variable(variable)
-            return Rust._pull(variable)
-        end
-        def initialize(data)
-            if data.flatten.size == 0
-                raise "Empty matrices are not allowed"
-            else
-                raise TypeError, "Expected array of array" unless data.is_a?(Array) && data[0].is_a?(Array)
-                raise TypeError, "Only numeric matrices are supported" unless data.all? { |row| row.all?  { |e| e.is_a?(Numeric) } }
-                raise "All the rows must have the same size" unless data.map { |row| row.size }.uniq.size == 1
-                @data = data.clone
-            end
-        end
-        def [](i, j)
-            return @data[i][j]
-        end
-        def rows
-            @data.size
-        end
-        def cols
-            @data[0].size
-        end
+    ##
+    # Represents an array of DataFrame
+    class DataFrameArray < Array
-        def []=(i, j, value)
-            raise "Wrong i" unless i.between?(0, @data.size - 1)
-            raise "Wrong j" unless j.between?(0, @data[0].size - 1)
-            @data[i][j] = value
-        end
+        ##
+        # Returns a data-frame with the rows in all the data-frames together (if compatible).
-        def load_in_r_as(variable_name)
-            Rust._eval("#{variable_name} <- matrix(c(#{@data.flatten.join(",")}), nrow=#{self.rows}, ncol=#{self.cols}, byrow=T)")
+        def bind_all
+            return nil if self.size == 0
+            result = self.first.clone
+            for i in 1...self.size
+                result .bind_rows!(self[i])
+            end
+            return result
         end
     end
-    class Sequence
-        attr_reader :min
-        attr_reader :max
-        def initialize(min, max, step=1)
-            @min = min
-            @max = max
-            @step = step
-        end
-        def step(step)
-            @step = step
-        end
+    ##
+    # Represents a hash of DataFrame
+    class DataFrameHash < Hash
-        def each
-            (@min..@max).step(@step) do |v|
-                yield v
-            end
-        end
+        ##
+        # Returns a data-frame with the rows in all the data-frames together (if compatible).
-        def to_a
-            result = []
-            self.each do |v|
-                result << v
+        def bind_all
+            return nil if self.values.size == 0
+            result = self.values.first.clone
+            for i in 1...self.values.size
+                result .bind_rows!(self.values[i])
             end
             return result
         end
-        def to_R
-            "seq(from=#@min, to=#@max, by=#@step)"
-        end
-    end
-end
-class TrueClass
-    def to_R
-        "TRUE"
-    end
-end
-class FalseClass
-    def to_R
-        "FALSE"
-    end
-end
-class Object
-    def to_R
-        raise TypeError, "Unsupported type for #{self.class}"
     end
 end
-class NilClass
-    def to_R
-        return "NULL"
-    end
-end
-class Numeric
-    def to_R
-        self.inspect
-    end
-end
-class Float
-    def to_R
-        return self.nan? ? "NA" : super
-    end
-end
-class Array
-    def to_R
-        return "c(#{self.map { |e| e.to_R }.join(",")})"
-    end
-end
-class String
-    def to_R
-        return self.inspect
-    end
-end
-class Range
-    def to_R
-        [range.min, range.max].to_R
-    end
-end
-module Rust::RBindings
-    def read_csv(filename, **options)
-        Rust::CSV.read(filename, **options)
-    end
-    def write_csv(filename, dataframe, **options)
-        Rust::CSV.write(filename, dataframe, **options)
-    end
-    def data_frame(*args)
-        Rust::DataFrame.new(*args)
-    end
-end
-def bind_r!
-    include Rust::RBindings
-end