ml4r 0.1.4 → 0.1.5
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
- data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
- data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
- data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
- data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
- data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
- data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
- data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
- data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
- data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
- data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
- data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
- data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
- data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
- data/ext/ml4r/ml4r.cpp +34 -0
- data/ext/ml4r/ml4r_wrap.cpp +15727 -0
- data/ext/ml4r/utils/MathUtils.cpp +204 -0
- data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
- data/ext/ml4r/utils/Utils.cpp +14 -0
- data/ext/ml4r/utils/VlcMessage.cpp +3 -0
- metadata +33 -1
data/ext/ml4r/LinearRegression/LinearRegression.cpp
@@ -0,0 +1,305 @@

#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/foreach.hpp>
#include <cmath>
#include <iostream>
using std::cout;
using std::endl;

#include "LinearRegression/LinearRegression.h"
#include "utils/MatrixInversion.h"
#include "utils/Utils.h"
namespace ublas = boost::numeric::ublas;

using std::vector;
using ublas::prod;
using ublas::matrix;

void LinearRegression::setWeights(vector<double> weights)
{
    m_ws = weights;
}

void LinearRegression::setFixedConstant(double val)
{
    m_constant = val;
    m_constantIsFixed = true;
}

pair<vector<double>,double> LinearRegression::getParameterEstimates()
{
    return make_pair(m_bs, m_constant);
}

void LinearRegression::checkDimensions()
{
    if (!m_ys.size())
        throw std::runtime_error("[LinearRegression] Number of observations equals zero");

    if (m_xs.size() != m_ys.size())
        throw std::runtime_error("[LinearRegression] Number of observations in x doesn't match number of observations in y");

    if (m_ws.size() && m_ws.size() != m_ys.size())
        throw std::runtime_error("[LinearRegression] Number of specified weights doesn't match number of observations");

    unsigned long dimensionOfX = m_xs.front().size();
    BOOST_FOREACH(vector<double>& x, m_xs)
        if (x.size() != dimensionOfX)
            throw std::runtime_error("[LinearRegression] Dimensions of x variables are inconsistent between observations");
}

void LinearRegression::calculateStatistics()
{
    if (!m_paramsAreValid)
        throw std::runtime_error("[LinearRegression] Parameters have not been estimated");

    calculateModelStatistics();
    calculateParameterStatistics();
}

void LinearRegression::calculateParameterStatistics2()
{
    // form the matrix X'X
    ublas::matrix<double> X(m_xs.size(), m_xs.front().size() + 1);
    ublas::matrix<double>::iterator2 matrixIterator = X.begin2();
    BOOST_FOREACH(vector<double>& row, m_xs)
    {
        matrixIterator = std::copy(row.begin(), row.end(), matrixIterator);
        *(matrixIterator++) = 1.0;
    }
    ublas::matrix<double> X_transpose_X = ublas::prod(ublas::trans(X), X);

    // Invert the matrix
    ublas::matrix<double> X_transpose_X_inverse(X_transpose_X);
    InvertMatrix(X_transpose_X, X_transpose_X_inverse);

    // Also construct a t-stat for the constant
    if (!m_constantIsFixed) m_bs.push_back(m_constant);

    m_tStatistics.resize(m_bs.size());
    for (unsigned int i = 0; i < m_bs.size(); ++i)
    {
        m_tStatistics.at(i) = m_bs.at(i) / (m_s * sqrt(X_transpose_X_inverse(i,i)));
    }

    if (!m_constantIsFixed) m_bs.pop_back();
}

void LinearRegression::calculateModelStatistics()
{
    checkDimensions();
    checkParametersAreEstimated();
    estimateYs();

    double meanY = Utils::vectorSum(m_ys) / m_n;
    double sumSquaresTotal = 0.0;
    double sumSquaresRegression = 0.0;
    double sumSquaresError = 0.0;
    double meanWeight = Utils::vectorSum(m_ws) / m_n;
    for (int i = 0; i < m_n; ++i)
    {
        sumSquaresTotal += m_ws.at(i) / meanWeight * pow(m_ys.at(i) - meanY, 2.0);
        sumSquaresRegression += m_ws.at(i) / meanWeight * pow(m_fittedYs.at(i) - meanY, 2.0);
        sumSquaresError += m_ws.at(i) / meanWeight * pow(m_ys.at(i) - m_fittedYs.at(i), 2.0);
    }

    double meanSquaredRegression = sumSquaresRegression / (m_k);

    m_rSquared = 1.0 - (sumSquaresError / sumSquaresTotal);
    m_adjustedRSquared = 1.0 - (sumSquaresError / (m_n - m_p)) / (sumSquaresTotal / (m_n - 1));
    m_fStatistic = (m_n - m_p) * sumSquaresRegression / (sumSquaresError * m_k);
    m_sSquared = 1.0 / (m_n - m_p) * sumSquaresError;
    m_s = sqrt(m_sSquared);

    // diagonal of the hat matrix H = X.A, where A = (X'WX)^-1 X'W;
    // h_ii is the leverage of observation i
    m_h_diagonal.resize(m_n, 0.0);
    for (int i = 0; i < m_n; ++i)
    {
        double sumProduct = 0.0;
        for (int j = 0; j < m_p; ++j)
            sumProduct += m_X(i, j) * m_A(j, i);
        m_h_diagonal.at(i) = sumProduct;
    }

    m_pressStatistic = 0.0;
    m_presarStatistic = 0.0;

    m_predictedYs.resize(m_n);
    for (int i = 0; i < m_n; ++i)
    {
        double ei = m_fittedYs.at(i) - m_ys.at(i);
        double hii = m_h_diagonal.at(i);
        double ei_prediction = ei / (1.0 - hii); // leave-one-out (PRESS) residual, no refit needed
        m_predictedYs.at(i) = m_ys.at(i) + ei_prediction;
        m_presarStatistic += m_ws.at(i) / meanWeight * std::fabs(ei_prediction);
        m_pressStatistic += m_ws.at(i) / meanWeight * pow(ei_prediction, 2.0);
    }
    m_rSquaredPrediction = 1.0 - m_pressStatistic / sumSquaresTotal;
}

void LinearRegression::estimateYs()
{
    m_fittedYs.clear();
    m_fittedYs.resize(m_ys.size(), m_constant);
    for (unsigned int i = 0; i < m_ys.size(); ++i)
    {
        for (unsigned int j = 0; j < m_bs.size(); ++j)
            m_fittedYs.at(i) += m_bs.at(j) * m_xs.at(i).at(j);
    }
}

void LinearRegression::checkParametersAreEstimated()
{
    if (!m_paramsAreValid)
        throw std::runtime_error("[LinearRegression] Parameters have not been estimated");
}

double LinearRegression::getRSquared()
{
    return m_rSquared;
}

double LinearRegression::getFstatistic()
{
    return m_fStatistic;
}

vector<double>& LinearRegression::getFittedYs()
{
    return m_fittedYs;
}

vector<double>& LinearRegression::getTstatistics()
{
    return m_tStatistics;
}

void LinearRegression::populateMembers()
{
    m_k = m_xs.front().size();
    m_p = m_k + (m_constantIsFixed ? 0 : 1);
    m_n = m_xs.size();

    // populate m_X
    m_X.resize(m_n, m_p);
    ublas::matrix<double>::iterator2 matrixIterator = m_X.begin2();
    BOOST_FOREACH(vector<double>& row, m_xs)
    {
        matrixIterator = std::copy(row.begin(), row.end(), matrixIterator);
        if (!m_constantIsFixed) *(matrixIterator++) = 1.0;
    }

    // populate m_Y
    m_Y.resize(m_n, 1);
    ublas::matrix<double>::iterator1 matrixIterator2 = m_Y.begin1();
    BOOST_FOREACH(double& y, m_ys)
    {
        (*matrixIterator2) = y;
        ++matrixIterator2;
    }

    // populate m_ws with 1's if it's not already defined
    if (!m_ws.size())
    {
        m_ws.resize(m_n, 1.0);
    }

    // form the matrix X' [P x N]
    m_Xtranspose = ublas::trans(m_X);

    // form the matrix X'WX: [P x N] . [N x N] . [N x P] => [P x P]
    m_Xtranspose_W_X.resize(m_p, m_p);
    m_Xtranspose_W = multiplyMatrixByWeights(m_Xtranspose);
    m_Xtranspose_W_X = ublas::prod(m_Xtranspose_W, m_X);

    // Invert the matrix
    m_Xtranspose_W_X_inverse.resize(m_p, m_p);
    InvertMatrix(m_Xtranspose_W_X, m_Xtranspose_W_X_inverse);
}

void LinearRegression::calculateParameterStatistics()
{
    m_tStatistics.resize(m_p);
    m_standardErrors.resize(m_p);

    // var(b) = s^2 * AA', where A = (X'WX)^-1 X'W; the weights make this more
    // involved than the unweighted (X'X)^-1 formula
    ublas::matrix<double> AAt = prod(m_A, ublas::trans(m_A));
    for (int i = 0; i < m_p; ++i)
    {
        m_standardErrors.at(i) = m_s * sqrt(AAt(i,i));
        m_tStatistics.at(i) = m_B(i,0) / m_standardErrors.at(i);
    }
}

double LinearRegression::getPressStatistic()
{
    return m_pressStatistic;
}

double LinearRegression::getPresarStatistic()
{
    return m_presarStatistic;
}

double LinearRegression::getRSquaredPrediction()
{
    return m_rSquaredPrediction;
}

vector<double>& LinearRegression::getPredictedYs()
{
    return m_predictedYs;
}

double LinearRegression::getAdjustedRSquared()
{
    return m_adjustedRSquared;
}

matrix<double> LinearRegression::multiplyMatrixByWeights(matrix<double>& mat)
{
    if (mat.size2() != m_ws.size())
        throw std::runtime_error("[LinearRegression::multiplyMatrixByWeights] invalid matrix dimensions!");

    matrix<double> new_matrix = mat; // copy
    for (unsigned int j = 0; j < new_matrix.size2(); ++j) // each column
    {
        double weight = m_ws.at(j);
        for (unsigned int i = 0; i < new_matrix.size1(); ++i) // each row
            new_matrix(i,j) *= weight;
    }
    return new_matrix;
}

matrix<double> LinearRegression::multiplyWeightsByMatrix(matrix<double>& mat)
{
    if (mat.size1() != m_ws.size())
        throw std::runtime_error("[LinearRegression::multiplyWeightsByMatrix] invalid matrix dimensions!");

    matrix<double> new_matrix = mat; // copy
    for (unsigned int i = 0; i < new_matrix.size1(); ++i) // each row
    {
        double weight = m_ws.at(i);
        for (unsigned int j = 0; j < new_matrix.size2(); ++j) // each column
            new_matrix(i,j) *= weight;
    }
    return new_matrix;
}

vector<double>& LinearRegression::getStandardErrors()
{
    return m_standardErrors;
}

double LinearRegression::getSSquared()
{
    return m_sSquared;
}
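For reference, the statistics assembled in calculateModelStatistics and calculateParameterStatistics above are the standard weighted least-squares quantities. In the notation below (the symbols are mine, not the package's), $n$ is the number of observations, $k$ the number of x variables, $p$ the number of estimated parameters, $\hat y_i$ the fitted values, and $\tilde w_i = w_i/\bar w$ the mean-normalised weights:

\[
\mathrm{SS_{tot}} = \sum_{i=1}^{n} \tilde w_i (y_i - \bar y)^2, \qquad
\mathrm{SS_{reg}} = \sum_{i=1}^{n} \tilde w_i (\hat y_i - \bar y)^2, \qquad
\mathrm{SS_{err}} = \sum_{i=1}^{n} \tilde w_i (y_i - \hat y_i)^2,
\]
\[
R^2 = 1 - \frac{\mathrm{SS_{err}}}{\mathrm{SS_{tot}}}, \qquad
R^2_{\mathrm{adj}} = 1 - \frac{\mathrm{SS_{err}}/(n-p)}{\mathrm{SS_{tot}}/(n-1)}, \qquad
F = \frac{\mathrm{SS_{reg}}/k}{\mathrm{SS_{err}}/(n-p)}, \qquad
s^2 = \frac{\mathrm{SS_{err}}}{n-p}.
\]

The PRESS loop uses the diagonal $h_{ii}$ of the hat matrix $H = XA = X(X'WX)^{-1}X'W$ to obtain each leave-one-out residual without refitting the model:

\[
e_{(i)} = \frac{\hat y_i - y_i}{1 - h_{ii}}, \qquad
\mathrm{PRESS} = \sum_{i=1}^{n} \tilde w_i\, e_{(i)}^2, \qquad
\mathrm{PRESAR} = \sum_{i=1}^{n} \tilde w_i\, \lvert e_{(i)} \rvert, \qquad
R^2_{\mathrm{pred}} = 1 - \frac{\mathrm{PRESS}}{\mathrm{SS_{tot}}}.
\]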
data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp
@@ -0,0 +1,75 @@

#include "LinearRegression/OLSLinearRegression.h"
#include "utils/MathUtils.h"
#include "utils/Utils.h"

#include <iostream>
#include <boost/numeric/ublas/io.hpp>
using std::cout;
using std::endl;

namespace ublas = boost::numeric::ublas;
using Utils::operator+=;
using ublas::matrix;
using ublas::prod;

OLSLinearRegression::OLSLinearRegression(std::vector<double> xs, std::vector<double> ys,
                                         std::vector<double> weights)
    : LinearRegression(xs, ys, weights)
{
    calculate();
}

OLSLinearRegression::OLSLinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys,
                                         std::vector<double> weights)
    : LinearRegression(xs, ys, weights)
{
    calculate();
}

OLSLinearRegression::OLSLinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys,
                                         double fixedConstant, std::vector<double> weights)
    : LinearRegression(xs, ys, fixedConstant, weights)
{
    calculate();
}

OLSLinearRegression::~OLSLinearRegression()
{}

void OLSLinearRegression::calculate()
{
    checkDimensions();

    // matrix-based implementation:
    //   b = inverse(X'WX) X'W y
    // where X is the data matrix (rows are observations, columns are our x variables; if a constant
    //   is to be estimated, an extra last column is set to 1, and the last estimated parameter will
    //   be the constant),
    // X' is the transpose of X,
    // W is the diagonal matrix diag(w1, w2, w3, ...), where wi is the weight of observation i,
    // y is the column matrix containing the observed y's.
    populateMembers();
    EstimateBs();
    if (m_paramsAreValid) calculateStatistics();
}

void OLSLinearRegression::EstimateBs()
{
    matrix<double> Y = m_Y;
    if (m_constantIsFixed)
    {
        for (int i = 0; i < m_n; ++i) Y(i, 0) -= m_constant;
    }

    m_A = prod(m_Xtranspose_W_X_inverse, m_Xtranspose_W);
    m_B = prod(m_A, Y);

    // set m_bs and constant
    m_bs.resize(m_k);
    for (int i = 0; i < m_k; ++i)
        m_bs.at(i) = m_B(i, 0);

    if (!m_constantIsFixed)
        m_constant = m_B(m_p - 1, 0);

    m_paramsAreValid = true;
}
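The estimator implemented by EstimateBs is the weighted least-squares formula from the comment in calculate() (notation mine):

\[
A = (X'WX)^{-1} X'W, \qquad \hat b = A\,y, \qquad W = \mathrm{diag}(w_1, \dots, w_n),
\]

with $y$ replaced by $y - c$ when the constant is fixed at $c$ (no column of ones is appended in that case). Below is a minimal usage sketch; it assumes the headers under data/ext/ml4r expose the members shown in this diff, and the data values are invented for illustration.

// ols_example.cpp -- hypothetical example, not part of the package.
#include <iostream>
#include <utility>
#include <vector>
#include "LinearRegression/OLSLinearRegression.h"

int main()
{
    // Five observations of two x variables (made-up numbers).
    std::vector<std::vector<double> > xs;
    double rows[5][2] = { {1.0, 2.0}, {2.0, 3.0}, {3.0, 5.0}, {4.0, 4.0}, {5.0, 7.0} };
    for (int i = 0; i < 5; ++i)
        xs.push_back(std::vector<double>(rows[i], rows[i] + 2));

    double yvals[5] = { 3.1, 5.0, 8.2, 6.9, 11.8 };
    std::vector<double> ys(yvals, yvals + 5);

    std::vector<double> weights; // left empty: populateMembers() fills it with 1.0

    OLSLinearRegression ols(xs, ys, weights); // fits in the constructor

    // getParameterEstimates() returns (slopes, constant); see LinearRegression.cpp above.
    std::pair<std::vector<double>, double> fit = ols.getParameterEstimates();
    std::cout << "b1 = " << fit.first.at(0)
              << ", b2 = " << fit.first.at(1)
              << ", constant = " << fit.second
              << ", R^2 = " << ols.getRSquared() << std::endl;
    return 0;
}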
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp
@@ -0,0 +1,50 @@

#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"

DecisionTreeExperiment::DecisionTreeExperiment()
    : MLExperiment()
{}

DecisionTreeExperiment::DecisionTreeExperiment(shared_ptr<MLExperiment> mlExperiment)
    : MLExperiment(mlExperiment)
{}

DecisionTreeExperiment::~DecisionTreeExperiment()
{}

void DecisionTreeExperiment::setF(double f)
{
    m_F = f;
}

void DecisionTreeExperiment::setZ(double z)
{
    m_Z = z;
}

double DecisionTreeExperiment::getF()
{
    return m_F;
}

double DecisionTreeExperiment::getZ()
{
    return m_Z;
}

void DecisionTreeExperiment::incrementF(double increment)
{
    m_F += increment;
}

double DecisionTreeExperiment::getY()
{
    return m_yValue;
}
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp
@@ -0,0 +1,195 @@

#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
#include "MachineLearning/DecisionTree/SplitDefinition.h"
#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
#include "utils/Utils.h"

#include <stdexcept>
using std::runtime_error;

bool DecisionTreeNode::m_missingValueDefined = false;
double DecisionTreeNode::m_missingValue = -1.0;

DecisionTreeNode::DecisionTreeNode(vector<shared_ptr<DecisionTreeExperiment> > experiments,
                                   double sumZ,
                                   double sumW,
                                   Partition partition,
                                   shared_ptr<SplitDefinition> parentSplitDefinition)
    : m_experiments(experiments), m_nodeHasChildren(false), m_sumZ(sumZ), m_sumW(sumW),
      m_whichPartitionAmI(partition), m_parentSplitDefinition(parentSplitDefinition)
{}

DecisionTreeNode::~DecisionTreeNode()
{}

shared_ptr<DecisionTreeNode> DecisionTreeNode::getTerminalNodeForExperiment(shared_ptr<DecisionTreeExperiment> experiment)
{
    if (!m_nodeHasChildren)
        throw std::runtime_error("Node is a terminal node, so you shouldn't ask it for a terminal node!");

    if (m_splitDefinition.get() == 0)
        throw std::runtime_error("Node has children, but split definition is empty");

    shared_ptr<DecisionTreeNode> childForExperiment = chooseChild(experiment);

    if (childForExperiment.get() == 0)
        return childForExperiment;
    else if (childForExperiment->getSumW() == 0)
    {
        // this likely means that the value is missing, but there weren't any missing values in the
        // bagged training set. Therefore, there is no weight in the missing child.
        // return an empty pointer, and this DecisionTreeNode will become the one chosen.
        return shared_ptr<DecisionTreeNode>();
    }
    else if (childForExperiment->isTerminalNode())
        return childForExperiment;
    else
    {
        shared_ptr<DecisionTreeNode> terminalNode = childForExperiment->getTerminalNodeForExperiment(experiment);
        if (terminalNode.get() == 0)
        {
            // we have encountered a NEW category, therefore we couldn't split on childForExperiment;
            // therefore, return the child itself.
            return childForExperiment;
        }
        else
            return terminalNode;
    }
}

shared_ptr<DecisionTreeNode> DecisionTreeNode::chooseChild(shared_ptr<DecisionTreeExperiment> experiment)
{
    if (!m_nodeHasChildren)
        throw std::runtime_error("[DecisionTreeNode::chooseChild] - this Decision Tree has no children!");

    double featureValue = experiment->getFeatureValue(m_splitDefinition->getFeatureIndex());

    if (m_missingValueDefined && m_missingValue == featureValue)
        return m_missingChild;

    if (m_splitDefinition->isCategorical()) // categorical variable
    {
        if (Utils::hasElement(m_splitDefinition->getLhsCategories(), featureValue))
            return m_lhsChild;
        else if (Utils::hasElement(m_splitDefinition->getRhsCategories(), featureValue))
            return m_rhsChild;
        else
        {
            // it's not missing, but not in left or right. Therefore, we have a NEW category.
            // We should return an empty pointer, and let the parent handle it.
            return shared_ptr<DecisionTreeNode>();
        }
    }
    else // continuous variable
    {
        double splitValue = m_splitDefinition->getSplitValue();
        if (m_missingValueDefined && m_missingValue == splitValue)
        {
            // complicated logic: our split value equals the missing value. Therefore, we split off
            // missing versus everything else (which gets put in the rhsChild). As our feature value
            // is not the missing value, we choose the rhsChild.
            return m_rhsChild;
        }
        else if (featureValue < splitValue)
            return m_lhsChild;
        else
            return m_rhsChild;
    }
}

void DecisionTreeNode::defineSplit(shared_ptr<SplitDefinition> splitDefinition,
                                   shared_ptr<DecisionTreeNode> lhsChild,
                                   shared_ptr<DecisionTreeNode> rhsChild,
                                   shared_ptr<DecisionTreeNode> missingChild)
{
    setChildren(lhsChild, rhsChild, missingChild);
    m_splitDefinition = splitDefinition;
}

void DecisionTreeNode::setChildren(shared_ptr<DecisionTreeNode> lhsChild,
                                   shared_ptr<DecisionTreeNode> rhsChild,
                                   shared_ptr<DecisionTreeNode> missingChild)
{
    m_nodeHasChildren = true;
    m_lhsChild = lhsChild;
    m_rhsChild = rhsChild;
    m_missingChild = missingChild;
}

vector<shared_ptr<DecisionTreeExperiment> > DecisionTreeNode::getExperiments()
{
    return m_experiments;
}

bool DecisionTreeNode::isTerminalNode()
{
    return !m_nodeHasChildren;
}

void DecisionTreeNode::clearExperimentsWithinTree()
{
    m_experiments.clear();
    if (m_nodeHasChildren)
    {
        m_lhsChild->clearExperimentsWithinTree();
        m_rhsChild->clearExperimentsWithinTree();
        m_missingChild->clearExperimentsWithinTree();
    }
}

double DecisionTreeNode::getSumZ()
{
    return m_sumZ;
}

double DecisionTreeNode::getSumW()
{
    return m_sumW;
}

void DecisionTreeNode::setMissingValue(double missingValue)
{
    m_missingValue = missingValue;
    m_missingValueDefined = true;
}

shared_ptr<SplitDefinition> DecisionTreeNode::getSplitDefinition()
{
    return m_splitDefinition;
}

shared_ptr<SplitDefinition> DecisionTreeNode::getParentSplitDefinition()
{
    return m_parentSplitDefinition;
}

Partition DecisionTreeNode::getPartition()
{
    return m_whichPartitionAmI;
}

void DecisionTreeNode::setSumZ(double sumZ)
{
    m_sumZ = sumZ;
}

void DecisionTreeNode::setSumW(double sumW)
{
    m_sumW = sumW;
}

void DecisionTreeNode::updateSums()
{
    m_sumW = 0.0;
    m_sumZ = 0.0;
    for (unsigned int i = 0; i < m_experiments.size(); ++i)
    {
        double w = m_experiments.at(i)->getWeight();
        m_sumW += w;
        m_sumZ += w * m_experiments.at(i)->getZ();
    }
}
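A hypothetical scoring helper, sketched against the interface above. Per the comments in getTerminalNodeForExperiment, an empty pointer signals an unseen category or an empty missing-value child, and the caller falls back to the node where the descent stopped. Treating sumZ/sumW as the node's value is my assumption (this diff does not show how the estimators consume terminal nodes), as is reading the unqualified shared_ptr in these files as boost::shared_ptr.

// score_sketch.cpp -- hypothetical helper, not part of the package.
#include <boost/shared_ptr.hpp>
#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"

double scoreExperiment(boost::shared_ptr<DecisionTreeNode> root,
                       boost::shared_ptr<DecisionTreeExperiment> experiment)
{
    boost::shared_ptr<DecisionTreeNode> node = root;
    if (!root->isTerminalNode())
    {
        boost::shared_ptr<DecisionTreeNode> terminal =
            root->getTerminalNodeForExperiment(experiment);
        // Empty pointer: unseen category or empty missing child; keep the node
        // where the descent stopped as the effective terminal node.
        if (terminal.get() != 0)
            node = terminal;
    }
    // Assumed node value: weighted mean of the working response Z.
    return node->getSumZ() / node->getSumW();
}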