RubyGems - ml4r - Versions diffs - 0.1.2 → 0.1.4 - Mend

ml4r 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
data/ext/ml4r/extconf.rb +16 -3
data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
data/ext/ml4r/utils/StochasticUtils.h +33 -0
data/ext/ml4r/utils/Utils.h +147 -0
data/ext/ml4r/utils/VlcMessage.h +44 -0
data/lib/ml4r/linear_regression.rb +7 -0
metadata +45 -13
data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
data/ext/ml4r/OLSLinearRegression.h +0 -23
data/ext/ml4r/Utils.h +0 -53
data/ext/ml4r/example.h +0 -18
data/ext/ml4r/swig/example.h +0 -13
data/ext/ml4r/swig/example_wrap.c +0 -2093
data/ext/ml4r/utils/RubyUtils.h +0 -174

data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} RENAMED

@@ -8,37 +8,43 @@ using boost::numeric::ublas::matrix;
 #include <utility>
 using std::pair;
-//#include "ZenithBase.h"
-//#include "boost/MatrixInverse.h"
 class LinearRegression
 {
 public:
-	 LinearRegression() : m_constantIsFixed(false), m_paramsAreValid(false) {}
+	LinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys,
+	                 std::vector<double> weights = std::vector<double>())
+	: m_xs(xs), m_ys(ys), m_ws(weights), m_constantIsFixed(false), m_paramsAreValid(false) {}
+	LinearRegression(std::vector<double> xs, std::vector<double> ys,
+                     std::vector<double> weights = std::vector<double>())
+    : m_ys(ys), m_ws(weights), m_constantIsFixed(false), m_paramsAreValid(false)
+    {
+        m_xs.resize(xs.size());
+        for (unsigned int i=0; i<xs.size(); ++i) m_xs.at(i).resize(1, xs.at(i));
+    }
+	LinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys, double fixedConstant,
+                     std::vector<double> weights = std::vector<double>())
+    : m_xs(xs), m_ys(ys), m_ws(weights), m_constantIsFixed(true), m_constant(fixedConstant), m_paramsAreValid(false) {}
 	~LinearRegression() {}
-    void setXs(std::vector<std::vector<double> > xs);
-    void setYs(std::vector<double> ys);
-    void setWeights(std::vector<double> weights);
     void setFixedConstant(double val);
     pair<std::vector<double>,double>  getParameterEstimates();
     std::vector<double>&              getFittedYs();
     std::vector<double>&              getPredictedYs();
-    // double                       GetConstant();
-    double                       getRSquared();
-    double                       getSSquared();
-    double                       getFstatistic();
+    // double                         GetConstant();
+    double                            getRSquared();
+    double                            getSSquared();
+    double                            getFstatistic();
     std::vector<double>&              getTstatistics();
     std::vector<double>&              getStandardErrors();
-    double                       getPressStatistic();
-    double                       getPresarStatistic();
-    double                       getAdjustedRSquared();
-    double                       getRSquaredPrediction();
-    // BOOM THIS IS THE PROBLEM HERE - CAN'T INSTANTIATE A PURE VIRTUAL CLASS
-    virtual void Execute() = 0;
+    double                            getPressStatistic();
+    double                            getPresarStatistic();
+    double                            getAdjustedRSquared();
+    double                            getRSquaredPrediction();
 protected:
@@ -75,7 +81,7 @@ protected:
     boost::numeric::ublas::matrix<double>   m_Y;
     boost::numeric::ublas::matrix<double>   m_B; // m_B = prod(m_A, m_Y)
     boost::numeric::ublas::matrix<double>   m_A; // m_A = (X'WX)-1 X'W
-    std::vector<double>     m_h_diagonal; // hat ublas::matrix = XA. This is the diagonal of it.
+    std::vector<double>                     m_h_diagonal; // hat ublas::matrix = XA. This is the diagonal of it.
     boost::numeric::ublas::matrix<double>   m_Xtranspose;
     boost::numeric::ublas::matrix<double>   m_Xtranspose_W;
     boost::numeric::ublas::matrix<double>   m_Xtranspose_W_X;

data/ext/ml4r/LinearRegression/OLSLinearRegression.h ADDED

@@ -0,0 +1,29 @@
+#ifndef OLSLinearRegression_h__
+#define OLSLinearRegression_h__
+#include "LinearRegression.h"
+class OLSLinearRegression : public LinearRegression
+{
+public:
+	OLSLinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys,
+	                    std::vector<double> weights = std::vector<double>());
+    OLSLinearRegression(std::vector<double> xs, std::vector<double> ys,
+    	                std::vector<double> weights = std::vector<double>());
+    OLSLinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys, double fixedConstant,
+                        std::vector<double> weights = std::vector<double>());
+	~OLSLinearRegression();
+    void calculate();
+    void EstimateBs();
+protected:
+private:
+};
+#endif // OLSLinearRegression_h__

data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h ADDED

@@ -0,0 +1,32 @@
+#ifndef __CATEGORY_INFO_H__
+#define __CATEGORY_INFO_H__
+struct CategoryInfo
+{
+    CategoryInfo()
+     : sumZ(0), sumW(0), countN(0), meanZ(0), category(-1)
+    {
+    }
+    vector<shared_ptr<DecisionTreeExperiment> > experiments;
+    double sumW;
+    double sumZ;
+    int    countN;
+    double meanZ;
+    double    category;
+    void addExperiment(shared_ptr<DecisionTreeExperiment> experiment)
+    {
+        experiments.push_back(experiment);
+        sumW += experiment->getWeight();
+        sumZ += experiment->getWeight() * experiment->getZ();
+        countN += 1;
+        meanZ = sumZ / sumW;
+    }
+    // define operator to sort on meanZ
+    bool operator<(const CategoryInfo& rhs) const { return this->meanZ < rhs.meanZ; }
+};
+#endif // __CATEGORY_INFO_H__

data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h ADDED

@@ -0,0 +1,30 @@
+#ifndef DecisionTreeExperiment_h__
+#define DecisionTreeExperiment_h__
+#include "MachineLearning/MLExperiment.h"
+#include <boost/shared_ptr.hpp>
+using boost::shared_ptr;
+class DecisionTreeExperiment : public MLExperiment
+{
+public:
+	DecisionTreeExperiment();
+    DecisionTreeExperiment(shared_ptr<MLExperiment> mlExperiment);
+	~DecisionTreeExperiment();
+    void    setF(double f);
+    void    setZ(double z);
+    double  getF();
+    double  getY();
+    double  getZ();
+    void    incrementF(double increment);
+protected:
+    double m_F; // modelled estimate (utility in the case of bernoulli)
+    double m_Z; // the thing which decision trees attempt to partition
+private:
+};
+#endif // DecisionTreeExperiment_h__

data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h ADDED

@@ -0,0 +1,86 @@
+#ifndef __DecisionTreeNode_h__
+#define __DecisionTreeNode_h__
+#include <vector>
+#include <boost/shared_ptr.hpp>
+#include <set>
+#include <map>
+using std::map;
+using std::set;
+using boost::shared_ptr;
+using std::vector;
+class DecisionTreeExperiment;
+class SplitDefinition;
+enum Partition
+{
+    ROOT,
+    LHS,
+    RHS,
+    MISSING,
+};
+class DecisionTreeNode
+{
+public:
+    DecisionTreeNode(   vector<shared_ptr<DecisionTreeExperiment> > experiments,
+                        double sumZ,
+                        double sumW,
+                        Partition partition,
+                        shared_ptr<SplitDefinition> parentSplitDefinition);
+    ~DecisionTreeNode();
+    shared_ptr<DecisionTreeNode> getTerminalNodeForExperiment(shared_ptr<DecisionTreeExperiment> experiment);
+    void defineSplit(shared_ptr<SplitDefinition> splitDefinition,
+                     shared_ptr<DecisionTreeNode> lhsChild,
+                     shared_ptr<DecisionTreeNode> rhsChild,
+                     shared_ptr<DecisionTreeNode> missingChild);
+    vector<shared_ptr<DecisionTreeExperiment> > getExperiments();
+    bool isTerminalNode();
+    void clearExperimentsWithinTree();
+    double getSumZ();
+    double getSumW();
+    void   setSumZ(double sumZ);
+    void   setSumW(double sumW);
+    void   updateSums();
+    shared_ptr<SplitDefinition> getSplitDefinition();
+    shared_ptr<SplitDefinition> getParentSplitDefinition();
+    Partition                   getPartition();
+    static void setMissingValue(double missingValue);
+protected:
+    shared_ptr<DecisionTreeNode> chooseChild(shared_ptr<DecisionTreeExperiment> experiment);
+    void setChildren(shared_ptr<DecisionTreeNode> lhsChild,
+                     shared_ptr<DecisionTreeNode> rhsChild,
+                     shared_ptr<DecisionTreeNode> missingChild);
+    // if this decision tree node has been further split, the following variables will be populated, otherwise they will be null!
+    bool                                    m_nodeHasChildren;
+    shared_ptr<DecisionTreeNode>            m_lhsChild;
+    shared_ptr<DecisionTreeNode>            m_rhsChild;
+    shared_ptr<DecisionTreeNode>            m_missingChild;
+    vector<shared_ptr<DecisionTreeExperiment> > m_experiments;
+    static bool                             m_missingValueDefined;
+    static double                           m_missingValue;
+    double                                  m_sumZ;
+    double                                  m_sumW;
+    Partition                               m_whichPartitionAmI;
+    shared_ptr<SplitDefinition>             m_parentSplitDefinition;
+    shared_ptr<SplitDefinition>             m_splitDefinition;
+};
+#endif // DecisionTreeNode_h__

data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h ADDED

@@ -0,0 +1,31 @@
+#ifndef FeatureInteraction_h__
+#define FeatureInteraction_h__
+#include <boost/shared_ptr.hpp>
+using boost::shared_ptr;
+#include "DecisionTreeNode.h"
+class SplitDefinition;
+class FeatureInteraction
+{
+public:
+	FeatureInteraction(shared_ptr<SplitDefinition> primarySplitDefinition,shared_ptr<SplitDefinition> secondarySplitDefinition,Partition primaryPartition)
+        : primarySplitDefinition(primarySplitDefinition), secondarySplitDefinition(secondarySplitDefinition),
+          primaryPartition(primaryPartition)
+    {
+    }
+    ~FeatureInteraction() {};
+    shared_ptr<SplitDefinition> primarySplitDefinition;
+    shared_ptr<SplitDefinition> secondarySplitDefinition;
+    Partition primaryPartition;
+protected:
+private:
+};
+#endif // FeatureInteraction_h__

data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h ADDED

@@ -0,0 +1,45 @@
+#ifndef __NodeSplitter_h__
+#define __NodeSplitter_h__
+#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
+#include <boost/shared_ptr.hpp>
+using boost::shared_ptr;
+class SplitDefinition;
+class DecisionTreeNode;
+class DecisionTreeExperiment;
+class MLData;
+// pure virtual base class for NodeSplitterContinuous and NodeSplitterCategorical
+class NodeSplitter
+{
+public:
+    NodeSplitter(MLData* data, int minObservations, double scale);
+    ~NodeSplitter();
+    shared_ptr<SplitDefinition> createSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
+    shared_ptr<SplitDefinition> createContinuousSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
+    shared_ptr<SplitDefinition> createCategoricalSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
+    double calculateImprovement(double lhsSumW, double lhsSumZ, double rhsSumW, double rhsSumZ, double missingSumW, double missingSumZ);
+    shared_ptr<DecisionTreeNode> createLhsChild(shared_ptr<SplitDefinition> splitDefinition);
+    shared_ptr<DecisionTreeNode> createRhsChild(shared_ptr<SplitDefinition> splitDefinition);
+    shared_ptr<DecisionTreeNode> createMissingChild(shared_ptr<SplitDefinition> splitDefinition);
+    shared_ptr<DecisionTreeNode> createChild(shared_ptr<SplitDefinition> splitDefinition, Partition partition);
+    vector<shared_ptr<DecisionTreeExperiment> > partitionExperiments(vector<shared_ptr<DecisionTreeExperiment> >& experiments,
+        shared_ptr<SplitDefinition> splitDefinition, Partition partition);
+    vector<shared_ptr<DecisionTreeNode> > splitNode(shared_ptr<DecisionTreeNode> nodeToSplit, vector<int> featuresToConsider);
+protected:
+    MLData* m_data;
+    double  m_missingValue;
+    int     m_minObservations;
+    bool    m_missingValueDefined;
+    double  m_scale;
+};
+#endif // NodeSplitter_h__

data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h ADDED

@@ -0,0 +1,17 @@
+#ifndef __NodeSplitterCategorical_h__
+#define __NodeSplitterCategorical_h__
+#include "MachineLearning/DecisionTree/NodeSplitter.h"
+class NodeSplitterCategorical : public NodeSplitter
+{
+public:
+    NodeSplitterCategorical(MLData* data, int minObservations, double scale);
+    ~NodeSplitterCategorical();
+    shared_ptr<SplitDefinition> createSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
+protected:
+};
+#endif // NodeSplitterCategorical_h__

data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h ADDED

@@ -0,0 +1,16 @@
+#ifndef __NodeSplitterContinuous_h__
+#define __NodeSplitterContinuous_h__
+#include "MachineLearning/DecisionTree/NodeSplitter.h"
+class NodeSplitterContinuous : public NodeSplitter
+{
+public:
+    NodeSplitterContinuous(MLData* data, int minObservations, double scale);
+    ~NodeSplitterContinuous();
+    shared_ptr<SplitDefinition> createSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
+protected:
+};
+#endif // NodeSplitterContinuous_h__

data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h ADDED

@@ -0,0 +1,81 @@
+#ifndef __SplitDefinition_h__
+#define __SplitDefinition_h__
+#include <boost/shared_ptr.hpp>
+#include <set>
+using std::set;
+using boost::shared_ptr;
+class DecisionTreeExperiment;
+class DecisionTreeNode;
+class SplitDefinition
+{
+public:
+    SplitDefinition(shared_ptr<DecisionTreeNode> nodeToSplit,
+                    int       featureIndex,
+                    set<double>& lhsCategories,
+                    set<double>& rhsCategories,
+                    double    lhsSumZ,
+                    double    lhsSumW,
+                    int       lhsCount,
+                    double    rhsSumZ,
+                    double    rhsSumW,
+                    int       rhsCount,
+                    double    missingSumZ,
+                    double    missingSumW,
+                    int       missingCount,
+                    double    improvement);
+    SplitDefinition(shared_ptr<DecisionTreeNode> nodeToSplit,
+                    int    featureIndex,
+                    double splitValue,
+                    double    lhsSumZ,
+                    double    lhsSumW,
+                    int       lhsCount,
+                    double    rhsSumZ,
+                    double    rhsSumW,
+                    int       rhsCount,
+                    double    missingSumZ,
+                    double    missingSumW,
+                    int       missingCount,
+                    double improvement);
+    ~SplitDefinition();
+    int                                 getFeatureIndex();
+    double                              getImprovement();
+    shared_ptr<DecisionTreeNode>        getNodeToSplit();
+    double                              getLhsSumZ();
+    double                              getLhsSumW();
+    int                                 getLhsExperimentCount();
+    double                              getRhsSumZ();
+    double                              getRhsSumW();
+    int                                 getRhsExperimentCount();
+    double                              getMissingSumZ();
+    double                              getMissingSumW();
+    int                                 getMissingExperimentCount();
+    set<double>&                        getLhsCategories();
+    set<double>&                        getRhsCategories();
+    double                              getSplitValue();
+    bool                                isCategorical();
+protected:
+    shared_ptr<DecisionTreeNode> m_nodeToSplit;
+    int         m_splitFeatureIndex;
+    set<double> m_lhsCategories;
+    set<double> m_rhsCategories;
+    double      m_splitValue;
+    double      m_lhsSumZ;
+    double      m_rhsSumZ;
+    double      m_missingSumZ;
+    double      m_lhsSumW;
+    double      m_rhsSumW;
+    double      m_missingSumW;
+    int         m_lhsCount;
+    int         m_rhsCount;
+    int         m_missingCount;
+    double      m_improvement;
+    bool        m_featureIsCategorical;
+};
+#endif // SplitDefinition_h__