RubyGems - ml4r - Versions diffs - 0.1.2 → 0.1.4 - Mend

ml4r 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
data/ext/ml4r/extconf.rb +16 -3
data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
data/ext/ml4r/utils/StochasticUtils.h +33 -0
data/ext/ml4r/utils/Utils.h +147 -0
data/ext/ml4r/utils/VlcMessage.h +44 -0
data/lib/ml4r/linear_regression.rb +7 -0
metadata +45 -13
data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
data/ext/ml4r/OLSLinearRegression.h +0 -23
data/ext/ml4r/Utils.h +0 -53
data/ext/ml4r/example.h +0 -18
data/ext/ml4r/swig/example.h +0 -13
data/ext/ml4r/swig/example_wrap.c +0 -2093
data/ext/ml4r/utils/RubyUtils.h +0 -174

data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h ADDED Viewed

@@ -0,0 +1,29 @@
+#ifndef BernoulliCalculator_h__
+#define BernoulliCalculator_h__
+#include "MachineLearning/GBM/GBMCalculator.h"
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+// Concrete GBMCalculator declared for the Bernoulli case.
+// Each member function below re-declares, with an identical signature, one of
+// the pure virtual functions of GBMCalculator; definitions are not in this header.
+class BernoulliCalculator : public GBMCalculator
+{
+public:
+    BernoulliCalculator();
+    ~BernoulliCalculator();
+
+    // --- GBMCalculator overrides that work on a collection of experiments ---
+    virtual double calculateDeviance(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    virtual void   populateInitialF(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments,
+                                    bool useInitialPredictions);
+    virtual void   updateZ(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    virtual double computeFIncrement(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    virtual void   updatePredictions(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+
+    // --- GBMCalculator overrides that map one double to another ---
+    // NOTE(review): presumably these convert between F and prediction in
+    // opposite directions -- confirm against the implementation file.
+    virtual double calculatePrediction(double f);
+    virtual double calculateF(double prediction);
+};
+#endif // BernoulliCalculator_h__

data/ext/ml4r/MachineLearning/GBM/GBM.h ADDED Viewed

@@ -0,0 +1,50 @@
+#ifndef __GBM_h__
+#define __GBM_h__
+#include "MachineLearning/GBM/GBMParameters.h"
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+class MLData;
+class DecisionTreeExperiment;
+class MLExperiment;
+class MLDataFields;
+class DecisionTreeNode;
+class SplitDefinition;
+class GBMOutput;
+class GBMCalculator;
+// Front-end object for a GBM run: holds a public GBMParameters value, a
+// pointer to the MLData it was given, and the output/calculator objects.
+// Only declarations are present here; behaviour lives in the implementation file.
+class GBM
+{
+public:
+    GBM();
+    ~GBM();
+
+    // Estimation entry points.
+    void estimate();
+    void estimateMore(int numTrees);
+
+    // Receives a raw MLData pointer.
+    // NOTE(review): ownership of mlData is not visible from this header -- confirm.
+    void setData(MLData* mlData);
+
+    // Prediction accessors; each returns its vector by value.
+    std::vector<double> getPredictions(MLData* newData);
+    std::vector<double> getPredictions(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    std::vector<double> getMeanTrainingPredictions();
+    std::vector<double> getCrossValidationPredictions();
+
+    // Run configuration, exposed directly as a public data member.
+    GBMParameters parameters;
+
+protected:
+    // Internal steps, declared only.
+    void config();
+    void input();
+    void goNuts();
+    void output();
+    std::vector<boost::shared_ptr<DecisionTreeExperiment> >
+        makeDecisionTreeExperiments(std::vector<boost::shared_ptr<MLExperiment> >& experiments);
+
+    // State (declaration order unchanged).
+    MLData*                                      m_data;
+    std::vector<boost::shared_ptr<GBMOutput> >   m_outputObjects;
+    boost::shared_ptr<GBMCalculator>             m_gbmCalculator;
+};
+#endif // GBM_h__

data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h ADDED Viewed

@@ -0,0 +1,31 @@
+#ifndef GBMCalculator_h__
+#define GBMCalculator_h__
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+class DecisionTreeExperiment;
+// Abstract interface implemented by the distribution-specific calculators
+// (BernoulliCalculator and GaussianCalculator derive from it).
+// Every operation is pure virtual, so the class cannot be instantiated directly.
+class GBMCalculator
+{
+public:
+    GBMCalculator() {}
+    // Virtual because this is a polymorphic base handled through
+    // shared_ptr<GBMCalculator>: destroying a derived calculator through a
+    // base pointer must run the derived destructor.
+    virtual ~GBMCalculator() {}
+
+    // Operations over a collection of experiments; the vector is passed by
+    // non-const reference.
+    virtual double calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
+    virtual void   populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions) = 0;
+    virtual void   updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
+    virtual double computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
+    virtual void   updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
+
+    // Scalar mappings between an F value and a prediction.
+    // NOTE(review): presumably inverses of each other -- confirm in the implementations.
+    virtual double calculatePrediction(double f) = 0;
+    virtual double calculateF(double prediction) = 0;
+};
+#endif // GBMCalculator_h__

data/ext/ml4r/MachineLearning/GBM/GBMData.h ADDED Viewed

File without changes

data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h ADDED Viewed

@@ -0,0 +1,79 @@
+#ifndef GBMEstimator_h__
+#define GBMEstimator_h__
+#include "MachineLearning/DecisionTree/FeatureInteraction.h"
+#include "MachineLearning/MLEstimator.h"
+#include <boost/shared_ptr.hpp>
+#include <vector>
+#include <map>
+#include <set>
+#include <utility>
+using std::pair;
+using std::set;
+using std::map;
+using std::vector;
+using boost::shared_ptr;
+class GBMParameters;
+class GBMOutput;
+class DecisionTreeExperiment;
+class DecisionTreeNode;
+class SplitDefinition;
+class MLData;
+class GBMCalculator;
+class MLExperiment;
+// MLEstimator subclass declared for GBM estimation.
+// This header only declares the members; all function bodies are elsewhere.
+class GBMEstimator : public MLEstimator
+{
+public:
+    // The experiment vector and the parameter pointer are taken by value.
+    GBMEstimator(MLData* data,
+                 std::vector<boost::shared_ptr<MLExperiment> > experiments,
+                 boost::shared_ptr<GBMParameters> parameters);
+    ~GBMEstimator();
+
+    boost::shared_ptr<MLOutput> estimate();
+    void estimateMore(int numTrees);
+    std::vector<FeatureInteraction> findInteractions(int howMany);
+
+protected:
+    // --- set-up and per-iteration steps ---
+    void initializeEstimator();
+    void performIteration();
+    void constructFeatureIndices();
+    void initialiseGBMExperimentData();
+    void populateInitialF();
+    void updateZ(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    void updatePredictions(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    void sortTrainingExperiments();
+
+    // --- tree construction ---
+    void constructDecisionTree(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    void constructGenerousDecisionTree(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments,
+                                       int rfToLevel);
+    // map<int, vector<shared_ptr<DecisionTreeExperiment> > > partitionSortedExperiments(shared_ptr<SplitDefinition> splitDefinition, Partition partition);
+    std::map<int, std::vector<boost::shared_ptr<DecisionTreeExperiment> > >
+        bagSortedExperiments(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& baggedExperiments);
+    std::vector<int> getRandomFeatureList();
+
+    // --- F increments and bagging ---
+    void calculateFIncrementPerDecisionTreeNode();
+    void applyFIncrementToInBagExperiments();
+    // Returns two experiment lists as a pair.
+    // NOTE(review): presumably (in-bag, out-of-bag) -- confirm in the implementation.
+    std::pair<std::vector<boost::shared_ptr<DecisionTreeExperiment> >,
+              std::vector<boost::shared_ptr<DecisionTreeExperiment> > > bagExperiments();
+    void applyFIncrementToExperiments(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& outOfBagExperiments);
+
+    // --- reporting and clean-up ---
+    void reportDeviance(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    void deleteRedundantData();
+
+    // --- state (declaration order unchanged) ---
+    boost::shared_ptr<GBMOutput>                               m_output;
+    std::vector<boost::shared_ptr<DecisionTreeExperiment> >    m_decisionTreeExperiments;
+    boost::shared_ptr<GBMParameters>                           m_parameters;
+    std::vector<int>                                           m_featureIndices;
+    boost::shared_ptr<DecisionTreeNode>                        m_decisionTreeHead;
+    std::set<boost::shared_ptr<DecisionTreeNode> >             m_terminalNodes;
+    std::map<boost::shared_ptr<DecisionTreeNode>, double>      m_FIncrements;
+    boost::shared_ptr<GBMCalculator>                           m_gbmCalculator;
+    bool                                                       m_missingValueDefined;
+    double                                                     m_missingValue;
+};
+#endif // GBMEstimator_h__

data/ext/ml4r/MachineLearning/GBM/GBMOutput.h ADDED Viewed

@@ -0,0 +1,53 @@
+#ifndef GBMOutput_h__
+#define GBMOutput_h__
+#include "MachineLearning/GBM/GBMParameters.h"
+#include "MachineLearning/MLOutput.h"
+#include <boost/shared_ptr.hpp>
+#include <vector>
+#include <map>
+using std::map;
+using std::vector;
+using boost::shared_ptr;
+class DecisionTreeNode;
+class DecisionTreeExperiment;
+class MLData;
+class GBMCalculator;
+// MLOutput subclass that stores what a GBM estimation produced (head nodes,
+// per-tree F increments, mean Y) together with the parameters used.
+// Declarations only; the member functions are defined elsewhere.
+class GBMOutput : public MLOutput
+{
+public:
+    GBMOutput(MLData* trainingData,
+              std::vector<int> trainingExperimentIndicies,
+              boost::shared_ptr<GBMParameters> parameters);
+    ~GBMOutput();
+
+    // Inputs of the estimation.
+    boost::shared_ptr<GBMParameters> getParameters();
+
+    // Outputs of the estimation, recorded as it proceeds.
+    void setMeanY(double y);
+    void addHeadDecisionTreeNode(boost::shared_ptr<DecisionTreeNode> node);
+    void addFIncrements(std::map<boost::shared_ptr<DecisionTreeNode>, double> m_FIncrements);
+
+    // Applying the model to new data.
+    double predictForExperiment(boost::shared_ptr<MLExperiment> experiment);
+    void   setPredictionForDecisionTreeExperiment(boost::shared_ptr<DecisionTreeExperiment> experiment);
+
+    // Tree statistics.
+    int  getNumTrees();
+    void capTrees(int numTrees);
+
+protected:
+    // State (declaration order unchanged).
+    std::vector<boost::shared_ptr<DecisionTreeNode> >                  m_headNodes;
+    std::vector<std::map<boost::shared_ptr<DecisionTreeNode>, double> > m_fIncrements;
+    double                                                             m_meanY;
+    bool                                                               m_useMeanY;
+    boost::shared_ptr<GBMParameters>                                   m_parameters;
+    boost::shared_ptr<GBMCalculator>                                   m_gbmCalculator;
+};
+#endif // GBMOutput_h__

data/ext/ml4r/MachineLearning/GBM/GBMParameters.h ADDED Viewed

@@ -0,0 +1,50 @@
+#ifndef __GBMParameters_h__
+#define __GBMParameters_h__
+#include <string>
+#include <vector>
+#include <limits>
+using std::vector;
+using std::string;
+enum GBMDistribution {   // Type of the GBMParameters::distribution field; unscoped, so the enumerators are visible at file scope.
+    BERNOULLI,           // First enumerator (value 0); the GBMParameters constructor initialises distribution to this.
+    GAUSSIAN             // Second enumerator (value 1).
+};
+// Plain bag of GBM settings with public data members.
+// The default constructor supplies the defaults listed below; featuresToRun
+// and trainingExperimentIds start out empty.
+class GBMParameters
+{
+public:
+    // Initialisers are written in member-declaration order, which is the order
+    // in which C++ actually runs them regardless of how the list is written.
+    GBMParameters()
+        : tryMVariables(-1),
+          growKDecisionTreeNodes(5),
+          bagFraction(1.0),
+          shrinkageFactor(0.01),
+          numIterations(100),
+          minObservations(10),
+          distribution(BERNOULLI),
+          rfToLevel(0),
+          greedy(true),
+          scale(std::numeric_limits<double>::infinity()),
+          verbose(false)
+    {}
+    ~GBMParameters() {}
+
+    // parameters will be public
+    vector<string>  featuresToRun;          // X's for this run
+    int             tryMVariables;          // default -1
+    int             growKDecisionTreeNodes; // default 5
+    double          bagFraction;            // default 1.0
+    double          shrinkageFactor;        // default 0.01
+    int             numIterations;          // default 100
+    int             minObservations;        // default 10
+    vector<int>     trainingExperimentIds;
+    GBMDistribution distribution;           // default BERNOULLI
+    int             rfToLevel;              // default 0
+    bool            greedy;                 // default true
+    double          scale;                  // default +infinity
+    bool            verbose;                // default false
+};
+#endif // GBMParameters_h__

data/ext/ml4r/MachineLearning/GBM/GBMRunner.h ADDED Viewed

@@ -0,0 +1,35 @@
+// Include guard is specific to this header. It previously reused GBM.h's
+// guard macro, so whichever of the two headers was included second in a
+// translation unit was skipped entirely.
+#ifndef GBMRunner_h__
+#define GBMRunner_h__
+#include "MachineLearning/MLRunner.h"
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+class MLData;
+class MLExperiment;
+class FeatureInteraction;
+class GBMParameters;
+// MLRunner subclass for GBM. It exposes the GBM parameters as a public
+// shared_ptr member and declares the estimator factory function below.
+// Declarations only; member functions are defined elsewhere.
+class GBMRunner : public MLRunner
+{
+public:
+    GBMRunner();
+    ~GBMRunner();
+    void estimateMore(int numTrees);
+    void capTrees(int numTrees);
+    vector<FeatureInteraction> getFeatureInteractions(int howMany);
+    // Builds an estimator for the given data and training experiments
+    // (the experiment vector is taken by value).
+    shared_ptr<MLEstimator> createEstimator(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments);
+    shared_ptr<GBMParameters> parameters;
+protected:
+    void config();
+};
+#endif // GBMRunner_h__

data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h ADDED Viewed

@@ -0,0 +1,29 @@
+#ifndef GaussianCalculator_h__
+#define GaussianCalculator_h__
+#include "MachineLearning/GBM/GBMCalculator.h"
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+// Concrete GBMCalculator declared for the Gaussian case.
+// Each member function below re-declares, with an identical signature, one of
+// the pure virtual functions of GBMCalculator; definitions are not in this header.
+class GaussianCalculator : public GBMCalculator
+{
+public:
+    GaussianCalculator();
+    ~GaussianCalculator();
+
+    // --- GBMCalculator overrides that work on a collection of experiments ---
+    virtual double calculateDeviance(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    virtual void   populateInitialF(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments,
+                                    bool useInitialPredictions);
+    virtual void   updateZ(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    virtual double computeFIncrement(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+    virtual void   updatePredictions(std::vector<boost::shared_ptr<DecisionTreeExperiment> >& experiments);
+
+    // --- GBMCalculator overrides that map one double to another ---
+    // NOTE(review): presumably these convert between F and prediction in
+    // opposite directions -- confirm against the implementation file.
+    virtual double calculatePrediction(double f);
+    virtual double calculateF(double prediction);
+};
+#endif // GaussianCalculator_h__

data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h ADDED Viewed

@@ -0,0 +1,27 @@
+// #ifndef ZenithGBM_h__
+// #define ZenithGBM_h__
+// #include "stdafx.h"
+// // ruby interface methods
+// void zenith_gbm_Free(void* v);
+// OtInterface::VALUE zenith_gbm_New(int argc, VALUE* argv, VALUE klass);
+// OtInterface::VALUE zenith_gbm_Initialize(VALUE self);
+// OtInterface::VALUE zenith_gbm_estimate(VALUE self);
+// OtInterface::VALUE zenith_gbm_estimateMore(VALUE self, VALUE numTrees);
+// OtInterface::VALUE zenith_gbm_setFeaturesToRun(VALUE self, VALUE featuresValue);
+// OtInterface::VALUE zenith_gbm_setData(VALUE self, VALUE data);
+// OtInterface::VALUE zenith_gbm_setDistribution(VALUE self, VALUE distribution);
+// OtInterface::VALUE zenith_gbm_setTryMVariables(VALUE self, VALUE mVariablesValue);
+// OtInterface::VALUE zenith_gbm_setKTerminalNodes(VALUE self, VALUE kNodesValue);
+// OtInterface::VALUE zenith_gbm_setNumIterations(VALUE self, VALUE numIterationsValue);
+// OtInterface::VALUE zenith_gbm_setShrinkageFactor(VALUE self, VALUE shrinkageFactorValue);
+// OtInterface::VALUE zenith_gbm_setBagFraction(VALUE self, VALUE bagFractionValue);
+// OtInterface::VALUE zenith_gbm_setTrainingExperimentIds(VALUE self, VALUE experimentIdsValue);
+// OtInterface::VALUE zenith_gbm_minObservations(VALUE self, VALUE minObservations);
+// OtInterface::VALUE zenith_gbm_verbose(VALUE self, VALUE verbose);
+// OtInterface::VALUE zenith_gbm_predictions(VALUE self, VALUE newMlData);
+// OtInterface::VALUE zenith_gbm_training_predictions(VALUE self);
+// OtInterface::VALUE zenith_gbm_crossvalidation_predictions(VALUE self);
+// #endif // ZenithGBM_h__

data/ext/ml4r/MachineLearning/MLData/MLData.h ADDED Viewed

@@ -0,0 +1,77 @@
+#ifndef __MLData_h__
+#define __MLData_h__
+#include <boost/shared_ptr.hpp>
+#include <vector>
+#include <set>
+#include <map>
+#include <utility>
+#include <string>
+using std::string;
+using std::pair;
+using std::set;
+using std::map;
+using std::vector;
+using boost::shared_ptr;
+class MLExperiment;
+// Container for a set of MLExperiment objects plus the bookkeeping kept
+// alongside them: fold numbers, per-fold training / cross-validation lists,
+// feature names and indices, categorical feature indices, and an optional
+// missing-value marker. Declarations only; member functions are defined elsewhere.
+class MLData
+{
+public:
+    MLData();
+    ~MLData();
+
+    // --- experiments ---
+    void setExperiments(std::vector<boost::shared_ptr<MLExperiment> > experiments);
+    std::vector<boost::shared_ptr<MLExperiment> >& getExperiments();
+    boost::shared_ptr<MLExperiment> getExperiment(int experimentIndex);
+    boost::shared_ptr<MLExperiment> getExperimentWithId(int experimentId);
+    // vector<shared_ptr<MLExperiment> >& getExperimentsSortedOnFeature(int featureIndex);
+
+    // --- folds and training subsets ---
+    void createFolds(int numFolds, int randomSeed);
+    void setFolds(std::vector<int> folds);
+    void setTrainingExperimentIds(std::vector<int>& experimentIds);
+    void setTrainingExperiments(std::vector<boost::shared_ptr<MLExperiment> > experiments);
+    std::vector<int>& getFoldNumbers();
+    std::vector<int>  getFolds();
+    int getNumFolds();
+    std::vector<boost::shared_ptr<MLExperiment> >& getTrainingExperiments(int fold);
+    std::vector<boost::shared_ptr<MLExperiment> >& getCrossValidationExperiments(int fold);
+
+    // --- features ---
+    std::vector<std::string>& getFeatures();
+    void setFeatures(std::vector<std::string> features);
+    int  getFeatureIndex(std::string& feature);
+    void constructCategories(std::vector<std::string> categoricalFeatures);
+    std::set<int>& getCategoricalFeatureIndices();
+
+    // --- initial predictions ---
+    void setInitialPredictions(std::vector<double> initialPredictions);
+    void setInitialPredictionsDefined(bool defined);
+    bool initialPredictionsDefined();
+
+    // --- missing-value marker ---
+    void   setMissingValue(double missingValue);
+    bool   missingValueDefined();
+    double getMissingValue();
+
+protected:
+    // State (declaration order unchanged).
+    std::vector<boost::shared_ptr<MLExperiment> >                 m_experiments;
+    std::map<int, std::vector<boost::shared_ptr<MLExperiment> > > m_trainingExperiments;
+    std::map<int, std::vector<boost::shared_ptr<MLExperiment> > > m_cvExperiments;
+    std::map<int, boost::shared_ptr<MLExperiment> >               m_experimentsById;
+    std::vector<std::string>                                      m_featureNames;
+    std::map<std::string, int>                                    m_featureIndices;
+    std::set<int>                                                 m_categoricalFeatureIndices;
+    bool                                                          m_initialPredictionsDefined;
+    std::vector<int>                                              m_foldNumbers;
+    double                                                        m_missingValue;
+    bool                                                          m_missingValueDefined;
+    // map<int, vector<shared_ptr<MLExperiment> > > m_experimentsSortedByFeature;
+};
+#endif // MLData_h__