RubyGems - ml4r - Versions diffs - 0.1.2 → 0.1.4 - Mend

ml4r 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
data/ext/ml4r/extconf.rb +16 -3
data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
data/ext/ml4r/utils/StochasticUtils.h +33 -0
data/ext/ml4r/utils/Utils.h +147 -0
data/ext/ml4r/utils/VlcMessage.h +44 -0
data/lib/ml4r/linear_regression.rb +7 -0
metadata +45 -13
data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
data/ext/ml4r/OLSLinearRegression.h +0 -23
data/ext/ml4r/Utils.h +0 -53
data/ext/ml4r/example.h +0 -18
data/ext/ml4r/swig/example.h +0 -13
data/ext/ml4r/swig/example_wrap.c +0 -2093
data/ext/ml4r/utils/RubyUtils.h +0 -174

data/ext/ml4r/MachineLearning/MLData/MLDataFields.h ADDED Viewed

@@ -0,0 +1,25 @@
+#ifndef __MLDataFields_h__
+#define __MLDataFields_h__
+#include <string>
+#include <vector>
+using std::vector;
+using std::string;
+// Plain value holder for the names of the fields that make up a data set.
+// All members are public and default-constructed (empty strings / empty vector).
+// Presumably these are column names consumed by MLDataReader — confirm against the reader's .cpp.
+class MLDataFields
+{
+public:
+    MLDataFields() {}
+    ~MLDataFields() {}
+
+    // Field holding the experiment identifier.
+    string         experimentIdField;
+    // Field holding the per-experiment weight.
+    string         weightsField;
+    // Field holding the actual (observed) Y value.
+    string         actualYField;
+    // Field holding the initial predictions.
+    string         initialPredictionsField;
+    // Fields holding the feature values, one entry per feature.
+    vector<string> featuresFields;
+};
+#endif // MLDataFields_h__

data/ext/ml4r/MachineLearning/MLData/MLDataReader.h ADDED Viewed

@@ -0,0 +1,37 @@
+#ifndef __MLDataReader_h__
+#define __MLDataReader_h__
+#include <boost/shared_ptr.hpp>
+#include <vector>
+#include <string>
+using std::string;
+using std::vector;
+using boost::shared_ptr;
+class MLData;
+class MLDataFields;
+// Declares a reader configured through the public members below and run via execute().
+// Only declarations are visible here; the behaviour lives in the corresponding .cpp.
+// Presumably it loads rows from databaseName/tableName into an MLData object — confirm.
+class MLDataReader
+{
+public:
+    MLDataReader();
+    ~MLDataReader();
+    // Runs the reader against mlData (a non-owning raw pointer as far as this header shows).
+    void execute(MLData* mlData);
+    // Name of the database and table to read from (see getSelectSql below).
+    string databaseName;
+    string tableName;
+    // Shared description of which fields to use (see MLDataFields).
+    shared_ptr<MLDataFields>    fieldsSpec;
+    // Names of the features to be treated as categorical.
+    vector<string>              categoricalFeatures;
+    // Value that denotes "missing"; presumably only meaningful when missingValueDefined is true — confirm.
+    double                      missingValue;
+    bool                        missingValueDefined;
+    // NOTE(review): the fieldsSpec parameter shadows the member of the same name inside this method.
+    void reportOnData(MLData* data, shared_ptr<MLDataFields> fieldsSpec);
+protected:
+    // Returns SQL text; presumably the SELECT statement built from the members above — confirm.
+    string getSelectSql();
+};
+#endif // MLDataReader_h__

data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h ADDED Viewed

@@ -0,0 +1,13 @@
+// #ifndef ZenithMLData_h__
+// #define ZenithMLData_h__
+// #include "stdafx.h"
+// // ruby interface methods
+// void zenith_mldata_Free(void* v);
+// OtInterface::VALUE zenith_mldata_New(int argc, VALUE* argv, VALUE klass);
+// OtInterface::VALUE zenith_mldata_Initialize(VALUE self);
+// OtInterface::VALUE zenith_mldata_createFolds(VALUE self, VALUE numFolds, VALUE randomSeed);
+// OtInterface::VALUE zenith_mldata_getResponse(VALUE self);
+// #endif // ZenithMLData_h__

data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h ADDED Viewed

@@ -0,0 +1,20 @@
+// #ifndef ZenithMLDataReader_h__
+// #define ZenithMLDataReader_h__
+// #include "stdafx.h"
+// // ruby interface methods
+// void zenith_mldatareader_Free(void* v);
+// OtInterface::VALUE zenith_mldatareader_New(int argc, VALUE* argv, VALUE klass);
+// OtInterface::VALUE zenith_mldatareader_Initialize(VALUE self);
+// OtInterface::VALUE zenith_mldatareader_execute(VALUE self);
+// OtInterface::VALUE zenith_mldatareader_setFeaturesToLoad(VALUE self, VALUE featuresValue);
+// OtInterface::VALUE zenith_mldatareader_setCategoricalFeatures(VALUE self, VALUE categoricalFeaturesValue);
+// OtInterface::VALUE zenith_mldatareader_setDatabaseName(VALUE self, VALUE databaseNameValue);
+// OtInterface::VALUE zenith_mldatareader_setTableName(VALUE self, VALUE tableNameValue);
+// OtInterface::VALUE zenith_mldatareader_setActualYField(VALUE self, VALUE yFieldValue);
+// OtInterface::VALUE zenith_mldatareader_setExperimentIdField(VALUE self, VALUE experimentIdFieldValue);
+// OtInterface::VALUE zenith_mldatareader_setWeightsField(VALUE self, VALUE weightsFieldValue);
+// OtInterface::VALUE zenith_mldatareader_setInitialPredictionsField(VALUE self, VALUE initialEstimatesFieldValue);
+// OtInterface::VALUE zenith_mldatareader_setMissingValue(VALUE self, VALUE missingValue);
+// #endif // ZenithMLDataReader_h__

data/ext/ml4r/MachineLearning/MLEstimator.h ADDED Viewed

@@ -0,0 +1,30 @@
+#ifndef MLEstimator_h__
+#define MLEstimator_h__
+class MLOutput;
+class MLData;
+class MLExperiment;
+#include <vector>
+#include <boost/shared_ptr.hpp>
+using boost::shared_ptr;
+using std::vector;
+// Abstract base class for estimators.
+//
+// Holds a non-owning pointer to the data set and a copy of the list of training
+// experiments; concrete estimators implement estimate() to produce an MLOutput.
+class MLEstimator
+{
+public:
+    // data:        raw pointer stored as-is; this class never deletes it.
+    // experiments: training experiments; the vector of shared_ptrs is copied into the object.
+    MLEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments)
+        : m_data(data), m_trainingExperiments(experiments)
+    {}
+    // Virtual so that destroying a derived estimator through an MLEstimator*
+    // is well defined (the class is meant to be used polymorphically).
+    virtual ~MLEstimator() {}
+    // Runs the estimation and returns its output; must be provided by subclasses.
+    virtual shared_ptr<MLOutput> estimate() = 0;
+protected:
+    MLData*                           m_data;
+    vector<shared_ptr<MLExperiment> > m_trainingExperiments;
+};
+#endif // MLEstimator_h__

data/ext/ml4r/MachineLearning/MLEstimatorFactory.h ADDED Viewed

@@ -0,0 +1,25 @@
+#ifndef MLEstimatorFactory_h__
+#define MLEstimatorFactory_h__
+class MLExperiment;
+class MLData;
+class MLParameters;
+class MLEstimator;
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+// Abstract factory interface: subclasses decide which concrete MLEstimator to build.
+class MLEstimatorFactory
+{
+public:
+    MLEstimatorFactory() {}
+    // Virtual so that destroying a concrete factory through an MLEstimatorFactory*
+    // is well defined.
+    virtual ~MLEstimatorFactory() {}
+    // Builds an estimator for the given data set and training experiments.
+    // data is passed as a raw pointer; ownership is not expressed by this interface.
+    virtual shared_ptr<MLEstimator> create(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments) = 0;
+};
+#endif // MLEstimatorFactory_h__

data/ext/ml4r/MachineLearning/MLExperiment.h ADDED Viewed

@@ -0,0 +1,41 @@
+#ifndef __MLExperiment_h__
+#define __MLExperiment_h__
+#include <vector>
+#include <map>
+#include <boost/shared_ptr.hpp>
+using boost::shared_ptr;
+using std::map;
+using std::vector;
+// One experiment (observation): an id, an index, a Y value, a prediction, a weight
+// and a vector of feature values. Only declarations are visible in this header.
+// NOTE(review): members are protected, which suggests subclasses exist; if instances are ever
+// deleted through an MLExperiment*, the destructor should be virtual — confirm.
+class MLExperiment
+{
+public:
+    MLExperiment();
+    // Presumably initialPrediction seeds m_prediction and the other arguments map to the
+    // like-named members — confirm in the .cpp.
+    MLExperiment(int experimentId, int experimentIndex, double y, double initialPrediction,
+        double weight, vector<double> features);
+    // NOTE(review): not marked explicit, so a shared_ptr<MLExperiment> converts implicitly
+    // to an MLExperiment; presumably this copies the pointed-to experiment — confirm.
+    MLExperiment(shared_ptr<MLExperiment> experiment);
+    ~MLExperiment();
+    // Accessors. None are declared const, so they cannot be called on a const MLExperiment.
+    double          getY();
+    int             getExperimentId();
+    int             getExperimentIndex();
+    double          getPrediction();
+    double          getWeight();
+    // Returns a mutable reference to the feature vector (callers can modify it in place).
+    vector<double>& getFeatureValues();
+    double          getFeatureValue(int featureIndex);
+    void            setPrediction(double prediction);
+protected:
+    double m_yValue;
+    int m_experimentId;
+    int m_experimentIndex;
+    double m_prediction;
+    double m_weight;
+    vector<double> m_features;
+};
+#endif // MLExperiment_h__

data/ext/ml4r/MachineLearning/MLOutput.h ADDED Viewed

@@ -0,0 +1,45 @@
+#ifndef MLOutput_h__
+#define MLOutput_h__
+#include <boost/shared_ptr.hpp>
+#include <vector>
+#include <stdexcept>
+using std::runtime_error;
+using std::vector;
+using boost::shared_ptr;
+#include "utils/Utils.h"
+class MLData;
+class MLExperiment;
+class GBMParameters;
+// Abstract base class for the result of an estimation.
+//
+// Remembers the training data (non-owning raw pointer) and the indices of the
+// experiments that were used for training; subclasses implement the prediction.
+class MLOutput
+{
+public:
+    // data:                       stored as-is; this class never deletes it.
+    // trainingExperimentIndicies: copied into the object.
+    MLOutput(MLData* data, vector<int> trainingExperimentIndicies)
+        : m_trainingData(data), m_trainingExperimentIndicies(trainingExperimentIndicies)
+    {}
+    // Virtual so that destroying a concrete output through an MLOutput*
+    // is well defined.
+    virtual ~MLOutput() {}
+    // Returns the prediction for a single experiment; must be provided by subclasses.
+    virtual double  predictForExperiment(shared_ptr<MLExperiment> experiment) = 0;
+    // Returns the arithmetic mean of predictions.
+    // Throws std::runtime_error when predictions is empty (the mean is undefined).
+    virtual double  calculateAveragePredictions(vector<double> predictions)
+    {
+        if (predictions.empty())
+            throw std::runtime_error("[MLOutput::calculateAveragePredictions] Cannot calculate an average on an empty vector!");
+        return Utils::vectorSum<double>(predictions) / predictions.size();
+    }
+    // Returns a copy of the training experiment indices.
+    vector<int> getTrainingExperimentIndicies() { return m_trainingExperimentIndicies; }
+    MLData* getTrainingData() { return m_trainingData; }
+protected:
+    MLData*     m_trainingData;
+    vector<int> m_trainingExperimentIndicies;
+};
+#endif // MLOutput_h__

data/ext/ml4r/MachineLearning/MLParameters.h ADDED Viewed

@@ -0,0 +1,16 @@
+#ifndef MLParameters_h__
+#define MLParameters_h__
+// Minimal parameter holder: a single public verbosity flag that starts out false.
+// Presumably the base for algorithm-specific parameter classes — confirm.
+class MLParameters
+{
+public:
+    MLParameters()
+        : verbose(false)
+    {}
+    ~MLParameters() {}
+
+    bool verbose;
+};
+#endif // MLParameters_h__

data/ext/ml4r/MachineLearning/MLRunner.h ADDED Viewed

@@ -0,0 +1,47 @@
+#ifndef MLRunner_h__
+#define MLRunner_h__
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+class MLData;
+class MLExperiment;
+class MLParameters;
+class MLOutput;
+class MLEstimator;
+// Abstract driver for a machine-learning run.
+//
+// Subclasses supply createEstimator(); the protected virtual steps below can be
+// overridden to customise a run. Their implementations are not in this header —
+// presumably execute() calls them in the order checks, config, input, estimate,
+// output; confirm in the .cpp.
+class MLRunner
+{
+public:
+    MLRunner();
+    // Virtual so that destroying a concrete runner through an MLRunner*
+    // is well defined (the class has pure virtual members and is used as a base).
+    virtual ~MLRunner();
+    void execute();
+    // The data set is held as a raw pointer; this header does not express ownership.
+    void    setData(MLData* data);
+    MLData* getData();
+    // Prediction accessors; each returns its result by value.
+    vector<double> getPredictions(MLData* newData);
+    vector<double> getMeanTrainingPredictions();
+    vector<double> getCrossValidationPredictions();
+    // Builds the estimator used for a given set of training experiments.
+    virtual shared_ptr<MLEstimator> createEstimator(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments) = 0;
+protected:
+    virtual void checks();
+    virtual void config();
+    virtual void input();
+    virtual void estimate();
+    virtual void output();
+    // Overload that works on a list of experiments instead of an MLData object.
+    vector<double> getPredictions(vector<shared_ptr<MLExperiment> > experiments);
+    MLData*                          m_data;
+    vector<shared_ptr<MLOutput> >    m_outputObjects;
+    vector<shared_ptr<MLEstimator> > m_estimators;
+};
+#endif // MLRunner_h__

data/ext/ml4r/MachineLearning/MLUtils.h ADDED Viewed

@@ -0,0 +1,75 @@
+#ifndef __MLUtils_h__
+#define __MLUtils_h__
+#include "MachineLearning/MLExperiment.h"
+#include <vector>
+#include <boost/shared_ptr.hpp>
+#include <utility>
+using std::pair;
+using boost::shared_ptr;
+using std::vector;
+// Free helper functions for the machine-learning code.
+namespace MLUtils
+{
+    // Declared here, defined elsewhere; presumably the mean of getY() over experiments — confirm.
+    double getMeanY(vector<shared_ptr<MLExperiment> > experiments);
+    // Returns bagSize elements drawn at random, with replacement, from the given vector.
+    template <class T>
+    vector<T> bagObjectsWithReplacement(vector<T> experiments, int bagSize);
+    // Splits the given vector at random into (in-bag, out-of-bag), each element landing in exactly one.
+    template <class T>
+    pair<vector<T>,vector<T> > bagObjectsWithoutReplacement(vector<T> experiments, int bagSize);
+};
+// Draws bagSize elements from objects uniformly at random, with replacement
+// (the same element may be picked more than once).
+//
+// objects: candidates to draw from.
+// bagSize: number of draws; zero or negative yields an empty result.
+// Returns the drawn elements in draw order.
+// Throws std::out_of_range (from vector::at) when objects is empty and bagSize > 0.
+// Randomness comes from rand(), so results depend on the caller's srand() seed.
+template <class T>
+vector<T> MLUtils::bagObjectsWithReplacement(vector<T> objects, int bagSize)
+{
+    vector<T> inBagObjects;
+    // A negative size would be converted to a huge unsigned value by reserve().
+    if (bagSize <= 0)
+        return inBagObjects;
+    inBagObjects.reserve(bagSize);
+    for (int i = 0; i < bagSize; ++i)
+    {
+        // unit_rand lies in [0, 1), so index lies in [0, objects.size()).
+        double unit_rand = rand() * 1.0 / (RAND_MAX + 1.0);
+        long index = static_cast<long>(unit_rand * objects.size());
+        inBagObjects.push_back(objects.at(index));
+    }
+    return inBagObjects;
+}
+// Randomly splits objects into an in-bag and an out-of-bag group without replacement:
+// every element ends up in exactly one group and the relative order is preserved.
+//
+// Each element is bagged with probability (still needed) / (still remaining), so
+// exactly min(max(bagSize, 0), objects.size()) elements are bagged.
+//
+// objects: elements to split.
+// bagSize: requested in-bag size; values below 0 or above objects.size() are clamped.
+// Returns pair(in-bag elements, out-of-bag elements).
+// Randomness comes from rand(), so results depend on the caller's srand() seed.
+template <class T>
+pair<vector<T>,vector<T> >
+    MLUtils::bagObjectsWithoutReplacement(vector<T> objects, int bagSize)
+{
+    vector<T> inBagObjects;
+    vector<T> outOfBagObjects;
+    const long numObjects = static_cast<long>(objects.size());
+    // Clamp so an out-of-range bagSize cannot become a negative count, which
+    // reserve() would see as an enormous unsigned request and reject.
+    const long numToBag = bagSize < 0 ? 0 : (bagSize > numObjects ? numObjects : bagSize);
+    long numBagged = 0;
+    inBagObjects.reserve(numToBag);
+    outOfBagObjects.reserve(numObjects - numToBag);
+    for (long i = 0; i < numObjects; ++i)
+    {
+        double unit_rand = rand() * 1.0 / (RAND_MAX + 1.0);
+        const T& object = objects[i];
+        // Bag this element with probability (numToBag - numBagged) / (numObjects - i).
+        if (unit_rand * (numObjects - i) < numToBag - numBagged)
+        {
+            inBagObjects.push_back(object);
+            ++numBagged;
+        }
+        else
+            outOfBagObjects.push_back(object);
+    }
+    return std::make_pair(inBagObjects, outOfBagObjects);
+}
+#endif // MLUtils_h__

data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h ADDED Viewed

@@ -0,0 +1,47 @@
+#ifndef RandomForestEstimator_h__
+#define RandomForestEstimator_h__
+#include "MachineLearning/RandomForest/RandomForestParameters.h"
+#include "MachineLearning/MLEstimator.h"
+#include <boost/shared_ptr.hpp>
+#include <vector>
+using std::vector;
+using boost::shared_ptr;
+class RandomForestOutput;
+class DecisionTreeExperiment;
+class MLData;
+class DecisionTreeNode;
+// MLEstimator implementation for random forests. Only declarations are visible here;
+// the behaviour described below is inferred from names and must be confirmed in the .cpp.
+class RandomForestEstimator : public MLEstimator
+{
+public:
+	RandomForestEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<RandomForestParameters> parameters);
+	~RandomForestEstimator();
+    // Implements the pure virtual MLEstimator::estimate().
+    shared_ptr<MLOutput> estimate();
+    // Presumably grows numTrees additional trees on top of the existing output — confirm.
+    shared_ptr<MLOutput> estimateMore(int numTrees);
+protected:
+    void    updateZ();
+    void    performIteration();
+    void    initializeEstimator();
+    void    constructFeatureIndices();
+    void    constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
+    // NOTE(review): this redeclares m_data and therefore hides the protected MLEstimator::m_data;
+    // code in this class sees this copy, code in the base sees its own. Verify both get set.
+	MLData*                                     m_data;
+    vector<shared_ptr<DecisionTreeExperiment> >  m_decisionTreeExperiments;
+    shared_ptr<RandomForestParameters>          m_parameters;
+    shared_ptr<RandomForestOutput>              m_output;
+    shared_ptr<DecisionTreeNode>                m_decisionTreeHead;
+    bool                                        m_missingValueDefined;
+    double                                      m_missingValue;
+    vector<int>                                 m_featureIndices;
+private:
+};
+#endif // RandomForestEstimator_h__

data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h ADDED Viewed

@@ -0,0 +1,33 @@
+#ifndef RandomForestOutput_h__
+#define RandomForestOutput_h__
+#include "MachineLearning/MLOutput.h"
+class MLData;
+class RandomForestParameters;
+class MLExperiment;
+class DecisionTreeNode;
+// MLOutput implementation for random forests: stores the parameters and the head
+// (root) nodes added via addHeadDecisionTreeNode(). Only declarations are visible here.
+class RandomForestOutput : public MLOutput
+{
+public:
+    RandomForestOutput( MLData* trainingData,
+                        vector<int> trainingExperimentIndicies,
+                        shared_ptr<RandomForestParameters> parameters);
+	~RandomForestOutput();
+    // NOTE(review): MLOutput already defines a non-virtual getTrainingData(); this declaration
+    // hides it rather than overriding it.
+    MLData*                            getTrainingData();
+    shared_ptr<RandomForestParameters> getParameters();
+    // Implements the pure virtual MLOutput::predictForExperiment().
+    double predictForExperiment(shared_ptr<MLExperiment> experiment);
+    // Registers the head node of one decision tree (presumably appended to m_headNodes — confirm).
+    void addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node);
+    // stats
+    int  getNumTrees();
+protected:
+    shared_ptr<RandomForestParameters>     m_parameters;
+    vector<shared_ptr<DecisionTreeNode> >  m_headNodes;
+private:
+};
+#endif // RandomForestOutput_h__