RubyGems - ml4r - Versions diffs - 0.1.4 → 0.1.5 - Mend

ml4r 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
data/ext/ml4r/ml4r.cpp +34 -0
data/ext/ml4r/ml4r_wrap.cpp +15727 -0
data/ext/ml4r/utils/MathUtils.cpp +204 -0
data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
data/ext/ml4r/utils/Utils.cpp +14 -0
data/ext/ml4r/utils/VlcMessage.cpp +3 -0
metadata +33 -1

data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp ADDED Viewed

@@ -0,0 +1,117 @@
+#include "MachineLearning/GBM/GBMRunner.h"
+#include "MachineLearning/GBM/GBMEstimator.h"
+#include "MachineLearning/GBM/GBMOutput.h"
+#include "MachineLearning/GBM/BernoulliCalculator.h"
+#include "MachineLearning/GBM/GaussianCalculator.h"
+#include "MachineLearning/MLData/MLData.h"
+#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
+#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
+#include "MachineLearning/DecisionTree/FeatureInteraction.h"
+#include "utils/VlcMessage.h"
+// #ifdef TBB_USE_THREADING_TOOLS
+// #undef TBB_USE_THREADING_TOOLS
+// #endif
+// #define TBB_USE_THREADING_TOOLS 1
+// #include "tbb/task_scheduler_init.h"
+// #include "tbb/parallel_for.h"
+// #include "tbb/blocked_range.h"
+// #include "tbb/explicit_range.h"
+#include <math.h>
+#include <boost/pointer_cast.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/foreach.hpp>
+using boost::make_shared;
+using boost::dynamic_pointer_cast;
+// Constructs a runner whose `parameters` member points at a freshly
+// allocated, value-initialised GBMParameters object.
+GBMRunner::GBMRunner()
+{
+    parameters = boost::make_shared<GBMParameters>();
+}
+// Empty destructor body: no explicit cleanup is performed here.
+GBMRunner::~GBMRunner() {}
+// Validates and finalises the run configuration against the loaded data.
+//
+// - If no features were requested, every feature present in the data is used.
+// - Otherwise each requested feature must exist in the data.
+// - Propagates the data's missing-value marker to DecisionTreeNode when one
+//   is defined.
+//
+// Throws std::runtime_error when a requested feature is absent from the data
+// or when the resulting feature list is empty.
+void GBMRunner::config()
+{
+    vector<string>& availableFeatures = m_data->getFeatures();
+
+    if (!parameters->featuresToRun.empty())
+    {
+        // Explicit feature list: reject anything the data does not provide.
+        BOOST_FOREACH(string requestedFeature, parameters->featuresToRun)
+        {
+            const bool isKnown = Utils::hasElement(availableFeatures, requestedFeature);
+            if (isKnown)
+                continue;
+            throw std::runtime_error("Feature '" + requestedFeature + "' specified as part of parameter 'featuresToRun', but feature not found in data");
+        }
+    }
+    else
+    {
+        // No explicit list: fall back to everything the data offers.
+        parameters->featuresToRun = availableFeatures;
+    }
+
+    // The data itself may have had no features, so re-check after defaulting.
+    if (parameters->featuresToRun.empty())
+    {
+        throw std::runtime_error("There are no features to run!");
+    }
+
+    if (m_data->missingValueDefined())
+    {
+        DecisionTreeNode::setMissingValue(m_data->getMissingValue());
+    }
+}
+// Forwards estimateMore(numTrees) to the estimator of every fold, one fold
+// after another.
+//
+// numTrees: passed through unchanged to GBMEstimator::estimateMore.
+//
+// Throws std::runtime_error if an entry of m_estimators is not a
+// GBMEstimator; std::out_of_range (from vector::at) if there are fewer
+// estimators than folds.
+//
+// The earlier revision computed `grainSize = numFolds / numThreads` with
+// numThreads == numFolds for a TBB parallel_for that is disabled; that was an
+// integer division by zero whenever the data had no folds. Both values were
+// otherwise unused, so they are removed.
+// TODO: reinstate a parallel per-fold loop once a threading backend is chosen.
+void GBMRunner::estimateMore(int numTrees)
+{
+    const int numFolds = m_data->getNumFolds();
+    for (int foldIndex = 0; foldIndex < numFolds; ++foldIndex)
+    {
+        // Resolve and validate the estimator before opening the message scope
+        // so a bad cast does not leave a Begin() without a matching End().
+        shared_ptr<GBMEstimator> estimator = dynamic_pointer_cast<GBMEstimator>(m_estimators.at(foldIndex));
+        if (!estimator)
+            throw std::runtime_error("GBMRunner::estimateMore: estimator is not a GBMEstimator");
+
+        vlcMessage.Begin("Estimating more...");
+        estimator->estimateMore(numTrees);
+        vlcMessage.End();
+    }
+}
+// Calls GBMOutput::capTrees(numTrees) on every output object held by this
+// runner.
+//
+// numTrees: passed through unchanged to each GBMOutput.
+//
+// Throws std::runtime_error if an entry of m_outputObjects is not a
+// GBMOutput. Previously the failed cast was dereferenced as a null pointer.
+void GBMRunner::capTrees( int numTrees )
+{
+    BOOST_FOREACH(shared_ptr<MLOutput>& output, m_outputObjects)
+    {
+        shared_ptr<GBMOutput> gbmOutput = dynamic_pointer_cast<GBMOutput>(output);
+        if (!gbmOutput)
+            throw std::runtime_error("GBMRunner::capTrees: output object is not a GBMOutput");
+        gbmOutput->capTrees(numTrees);
+    }
+}
+// Runs config(), then builds a temporary GBMEstimator over all experiments in
+// the data and returns the result of its findInteractions(howMany) call.
+//
+// howMany: forwarded unchanged to GBMEstimator::findInteractions.
+// Any exception raised by config() propagates to the caller.
+vector<FeatureInteraction> GBMRunner::getFeatureInteractions( int howMany )
+{
+    config();
+
+    GBMEstimator interactionFinder(m_data, m_data->getExperiments(), parameters);
+    vector<FeatureInteraction> interactions = interactionFinder.findInteractions(howMany);
+    return interactions;
+}
+// Factory hook: allocates a GBMEstimator for the given data and training
+// experiments, sharing this runner's `parameters`, and returns it through the
+// MLEstimator base-class pointer type.
+shared_ptr<MLEstimator> GBMRunner::createEstimator(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments)
+{
+    shared_ptr<GBMEstimator> gbmEstimator(new GBMEstimator(data, trainingExperiments, parameters));
+    return shared_ptr<MLEstimator>(gbmEstimator);
+}

data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp ADDED Viewed

@@ -0,0 +1,94 @@
+#include "MachineLearning/GBM/GaussianCalculator.h"
+#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
+#include <boost/foreach.hpp>
+// Default constructor; the body performs no work.
+GaussianCalculator::GaussianCalculator() {}
+// Destructor; the body performs no explicit cleanup.
+GaussianCalculator::~GaussianCalculator() {}
+double GaussianCalculator::calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+{
+    double sumSquaredErrors = 0.0;
+    double sumWeight        = 0.0;
+    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
+    {
+        double error = experiment->getY() - experiment->getPrediction();
+        sumSquaredErrors += experiment->getWeight() * error * error;
+        sumWeight        += experiment->getWeight();
+    }
+    return sumSquaredErrors / sumWeight;
+}
+// Seeds each experiment's F value before boosting starts.
+//
+// experiments:           observations to initialise (modified in place).
+// useInitialPredictions: when false, every experiment's prediction is first
+//                        overwritten with the weighted mean of Y; when true,
+//                        the predictions already stored on the experiments
+//                        are kept.
+//
+// In both cases F is then set to calculateF(prediction).
+//
+// If the total weight is zero the weighted mean is undefined; 0.0 is used
+// instead of letting a 0/0 NaN be written into every prediction and F. This
+// mirrors the zero-weight guard in computeFIncrement.
+void GaussianCalculator::populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions)
+{
+    if (!useInitialPredictions)
+    {
+        // compute weighted mean of Y
+        double sumY = 0.0;
+        double sumW = 0.0;
+        BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
+        {
+            sumY += experiment->getWeight() * experiment->getY();
+            sumW += experiment->getWeight();
+        }
+        double meanY = (sumW == 0) ? 0.0 : sumY / sumW;
+        BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
+        {
+            experiment->setPrediction(meanY);
+        }
+    }
+    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
+    {
+        experiment->setF(calculateF(experiment->getPrediction()));
+    }
+}
+// Stores the current residual (Y minus prediction) as Z on every experiment.
+void GaussianCalculator::updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+{
+    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& observation, experiments)
+    {
+        const double observed  = observation->getY();
+        const double predicted = observation->getPrediction();
+        double residual = observed - predicted;
+        observation->setZ(residual);
+    }
+}
+// Returns the weighted mean of Z over the given experiments:
+//     sum(w_i * z_i) / sum(w_i)
+// or 0.0 when the total weight is zero.
+double GaussianCalculator::computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+{
+    double weightedZTotal = 0.0;
+    double weightTotal    = 0.0;
+
+    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& observation, experiments)
+    {
+        weightedZTotal += observation->getWeight() * observation->getZ();
+        weightTotal    += observation->getWeight();
+    }
+
+    // Guard the division: with no weight there is nothing to increment by.
+    return (weightTotal == 0) ? 0.0 : weightedZTotal / weightTotal;
+}
+// Recomputes each experiment's prediction from its current F value via
+// calculatePrediction().
+void GaussianCalculator::updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+{
+    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& observation, experiments)
+    {
+        const double refreshed = calculatePrediction(observation->getF());
+        observation->setPrediction(refreshed);
+    }
+}
+// Maps an F value to a prediction. For this calculator the mapping is the
+// identity: the returned prediction equals `f`.
+double GaussianCalculator::calculatePrediction(double f)
+{
+    const double prediction = f;
+    return prediction;
+}
+// Maps a prediction to an F value. For this calculator the mapping is the
+// identity: the returned F equals `prediction`.
+double GaussianCalculator::calculateF(double prediction)
+{
+    const double f = prediction;
+    return f;
+}

data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp ADDED Viewed

@@ -0,0 +1,317 @@
+// #include "MachineLearning/GBM/ZenithGBM.h"
+// #include "MachineLearning/GBM/GBMRunner.h"
+// #include "MachineLearning/DecisionTree/FeatureInteraction.h"
+// #include "MachineLearning/DecisionTree/SplitDefinition.h"
+// #include "MachineLearning/MLData/MLData.h"
+// #include "MachineLearning/gbm/GBMParameters.h"
+// #include "stringConversion.h"
+// #include "RubyUtils.h"
+// using namespace RubyUtils;
+// void zenith_gbm_Free(void* v)
+// {
+//     delete (reinterpret_cast<GBMRunner*>(v));
+// }
+// OtInterface::VALUE zenith_gbm_New(int argc, VALUE* argv, VALUE klass)
+// {
+//     VALUE obj = otRuby->DataWrapStruct(klass, 0, zenith_gbm_Free, 0);
+//     otRuby->rb_obj_call_init(obj, argc, argv);
+//     return obj;
+// }
+// OtInterface::VALUE zenith_gbm_Initialize(VALUE self)
+// {
+//     if (otRuby->GetDataPtr(self)) zenith_gbm_Free(otRuby->GetDataPtr(self));
+//     otRuby->SetDataPtr(self, NULL);
+//     GBMRunner* gbm = new GBMRunner();
+//     if (gbm == NULL) otRuby->rb_sys_fail("ZenithGBM class could not be created");
+//     otRuby->SetDataPtr(self, gbm);
+//     return self;
+// }
+// OtInterface::VALUE zenith_gbm_estimate(VALUE self)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     try
+//     {
+//         gbm->execute();
+//     }
+//     catch (std::exception e)
+//     {
+//         vlcMessage.Raise((string("Caught error: ") + e.what()).c_str());
+//     }
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_estimateMore(VALUE self, VALUE numTrees)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     try
+//     {
+//         gbm->estimateMore(RubyUtils::fromValue<int>(numTrees));
+//     }
+//     catch (std::exception e)
+//     {
+//         vlcMessage.Raise((string("Caught error: ") + e.what()).c_str());
+//     }
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setFeaturesToRun(VALUE self, VALUE featuresValue)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->featuresToRun = RubyUtils::fromValue<vector<string> >(featuresValue);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setData(VALUE self, VALUE data)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     MLData* mlData = (MLData*)otRuby->GetDataPtr(data);
+//     gbm->setData(mlData);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setTryMVariables(VALUE self, VALUE mVariablesValue)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->tryMVariables = RubyUtils::fromValue<int>(mVariablesValue);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setKTerminalNodes(VALUE self, VALUE kNodesValue)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->growKDecisionTreeNodes = RubyUtils::fromValue<int>(kNodesValue);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setNumIterations(VALUE self, VALUE numIterationsValue)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->numIterations = RubyUtils::fromValue<int>(numIterationsValue);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setShrinkageFactor(VALUE self, VALUE shrinkageFactorValue)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->shrinkageFactor = RubyUtils::fromValue<double>(shrinkageFactorValue);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setBagFraction(VALUE self, VALUE bagFractionValue)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->bagFraction = RubyUtils::fromValue<double>(bagFractionValue);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setTrainingExperimentIds(VALUE self, VALUE experimentIdsValue)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->trainingExperimentIds = RubyUtils::fromValue<vector<int> >(experimentIdsValue);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_predictions(VALUE self, VALUE newMlData)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     MLData* data = (MLData*)otRuby->GetDataPtr(newMlData);
+//     vector<double> predictions;
+//     try
+//     {
+//         predictions = gbm->getPredictions(data);
+//     }
+//     catch (std::exception e)
+//     {
+//         vlcMessage.Raise((string("Could not get predictions. Error: ") + e.what()).c_str());
+//     }
+//     return RubyUtils::toValue(predictions);
+// }
+// OtInterface::VALUE zenith_gbm_training_predictions(VALUE self)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     vector<double> predictions;
+//     try
+//     {
+//         predictions = gbm->getMeanTrainingPredictions();
+//     }
+//     catch (std::exception e)
+//     {
+//         vlcMessage.Raise((string("Could not get training predictions. Error: ") + e.what()).c_str());
+//     }
+//     return RubyUtils::toValue(predictions);
+// }
+// OtInterface::VALUE zenith_gbm_crossvalidation_predictions(VALUE self)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     vector<double> predictions;
+//     try
+//     {
+//         predictions = gbm->getCrossValidationPredictions();
+//     }
+//     catch (std::exception e)
+//     {
+//         vlcMessage.Raise((string("Could not get cross validation predictions. Error: ") + e.what()).c_str());
+//     }
+//     return RubyUtils::toValue(predictions);
+// }
+// OtInterface::VALUE zenith_gbm_minObservations(VALUE self, VALUE minObservations)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     gbm->parameters->minObservations = RubyUtils::fromValue<int>(minObservations);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setDistribution(VALUE self, VALUE rb_distribution)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     string distribution = stringToLower(RubyUtils::fromValue<string>(rb_distribution));
+//     if (distribution == "bernoulli")
+//         gbm->parameters->distribution = BERNOULLI;
+//     else if (distribution == "gaussian")
+//         gbm->parameters->distribution = GAUSSIAN;
+//     else
+//         throw std::invalid_argument("ZenithGBM::distribution = " + distribution);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_verbose(VALUE self, VALUE rb_verbose)
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     bool verbose = RubyUtils::fromValue<bool>(rb_verbose);
+//     gbm->parameters->verbose = verbose;
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setGreedy( VALUE self, VALUE rb_greedy )
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     bool greedy = RubyUtils::fromValue<bool>(rb_greedy);
+//     gbm->parameters->greedy = greedy;
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setRfToLevel( VALUE self, VALUE rb_rfToLevel )
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     int rfToLevel = RubyUtils::fromValue<int>(rb_rfToLevel);
+//     gbm->parameters->rfToLevel = rfToLevel;
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_capTrees( VALUE self, VALUE rb_cap )
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     int cap = RubyUtils::fromValue<int>(rb_cap);
+//     gbm->capTrees(cap);
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE zenith_gbm_setScale( VALUE self, VALUE rb_scale )
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     double scale = RubyUtils::fromValue<double>(rb_scale);
+//     gbm->parameters->scale = scale;;
+//     return TOtRubyInterface::Qnil;
+// }
+// OtInterface::VALUE splitValueOrCategories(MLData* data, shared_ptr<SplitDefinition> splitDefinition, Partition partition)
+// {
+//     VALUE returnValue;
+//     if (splitDefinition->isCategorical())
+//     {
+//         // categories as an array
+//         if (partition == LHS)
+//             returnValue = RubyUtils::toValue(splitDefinition->getLhsCategories());
+//         else if (partition == RHS)
+//             returnValue = RubyUtils::toValue(splitDefinition->getRhsCategories());
+//         else if (partition == MISSING)
+//         {
+//             set<double> setMissingValue;
+//             setMissingValue.insert(data->getMissingValue());
+//             returnValue = RubyUtils::toValue(setMissingValue);
+//         }
+//         else
+//             throw std::runtime_error("Primary partition should be either LHS, RHS or MISSING!");
+//     }
+//     else
+//     {
+//         // split value as a double
+//         returnValue = RubyUtils::toValue(splitDefinition->getSplitValue());
+//     }
+//     return returnValue;
+// }
+// OtInterface::VALUE zenith_gbm_getFeatureInteractions( VALUE self, VALUE howMany )
+// {
+//     GBMRunner* gbm = (GBMRunner*)otRuby->GetDataPtr(self);
+//     vector<FeatureInteraction> featureInteractions;
+//     try
+//     {
+//         featureInteractions = gbm->getFeatureInteractions(RubyUtils::fromValue<int>(howMany));
+//     }
+//     catch (std::exception e)
+//     {
+//         vlcMessage.Raise((string("Could not get calculate interactions. Error: ") + e.what()).c_str());
+//     }
+//     MLData* data = gbm->getData();
+//     vector<string> featureNames = data->getFeatures();
+//     vector<vector<VALUE> > returnVector;
+//     returnVector.reserve(featureInteractions.size());
+//     BOOST_FOREACH(auto& interaction, featureInteractions)
+//     {
+//         vector<VALUE> v;
+//         v.reserve(6);
+//         // improvement
+//         v.push_back(RubyUtils::toValue(interaction.secondarySplitDefinition->getImprovement()));
+//         // primary feature name
+//         v.push_back(RubyUtils::toValue(featureNames.at(interaction.primarySplitDefinition->getFeatureIndex())));
+//         // either the split value (as double), or the categories
+//         v.push_back(splitValueOrCategories(data, interaction.primarySplitDefinition, interaction.primaryPartition));
+//         // the partition chosen
+//         Partition p = interaction.primaryPartition;
+//         int partition = (p == LHS ? 1 : (p == RHS ? 2 : 3));
+//         v.push_back(RubyUtils::toValue(partition));
+//         // second feature name
+//         v.push_back(RubyUtils::toValue(featureNames.at(interaction.secondarySplitDefinition->getFeatureIndex())));
+//         // secondary split value / left hand side categories
+//         v.push_back(splitValueOrCategories(data, interaction.secondarySplitDefinition, LHS));
+//         // no need for a secondary partition, as all children of the second partition are important
+//         returnVector.push_back(v);
+//     }
+//     return RubyUtils::toValue(returnVector);
+// }