ml4r 0.1.4 → 0.1.5
- data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
- data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
- data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
- data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
- data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
- data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
- data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
- data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
- data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
- data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
- data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
- data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
- data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
- data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
- data/ext/ml4r/ml4r.cpp +34 -0
- data/ext/ml4r/ml4r_wrap.cpp +15727 -0
- data/ext/ml4r/utils/MathUtils.cpp +204 -0
- data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
- data/ext/ml4r/utils/Utils.cpp +14 -0
- data/ext/ml4r/utils/VlcMessage.cpp +3 -0
- metadata +33 -1
data/ext/ml4r/MachineLearning/MLRunner.cpp (new file)
@@ -0,0 +1,183 @@
#include "MachineLearning/MLRunner.h"
#include "MachineLearning/MLData/MLData.h"
#include "MachineLearning/MLOutput.h"
#include "MachineLearning/MLExperiment.h"
#include "MachineLearning/MLParameters.h"
#include "MachineLearning/MLEstimator.h"
#include "MachineLearning/MLEstimatorFactory.h"

#include "utils/VlcMessage.h"

// #ifdef TBB_USE_THREADING_TOOLS
// #undef TBB_USE_THREADING_TOOLS
// #endif
// #define TBB_USE_THREADING_TOOLS 1
// #include "tbb/task_scheduler_init.h"
// #include "tbb/parallel_for.h"
// #include "tbb/blocked_range.h"
// #include "tbb/explicit_range.h"

#include <boost/foreach.hpp>

MLRunner::MLRunner()
    : m_data(0)
{

}

MLRunner::~MLRunner()
{

}

void MLRunner::execute()
{
    checks();
    config();
    input();
    estimate();
    output();
}

void MLRunner::checks()
{
    if (m_data == 0)
        throw std::runtime_error("[MLRunner::checks()] - MLData is not defined - have you provided input data?");
}

void MLRunner::config()
{

}

void MLRunner::input()
{

}

void MLRunner::estimate()
{
    vector<int>& foldNumbers = m_data->getFoldNumbers();
    long numFolds = foldNumbers.size();
    long numThreads = numFolds; // TODO: change this!

    // tbb::task_scheduler_init init(numFolds);
    // static tbb::simple_partitioner sp;

    //int grainSize = numFolds / numThreads;

    m_outputObjects.resize(numFolds);
    m_estimators.resize(numFolds);

    //tbb::parallel_for(explicit_range<size_t>(0, numFolds, grainSize),
    //    [&](const explicit_range<size_t>& r) {
    //    int threadNumber = r.begin() / grainSize;
    //    for(size_t foldIndex=r.begin(); foldIndex!=r.end(); ++foldIndex)
    for (long foldIndex = 0; foldIndex < numFolds; ++foldIndex)
    {
        vlcMessage.Begin("Estimating");
        int foldNumber = foldNumbers.at(foldIndex);

        shared_ptr<MLEstimator> estimator = createEstimator(m_data, m_data->getTrainingExperiments(foldNumber));
        m_estimators.at(foldIndex) = estimator;
        m_outputObjects.at(foldIndex) = estimator->estimate();

        vlcMessage.End();
    }
    //}, sp);
}

void MLRunner::output()
{

}

void MLRunner::setData( MLData* data )
{
    m_data = data;
}

MLData* MLRunner::getData()
{
    return m_data;
}

vector<double> MLRunner::getPredictions( MLData* newData )
{
    if (m_data->initialPredictionsDefined() && !newData->initialPredictionsDefined())
        throw std::runtime_error("Cannot apply model to new data as initial predictions are not defined (but were in initial data).");

    if (m_data->getFeatures() != newData->getFeatures())
        throw std::runtime_error("Features in prediction dataset do not match those in the estimation dataset (order is important)");

    return getPredictions(newData->getExperiments());
}

vector<double> MLRunner::getPredictions( vector<shared_ptr<MLExperiment> > experiments )
{
    // we can get a prediction from each of our outputs, which then need to be averaged.
    vector<double> predictions;
    predictions.reserve(experiments.size());

    BOOST_FOREACH(shared_ptr<MLExperiment> experiment, experiments)
    {
        vector<double> experimentPredictions;
        experimentPredictions.reserve(m_outputObjects.size());

        BOOST_FOREACH(shared_ptr<MLOutput> outputObject, m_outputObjects)
        {
            experimentPredictions.push_back(outputObject->predictForExperiment(experiment));
        }
        predictions.push_back(m_outputObjects.front()->calculateAveragePredictions(experimentPredictions));
    }
    return predictions;
}

vector<double> MLRunner::getMeanTrainingPredictions()
{
    long experimentCount = m_data->getExperiments().size();
    vector<double> meanPredictions;
    meanPredictions.reserve(experimentCount);

    vector<vector<double> > predictionsForEachFold(experimentCount);

    // each experiment is in the training set of every fold but its own
    BOOST_FOREACH(vector<double>& experimentPredictions, predictionsForEachFold)
        experimentPredictions.reserve(m_outputObjects.size() - 1);

    BOOST_FOREACH(shared_ptr<MLOutput> outputObject, m_outputObjects)
    {
        BOOST_FOREACH(int experimentIndex, outputObject->getTrainingExperimentIndicies())
        {
            double prediction = outputObject->predictForExperiment(m_data->getExperiment(experimentIndex));
            predictionsForEachFold.at(experimentIndex).push_back(prediction);
        }
    }

    // take the mean of our predictions
    BOOST_FOREACH(vector<double>& experimentPredictions, predictionsForEachFold)
        meanPredictions.push_back(m_outputObjects.front()->calculateAveragePredictions(experimentPredictions));

    return meanPredictions;
}

vector<double> MLRunner::getCrossValidationPredictions()
{
    int experimentCount = (int) m_data->getExperiments().size();
    vector<double> predictions(experimentCount);

    int foldIndex = -1;
    vector<int> foldNumbers = m_data->getFoldNumbers();

    BOOST_FOREACH(shared_ptr<MLOutput> outputObject, m_outputObjects)
    {
        ++foldIndex;
        int foldNumber = foldNumbers.at(foldIndex);

        BOOST_FOREACH(shared_ptr<MLExperiment> experiment, m_data->getCrossValidationExperiments(foldNumber))
        {
            double prediction = outputObject->predictForExperiment(experiment);
            predictions.at(experiment->getExperimentIndex()) = prediction;
        }
    }
    return predictions;
}
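MLRunner is a template-method base class: execute() runs the fixed checks → config → input → estimate → output pipeline, and estimate() builds one estimator per cross-validation fold through the createEstimator() hook that concrete runners override (as RandomForestRunner does in the last hunk below). A minimal sketch of how a hypothetical subclass plugs in, assuming the MLRunner.h declarations match the usage in this file (MyEstimator is invented for illustration):

// Hypothetical illustration only - not part of this release.
class MyRunner : public MLRunner
{
protected:
    // called once per fold with that fold's training split
    virtual shared_ptr<MLEstimator> createEstimator(MLData* data,
        vector<shared_ptr<MLExperiment> > trainingExperiments)
    {
        return shared_ptr<MLEstimator>(new MyEstimator(data, trainingExperiments));
    }
};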
data/ext/ml4r/MachineLearning/MLUtils.cpp (new file)
@@ -0,0 +1,15 @@
#include "MachineLearning/MLUtils.h"

#include <boost/foreach.hpp>

double MLUtils::getMeanY(vector<shared_ptr<MLExperiment> > experiments)
{
    double sumY = 0.0, sumWeight = 0.0;
    BOOST_FOREACH(shared_ptr<MLExperiment>& e, experiments)
    {
        sumY += e->getY() * e->getWeight();
        sumWeight += e->getWeight();
    }

    return sumY / sumWeight;
}
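getMeanY is the weight-weighted mean of the targets, meanY = sum_i(w_i * y_i) / sum_i(w_i). For example, targets {2.0, 4.0} with weights {1, 3} give (1*2.0 + 3*4.0) / (1 + 3) = 3.5. Note the final division assumes the total weight is non-zero.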
data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp (new file)
@@ -0,0 +1,172 @@
#include "MachineLearning/RandomForest/RandomForestEstimator.h"
#include "MachineLearning/MLData/MLData.h"
#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
#include "MachineLearning/DecisionTree/NodeSplitterCategorical.h"
#include "MachineLearning/DecisionTree/NodeSplitterContinuous.h"
#include "MachineLearning/DecisionTree/SplitDefinition.h"
#include "MachineLearning/RandomForest/RandomForestOutput.h"
#include "MachineLearning/MLUtils.h"

#include "utils/VlcMessage.h"

#include <boost/make_shared.hpp>
using boost::make_shared;

RandomForestEstimator::RandomForestEstimator(MLData* data,
                                             vector<shared_ptr<MLExperiment> > experiments,
                                             shared_ptr<RandomForestParameters> parameters)
    : MLEstimator(data, experiments), m_parameters(parameters)
{
    m_decisionTreeExperiments.reserve(experiments.size());
    BOOST_FOREACH(shared_ptr<MLExperiment>& experiment, experiments)
        m_decisionTreeExperiments.push_back(make_shared<DecisionTreeExperiment>(experiment));

    vector<int> experimentIndicies;
    experimentIndicies.reserve(experiments.size());
    BOOST_FOREACH(shared_ptr<MLExperiment>& experiment, experiments)
        experimentIndicies.push_back(experiment->getExperimentIndex());

    m_output = shared_ptr<RandomForestOutput>(new RandomForestOutput(m_data, experimentIndicies, m_parameters));
}

RandomForestEstimator::~RandomForestEstimator()
{

}

shared_ptr<MLOutput> RandomForestEstimator::estimate()
{
    initializeEstimator();
    updateZ();

    for (int iteration = 0; iteration < m_parameters->numIterations; ++iteration)
    {
        if (m_parameters->verbose)
            vlcMessage.Begin((string("Iteration ") + boost::lexical_cast<string>(iteration + 1)).c_str());

        performIteration();

        if (m_parameters->verbose)
            vlcMessage.End();
    }
    return shared_ptr<MLOutput>(m_output);
}

shared_ptr<MLOutput> RandomForestEstimator::estimateMore(int numTrees)
{
    initializeEstimator();
    updateZ();
    int numberOfExistingTrees = m_output->getNumTrees();

    for (int iteration = 0; iteration < numTrees; ++iteration)
    {
        if (m_parameters->verbose)
            vlcMessage.Begin((string("Iteration ") + boost::lexical_cast<string>(numberOfExistingTrees + iteration + 1)).c_str());

        performIteration();

        if (m_parameters->verbose)
            vlcMessage.End();
    }
    return shared_ptr<MLOutput>(m_output);
}

void RandomForestEstimator::updateZ()
{
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment> e, m_decisionTreeExperiments)
        e->setZ(e->getY());
}

void RandomForestEstimator::performIteration()
{
    vector<shared_ptr<DecisionTreeExperiment> > experiments;
    size_t bagSize = m_decisionTreeExperiments.size() * m_parameters->bagFraction;

    if (m_parameters->withReplacement)
        experiments = MLUtils::bagObjectsWithReplacement<shared_ptr<DecisionTreeExperiment> >(m_decisionTreeExperiments, (int) bagSize);
    else
    {
        pair<vector<shared_ptr<DecisionTreeExperiment> >,vector<shared_ptr<DecisionTreeExperiment> > > inAndOutOfBag =
            MLUtils::bagObjectsWithoutReplacement<shared_ptr<DecisionTreeExperiment> >(m_decisionTreeExperiments, (int) std::min(m_decisionTreeExperiments.size(), bagSize));
        experiments = inAndOutOfBag.first;
    }

    if (m_parameters->verbose)
        vlcMessage.Begin("Constructing decision tree");

    constructDecisionTree(experiments);

    m_output->addHeadDecisionTreeNode(m_decisionTreeHead);
    m_decisionTreeHead->clearExperimentsWithinTree();

    if (m_parameters->verbose)
        vlcMessage.End();
}

void RandomForestEstimator::constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    vector<shared_ptr<DecisionTreeNode> > currentGeneration;
    vector<shared_ptr<DecisionTreeNode> > nextGeneration;

    // create a head DecisionTreeNode
    double sumZ = 0.0, sumW = 0.0;
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
    {
        double w = e->getWeight();
        sumW += w;
        sumZ += w * e->getZ();
    }

    m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));
    currentGeneration.push_back(m_decisionTreeHead);

    NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);

    while (!currentGeneration.empty())
    {
        BOOST_FOREACH(shared_ptr<DecisionTreeNode> nodeToSplit, currentGeneration)
        {
            if (nodeToSplit->getSumW() == 0)
                continue;

            // choose M variables to test splitting on
            // find terminal node with best improvement for any of those variables
            pair<vector<int>,vector<int> > inAndOut = MLUtils::bagObjectsWithoutReplacement<int>(m_featureIndices, std::min((int)m_featureIndices.size(), m_parameters->tryMVariables));
            vector<int> featuresToConsider = inAndOut.first;

            double bestImprovement = 0.0;
            shared_ptr<SplitDefinition> bestSplit;

            vector<shared_ptr<DecisionTreeNode> > children = splitter.splitNode(nodeToSplit, featuresToConsider);
            BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
            {
                nextGeneration.push_back(child);
            }
        }
        currentGeneration = nextGeneration;
        nextGeneration.clear();
    }
}

void RandomForestEstimator::initializeEstimator()
{
    m_missingValueDefined = m_data->missingValueDefined();
    if (m_missingValueDefined)
        m_missingValue = m_data->getMissingValue();

    constructFeatureIndices();
    // sortTrainingExperiments();
}

void RandomForestEstimator::constructFeatureIndices()
{
    BOOST_FOREACH(string feature, m_parameters->featuresToRun)
    {
        // note that in a given run, we may not "run" with all loaded variables.
        m_featureIndices.push_back(m_data->getFeatureIndex(feature));
    }
}
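Each call to performIteration() bags bagFraction of the experiments (with or without replacement), grows one tree breadth-first with tryMVariables randomly drawn candidate features per node, and appends the tree's head node to the RandomForestOutput. A hedged configuration sketch, assuming RandomForestParameters is a plain struct exposing exactly the fields dereferenced above; the values shown are illustrative, not defaults from this release:

// Illustrative values only.
shared_ptr<RandomForestParameters> params = make_shared<RandomForestParameters>();
params->numIterations   = 500;    // trees grown by estimate()
params->bagFraction     = 0.632;  // share of experiments bagged per tree
params->withReplacement = true;   // bootstrap rather than subsampling
params->tryMVariables   = 10;     // candidate features per split
params->minObservations = 5;      // forwarded to NodeSplitter
params->scale           = 1.0;    // forwarded to NodeSplitter
params->verbose         = false;
// featuresToRun may be left empty; RandomForestRunner::config() then
// fills it with every feature loaded into the MLData.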
data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp (new file)
@@ -0,0 +1,66 @@
#include "MachineLearning/RandomForest/RandomForestOutput.h"
#include "MachineLearning/RandomForest/RandomForestParameters.h"
#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"

#include "utils/VlcMessage.h"

#include <boost/make_shared.hpp>
using boost::make_shared;

RandomForestOutput::RandomForestOutput( MLData* trainingData, vector<int> trainingExperimentIndicies, shared_ptr<RandomForestParameters> parameters )
    : MLOutput(trainingData, trainingExperimentIndicies), m_parameters(parameters)
{

}

RandomForestOutput::~RandomForestOutput()
{

}

shared_ptr<RandomForestParameters> RandomForestOutput::getParameters()
{
    return m_parameters;
}

double RandomForestOutput::predictForExperiment( shared_ptr<MLExperiment> experiment )
{
    shared_ptr<DecisionTreeExperiment> dtExperiment = make_shared<DecisionTreeExperiment>(experiment);

    double sumPrediction = 0.0;
    int count = 0;
    BOOST_FOREACH(shared_ptr<DecisionTreeNode>& head, m_headNodes)
    {
        shared_ptr<DecisionTreeNode> node = head->getTerminalNodeForExperiment(dtExperiment);
        if (node.get() == 0)
            node = head;

        if (node->getSumW() == 0)
        {
            vlcMessage.Write("Zero weight!! WTF!!");
            vlcMessage.Write("SumZ: " + boost::lexical_cast<string>(node->getSumZ()));
            vlcMessage.Write("exp.size() " + boost::lexical_cast<string>(node->getExperiments().size()));
            vlcMessage.Write("Node is head: " + boost::lexical_cast<string>(node == head));
        }

        if (node->isTerminalNode())
        {
            sumPrediction += node->getSumZ() / node->getSumW();
            count++;
        }
    }
    return sumPrediction / count;
}

void RandomForestOutput::addHeadDecisionTreeNode( shared_ptr<DecisionTreeNode> node )
{
    m_headNodes.push_back(node);
}

int RandomForestOutput::getNumTrees()
{
    return (int) m_headNodes.size();
}
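predictForExperiment() averages the per-tree terminal-node means: with T trees contributing, prediction = (1/T) * sum_t (sumZ_t / sumW_t), where sumZ_t and sumW_t are the weighted response total and weight total of the terminal node the experiment lands in for tree t. Note that if no tree contributes a terminal node, count stays zero and the final division is undefined, so predictions should only be requested from a trained forest.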
data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp (new file)
@@ -0,0 +1,84 @@
#include "MachineLearning/RandomForest/RandomForestRunner.h"
#include "MachineLearning/RandomForest/RandomForestOutput.h"
#include "MachineLearning/RandomForest/RandomForestEstimator.h"
#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
#include "MachineLearning/MLData/MLData.h"

#include "utils/VlcMessage.h"

// #ifdef TBB_USE_THREADING_TOOLS
// #undef TBB_USE_THREADING_TOOLS
// #endif
// #define TBB_USE_THREADING_TOOLS 1
// #include "tbb/task_scheduler_init.h"
// #include "tbb/parallel_for.h"
// #include "tbb/blocked_range.h"
// #include "tbb/explicit_range.h"

#include <boost/pointer_cast.hpp>
using boost::dynamic_pointer_cast;
#include <math.h>

RandomForestRunner::RandomForestRunner()
{

}

RandomForestRunner::~RandomForestRunner()
{

}

void RandomForestRunner::estimateMore(int numTrees)
{
    int numFolds = m_data->getNumFolds();
    int numThreads = m_data->getNumFolds();

    // tbb::task_scheduler_init init(numFolds);
    // static tbb::simple_partitioner sp;

    int grainSize = numFolds / numThreads;

    // tbb::parallel_for(explicit_range<size_t>(0, numFolds, grainSize),
    //     [&](const explicit_range<size_t>& r) {
    //     int threadNumber = r.begin() / grainSize;
    //     for(size_t foldIndex=r.begin(); foldIndex!=r.end(); ++foldIndex)
    for (int foldIndex = 0; foldIndex < numFolds; ++foldIndex)
    {
        vlcMessage.Begin("Estimating more...");

        shared_ptr<RandomForestEstimator> estimator = dynamic_pointer_cast<RandomForestEstimator>(m_estimators.at(foldIndex));
        estimator->estimateMore(numTrees);

        vlcMessage.End();
    }
    // }, sp);
}

void RandomForestRunner::config()
{
    vector<string>& dataFeatures = m_data->getFeatures();

    if (parameters->featuresToRun.empty())
        parameters->featuresToRun = dataFeatures;
    else
    {
        BOOST_FOREACH(string& feature, parameters->featuresToRun)
        {
            if (Utils::vectorIndex(dataFeatures, feature) == -1)
                throw std::runtime_error("Feature '" + feature + "' specified as part of parameter 'featuresToRun', but feature not found in data");
        }
    }
    if (parameters->featuresToRun.empty())
        throw std::runtime_error("There are no features to run!");

    if (m_data->missingValueDefined())
        DecisionTreeNode::setMissingValue(m_data->getMissingValue());
}

shared_ptr<MLEstimator> RandomForestRunner::createEstimator( MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments )
{
    return shared_ptr<MLEstimator>(shared_ptr<RandomForestEstimator>(new RandomForestEstimator(data, trainingExperiments, parameters)));
}
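Taken together, a run wires up as follows. This sketch uses only the interfaces visible in this diff; how the MLData is populated and how RandomForestRunner receives its parameters member are not shown in these hunks, so those steps are assumptions:

// Hypothetical end-to-end usage - illustration only.
MLData data;                  // assume experiments, folds and features have
                              // been loaded (loading API not shown here)
RandomForestRunner runner;
runner.setData(&data);        // MLRunner::setData
runner.execute();             // checks -> config -> input -> estimate -> output
vector<double> cvPredictions = runner.getCrossValidationPredictions();
runner.estimateMore(100);     // grow 100 more trees in every fold's forest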