ml4r 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
- data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
- data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
- data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
- data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
- data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
- data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
- data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
- data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
- data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
- data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
- data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
- data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
- data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
- data/ext/ml4r/ml4r.cpp +34 -0
- data/ext/ml4r/ml4r_wrap.cpp +15727 -0
- data/ext/ml4r/utils/MathUtils.cpp +204 -0
- data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
- data/ext/ml4r/utils/Utils.cpp +14 -0
- data/ext/ml4r/utils/VlcMessage.cpp +3 -0
- metadata +33 -1
@@ -0,0 +1,601 @@
|
|
1
|
+
#include "MachineLearning/GBM/GBMEstimator.h"
|
2
|
+
#include "MachineLearning/GBM/GBMParameters.h"
|
3
|
+
#include "MachineLearning/GBM/GBMOutput.h"
|
4
|
+
#include "MachineLearning/GBM/BernoulliCalculator.h"
|
5
|
+
#include "MachineLearning/GBM/GaussianCalculator.h"
|
6
|
+
#include "MachineLearning/MLData/MLData.h"
|
7
|
+
#include "MachineLearning/MLUtils.h"
|
8
|
+
#include "MachineLearning/DecisionTree/SplitDefinition.h"
|
9
|
+
#include "MachineLearning/DecisionTree/NodeSplitterCategorical.h"
|
10
|
+
#include "MachineLearning/DecisionTree/NodeSplitterContinuous.h"
|
11
|
+
#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
|
12
|
+
#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
|
13
|
+
#include "MachineLearning/DecisionTree/FeatureInteraction.h"
|
14
|
+
|
15
|
+
#include <algorithm>
|
16
|
+
#include <boost/foreach.hpp>
|
17
|
+
#include <boost/make_shared.hpp>
|
18
|
+
#include <boost/lexical_cast.hpp>
|
19
|
+
using boost::lexical_cast;
|
20
|
+
using boost::make_shared;
|
21
|
+
using std::make_pair;
|
22
|
+
|
23
|
+
#include "utils/VlcMessage.h"
|
24
|
+
|
25
|
+
// Build a GBM estimator: wrap each generic experiment in a
// DecisionTreeExperiment, record the experiment indices for the output
// object, and finish common setup in initializeEstimator().
GBMEstimator::GBMEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<GBMParameters> parameters)
    : MLEstimator(data, experiments), m_parameters(parameters)
{
    vector<int> experimentIndices;
    experimentIndices.reserve(experiments.size());
    m_decisionTreeExperiments.reserve(experiments.size());

    // One pass populates both the wrapped experiments and their indices.
    BOOST_FOREACH(shared_ptr<MLExperiment>& experiment, experiments)
    {
        m_decisionTreeExperiments.push_back(make_shared<DecisionTreeExperiment>(experiment));
        experimentIndices.push_back(experiment->getExperimentIndex());
    }

    m_output = shared_ptr<GBMOutput>(new GBMOutput(m_data, experimentIndices, m_parameters));

    initializeEstimator();
}
|
41
|
+
|
42
|
+
// Nothing to release explicitly: members are containers and shared_ptrs.
GBMEstimator::~GBMEstimator()
{
}
|
43
|
+
|
44
|
+
// Run the full boosting loop: initialise F/predictions/Z, then grow one
// tree per configured iteration. Returns the accumulated GBMOutput.
shared_ptr<MLOutput> GBMEstimator::estimate()
{
    initialiseGBMExperimentData();

    const int totalIterations = m_parameters->numIterations;
    for (int iteration = 0; iteration < totalIterations; ++iteration)
    {
        if (m_parameters->verbose)
            vlcMessage.Begin((string("Iteration ") + lexical_cast<string>(iteration + 1)).c_str());

        performIteration();

        if (m_parameters->verbose)
            vlcMessage.End();
    }
    return shared_ptr<MLOutput>(m_output);
}
|
60
|
+
|
61
|
+
|
62
|
+
void GBMEstimator::estimateMore(int numTrees)
|
63
|
+
{
|
64
|
+
int numberOfExistingTrees = m_output->getNumTrees();
|
65
|
+
|
66
|
+
for (int iteration = 0; iteration < numTrees; ++iteration)
|
67
|
+
{
|
68
|
+
if (m_parameters->verbose)
|
69
|
+
vlcMessage.Begin(string("Iteration ") + lexical_cast<string>(numberOfExistingTrees + iteration + 1));
|
70
|
+
|
71
|
+
performIteration();
|
72
|
+
|
73
|
+
if (m_parameters->verbose)
|
74
|
+
vlcMessage.End();
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
|
79
|
+
void GBMEstimator::performIteration()
|
80
|
+
{
|
81
|
+
// update Z based on latest F
|
82
|
+
vector<shared_ptr<DecisionTreeExperiment> >& experiments = m_decisionTreeExperiments;
|
83
|
+
|
84
|
+
long bagSize = m_parameters->bagFraction * m_decisionTreeExperiments.size();
|
85
|
+
|
86
|
+
pair<vector<shared_ptr<DecisionTreeExperiment> >,vector<shared_ptr<DecisionTreeExperiment> > > inAndOutOfBagExperiments =
|
87
|
+
MLUtils::bagObjectsWithoutReplacement<shared_ptr<DecisionTreeExperiment> >(m_decisionTreeExperiments, (int) bagSize);
|
88
|
+
|
89
|
+
if (m_parameters->verbose)
|
90
|
+
vlcMessage.Begin("Constructing decision tree");
|
91
|
+
|
92
|
+
if (m_parameters->greedy)
|
93
|
+
constructDecisionTree(inAndOutOfBagExperiments.first);
|
94
|
+
else
|
95
|
+
constructGenerousDecisionTree(inAndOutOfBagExperiments.first, m_parameters->rfToLevel);
|
96
|
+
|
97
|
+
m_output->addHeadDecisionTreeNode(m_decisionTreeHead);
|
98
|
+
|
99
|
+
if (m_parameters->verbose)
|
100
|
+
vlcMessage.End();
|
101
|
+
|
102
|
+
// update F
|
103
|
+
calculateFIncrementPerDecisionTreeNode();
|
104
|
+
m_output->addFIncrements(m_FIncrements);
|
105
|
+
|
106
|
+
// applyFIncrementToInBagExperiments();
|
107
|
+
applyFIncrementToExperiments(experiments);
|
108
|
+
|
109
|
+
// update predictions and Z
|
110
|
+
updatePredictions(experiments);
|
111
|
+
updateZ(experiments);
|
112
|
+
reportDeviance(experiments);
|
113
|
+
deleteRedundantData();
|
114
|
+
}
|
115
|
+
|
116
|
+
void GBMEstimator::constructFeatureIndices()
|
117
|
+
{
|
118
|
+
BOOST_FOREACH(string& feature, m_parameters->featuresToRun)
|
119
|
+
{
|
120
|
+
// note that in a given run, we may not "run" with all loaded variables.
|
121
|
+
m_featureIndices.push_back(m_data->getFeatureIndex(feature));
|
122
|
+
}
|
123
|
+
}
|
124
|
+
|
125
|
+
void GBMEstimator::populateInitialF()
|
126
|
+
{
|
127
|
+
m_gbmCalculator->populateInitialF(m_decisionTreeExperiments, m_data->initialPredictionsDefined());
|
128
|
+
|
129
|
+
if (!m_data->initialPredictionsDefined())
|
130
|
+
m_output->setMeanY(m_decisionTreeExperiments.front()->getPrediction());
|
131
|
+
}
|
132
|
+
|
133
|
+
// Delegate to the distribution calculator: refresh the working response Z
// for the given experiments from their current predictions.
void GBMEstimator::updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    m_gbmCalculator->updateZ(experiments);
}
|
137
|
+
|
138
|
+
// Delegate to the distribution calculator: convert each experiment's F
// into a prediction.
void GBMEstimator::updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    m_gbmCalculator->updatePredictions(experiments);
}
|
143
|
+
|
144
|
+
// Greedily grow one regression tree over the bagged experiments. At each
// of up to growKDecisionTreeNodes steps, every currently splittable node
// is scored against a random feature subset and the single best split (by
// improvement) anywhere in the tree is applied.
void GBMEstimator::constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    // Weighted totals of the working response, for the root node.
    double sumZ = 0.0, sumW = 0.0;
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        const double weight = experiment->getWeight();
        sumW += weight;
        sumZ += weight * experiment->getZ();
    }
    m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));

    m_terminalNodes.clear();
    m_terminalNodes.insert(m_decisionTreeHead);
    set<shared_ptr<DecisionTreeNode> > nodesToSplit;
    nodesToSplit.insert(m_decisionTreeHead);

    // Cache of candidate splits keyed by (node, feature) so each pair is
    // only evaluated once across all growth iterations.
    map<pair<shared_ptr<DecisionTreeNode>, int>, shared_ptr<SplitDefinition> > potentialSplitDefinitions;
    set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();

    NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);

    for (int k = 0; k < m_parameters->growKDecisionTreeNodes; ++k)
    {
        // Score a fresh random feature subset against every splittable
        // node and remember the best improvement seen anywhere.
        vector<int> featuresToConsider = getRandomFeatureList();
        shared_ptr<SplitDefinition> bestSplit;
        double bestImprovement = 0.0;

        BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, nodesToSplit)
        {
            if (node->getSumW() == 0)
                continue;   // node holds no weight: nothing to split

            BOOST_FOREACH(int featureIndex, featuresToConsider)
            {
                pair<shared_ptr<DecisionTreeNode>, int> key = make_pair(node, featureIndex);

                if (potentialSplitDefinitions.find(key) == potentialSplitDefinitions.end())
                {
                    if (Utils::hasElement(categoricalFeatures, featureIndex))
                        potentialSplitDefinitions[key] = splitter.createCategoricalSplitDefinition(node, featureIndex);
                    else
                        potentialSplitDefinitions[key] = splitter.createContinuousSplitDefinition(node, featureIndex);
                }

                shared_ptr<SplitDefinition> splitDefinition = potentialSplitDefinitions[key];
                if (!splitDefinition.get())
                    continue;   // splitter found no valid split for this pair

                if (splitDefinition->getImprovement() > bestImprovement)
                {
                    bestImprovement = splitDefinition->getImprovement();
                    bestSplit = splitDefinition;
                }
            }
        }

        if (bestImprovement == 0.0)
        {
            if (m_parameters->verbose)
                vlcMessage.Write("Can't split the tree any further.", 1);
            // No node/feature pair yields any improvement — the terminal
            // nodes most likely cannot be split further.
            return;
        }

        // Apply the winning split.
        int featureIndex = bestSplit->getFeatureIndex();
        bool isCategorical = (categoricalFeatures.find(featureIndex) != categoricalFeatures.end());

        shared_ptr<DecisionTreeNode> lhsChild     = splitter.createLhsChild(bestSplit);
        shared_ptr<DecisionTreeNode> rhsChild     = splitter.createRhsChild(bestSplit);
        shared_ptr<DecisionTreeNode> missingChild = splitter.createMissingChild(bestSplit);

        shared_ptr<DecisionTreeNode> nodeToSplit = bestSplit->getNodeToSplit();
        nodeToSplit->defineSplit(bestSplit, lhsChild, rhsChild, missingChild);

        // The children replace the split node in the splittable set.
        nodesToSplit.erase(nodeToSplit);
        nodesToSplit.insert(lhsChild);
        nodesToSplit.insert(rhsChild);
        nodesToSplit.insert(missingChild);

        // The split node is deliberately NOT removed from m_terminalNodes:
        // for categorical splits an unseen category can still land on it
        // at prediction time, so it must keep an F increment. The
        // isCategorical flag was computed for that (currently disabled)
        // continuous-only erase.
        (void) isCategorical;
        m_terminalNodes.insert(lhsChild);
        m_terminalNodes.insert(rhsChild);
        m_terminalNodes.insert(missingChild);

        if (m_parameters->verbose)
        {
            vlcMessage.Write("Level " + lexical_cast<string>(k+1) + ": Split on feature "
                + m_data->getFeatures().at(featureIndex) + " at "
                + lexical_cast<string>(bestSplit->getSplitValue()) + ". Improvement: "
                + lexical_cast<string>(bestImprovement));
        }
    }
}
|
257
|
+
|
258
|
+
|
259
|
+
// Grow a tree whose top rfToLevel levels are built random-forest style
// (fitting Y directly); Z is then reset to the boosting residuals and one
// final gradient-boosted level of splits is added beneath the RF leaves.
void GBMEstimator::constructGenerousDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments, int rfToLevel)
{
    // The RF portion fits the raw response, so point Z at Y for now.
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
        experiment->setZ(experiment->getY());

    // Weighted totals of the (now Y-valued) response for the root node.
    double sumZ = 0.0, sumW = 0.0;
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        const double weight = experiment->getWeight();
        sumW += weight;
        sumZ += weight * experiment->getZ();
    }

    m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));

    m_terminalNodes.clear();
    m_terminalNodes.insert(m_decisionTreeHead);
    vector<shared_ptr<DecisionTreeNode> > nodesToSplit;
    nodesToSplit.push_back(m_decisionTreeHead);
    vector<shared_ptr<DecisionTreeNode> > nextNodesToSplit;

    // Categorical-vs-continuous handling is delegated to the splitter.
    set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();
    (void) categoricalFeatures;

    NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);

    // Breadth-first random-forest growth down to rfToLevel.
    for (int level = 0; level < rfToLevel; ++level)
    {
        BOOST_FOREACH(shared_ptr<DecisionTreeNode>& nodeToSplit, nodesToSplit)
        {
            vector<int> featuresToConsider = getRandomFeatureList();
            vector<shared_ptr<DecisionTreeNode> > children = splitter.splitNode(nodeToSplit, featuresToConsider);
            BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
            {
                nextNodesToSplit.push_back(child);
                m_terminalNodes.insert(child);
            }
        }
        nodesToSplit = nextNodesToSplit;
        nextNodesToSplit.clear();
    }

    // RF levels built; switch Z back to the boosting residuals.
    updateZ(experiments);

    // One gradient-boosted split pass beneath each RF leaf.
    BOOST_FOREACH(shared_ptr<DecisionTreeNode>& nodeToSplit, nodesToSplit)
    {
        nodeToSplit->updateSums();   // sums were against Y; refresh for residual Z
        vector<int> featuresToConsider = getRandomFeatureList();
        vector<shared_ptr<DecisionTreeNode> > children = splitter.splitNode(nodeToSplit, featuresToConsider);
        BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
            m_terminalNodes.insert(child);
    }
}
|
319
|
+
|
320
|
+
// Pick up to tryMVariables distinct feature indices uniformly at random
// from m_featureIndices. Returns the selected feature indices (unordered).
// The loop terminates because numberToChoose is capped at the number of
// available features.
vector<int> GBMEstimator::getRandomFeatureList()
{
    vector<int> randomlySelectedFeatures;
    map<int, bool> featureChosen;   // which positions have already been drawn

    const unsigned int numberToChoose =
        std::min((int) m_featureIndices.size(), m_parameters->tryMVariables);

    while (randomlySelectedFeatures.size() < numberToChoose)
    {
        // Scale rand() into [0, size) via a double, avoiding modulo bias.
        long r = rand();
        long index = r * (1.0 / (RAND_MAX + 1L)) * m_featureIndices.size();

        // BUG FIX: the original condition was `!featureChosen[index] == 1`,
        // which parses as `(!featureChosen[index]) == 1`. It happens to
        // behave correctly for bool values but is a classic precedence trap
        // (compilers warn with -Wlogical-not-parentheses); written plainly.
        if (!featureChosen[index])
        {
            featureChosen[index] = true;
            randomlySelectedFeatures.push_back(m_featureIndices.at(index));
        }
    }
    return randomlySelectedFeatures;
}
|
340
|
+
|
341
|
+
|
342
|
+
|
343
|
+
void GBMEstimator::calculateFIncrementPerDecisionTreeNode()
|
344
|
+
{
|
345
|
+
m_FIncrements.clear();
|
346
|
+
BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_terminalNodes)
|
347
|
+
{
|
348
|
+
vector<shared_ptr<DecisionTreeExperiment> > experiments = node->getExperiments();
|
349
|
+
|
350
|
+
double fIncrement = m_gbmCalculator->computeFIncrement(experiments);
|
351
|
+
m_FIncrements[node] = fIncrement * m_parameters->shrinkageFactor;
|
352
|
+
}
|
353
|
+
}
|
354
|
+
|
355
|
+
void GBMEstimator::applyFIncrementToInBagExperiments()
|
356
|
+
{
|
357
|
+
// THIS IS BAD, because when bagging with replacement, you can increment the same record twice!!
|
358
|
+
// we know which experiments are in-bag because they're stored by the terminal nodes!
|
359
|
+
BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_terminalNodes)
|
360
|
+
{
|
361
|
+
double increment = m_FIncrements[node];
|
362
|
+
|
363
|
+
BOOST_FOREACH(shared_ptr<DecisionTreeExperiment> experiment, node->getExperiments())
|
364
|
+
{
|
365
|
+
// vlcMessage.Write("experiment->getExperimentIndex() => " + ToString(experiment->getExperimentIndex()));
|
366
|
+
// vlcMessage.Write("increment => " + ToString(increment));
|
367
|
+
experiment->incrementF(increment);
|
368
|
+
}
|
369
|
+
}
|
370
|
+
}
|
371
|
+
|
372
|
+
// Route every experiment through the freshly grown tree and apply the F
// increment of the terminal node it lands on. Experiments that fall off
// the tree (e.g. an unseen categorical value) fall back to the head node.
// Throws if a reached node has no recorded increment (internal error).
void GBMEstimator::applyFIncrementToExperiments(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        shared_ptr<DecisionTreeNode> terminalNode =
            (m_decisionTreeHead->isTerminalNode()
                 ? m_decisionTreeHead
                 : m_decisionTreeHead->getTerminalNodeForExperiment(experiment));

        if (terminalNode.get() == 0)
            terminalNode = m_decisionTreeHead;   // fell off the tree: use the root

        if (m_FIncrements.find(terminalNode) == m_FIncrements.end())
            throw std::runtime_error("We have no increment for this terminal node!!");

        experiment->incrementF(m_FIncrements[terminalNode]);
    }
}
|
390
|
+
|
391
|
+
void GBMEstimator::initialiseGBMExperimentData()
|
392
|
+
{
|
393
|
+
populateInitialF();
|
394
|
+
updatePredictions(m_decisionTreeExperiments);
|
395
|
+
updateZ(m_decisionTreeExperiments);
|
396
|
+
}
|
397
|
+
|
398
|
+
// In verbose mode, log the calculator's deviance over the experiments.
void GBMEstimator::reportDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    if (!m_parameters->verbose)
        return;
    vlcMessage.Write("Deviance: " + lexical_cast<string>(m_gbmCalculator->calculateDeviance(experiments)));
}
|
403
|
+
|
404
|
+
void GBMEstimator::deleteRedundantData()
|
405
|
+
{
|
406
|
+
m_decisionTreeHead->clearExperimentsWithinTree();
|
407
|
+
}
|
408
|
+
|
409
|
+
// map<int, vector<shared_ptr<DecisionTreeExperiment> > >
|
410
|
+
// GBMEstimator::partitionSortedExperiments(shared_ptr<SplitDefinition> splitDefinition,
|
411
|
+
// Partition partition)
|
412
|
+
// {
|
413
|
+
// map<int, vector<shared_ptr<DecisionTreeExperiment> > >& sortedExperiments =
|
414
|
+
// splitDefinition->getNodeToSplit()->getSortedExperiments();
|
415
|
+
//
|
416
|
+
// map<int, vector<shared_ptr<DecisionTreeExperiment> > > partitionSortedExperiments;
|
417
|
+
//
|
418
|
+
// BOOST_FOREACH(auto& e, sortedExperiments)
|
419
|
+
// {
|
420
|
+
// int featureIndex = e.first;
|
421
|
+
// vector<shared_ptr<DecisionTreeExperiment> >& experimentsForFeature = e.second;
|
422
|
+
//
|
423
|
+
// if (experimentsForFeature.size() == 0)
|
424
|
+
// continue;
|
425
|
+
// partitionSortedExperiments[featureIndex] = partitionExperiments(experimentsForFeature, splitDefinition, partition);
|
426
|
+
// }
|
427
|
+
// return partitionSortedExperiments;
|
428
|
+
// }
|
429
|
+
|
430
|
+
|
431
|
+
|
432
|
+
// void GBMEstimator::sortTrainingExperiments()
|
433
|
+
// {
|
434
|
+
// BOOST_FOREACH(auto& featureIndex, m_featureIndices)
|
435
|
+
// {
|
436
|
+
// vector<shared_ptr<DecisionTreeExperiment> > experiments = m_trainingExperiments;
|
437
|
+
// featureSorter.featureIndexToSort = featureIndex;
|
438
|
+
//
|
439
|
+
// sort(experiments.begin(), experiments.end(), featureSorter);
|
440
|
+
// m_sortedTrainingExperiments[featureIndex] = experiments;
|
441
|
+
// }
|
442
|
+
// }
|
443
|
+
|
444
|
+
// map<int, vector<shared_ptr<DecisionTreeExperiment> > > GBMEstimator::bagSortedExperiments(vector<shared_ptr<DecisionTreeExperiment> >& baggedExperiments)
|
445
|
+
// {
|
446
|
+
// vector<bool> inBag(m_data->getExperiments().size(), false);
|
447
|
+
//
|
448
|
+
// BOOST_FOREACH(auto& experiment, baggedExperiments)
|
449
|
+
// inBag.at(experiment->getExperimentIndex()) = true;
|
450
|
+
//
|
451
|
+
// map<int, vector<shared_ptr<DecisionTreeExperiment> > > sortedExperiments;
|
452
|
+
// BOOST_FOREACH(auto& e, m_sortedTrainingExperiments)
|
453
|
+
// {
|
454
|
+
// int featureIndex = e.first;
|
455
|
+
// vector<shared_ptr<DecisionTreeExperiment> >& experiments = e.second;
|
456
|
+
//
|
457
|
+
// vector<shared_ptr<DecisionTreeExperiment> >& baggedSortedExperiments = sortedExperiments[featureIndex];
|
458
|
+
// baggedSortedExperiments.reserve(baggedExperiments.size());
|
459
|
+
// BOOST_FOREACH(auto& experiment, experiments)
|
460
|
+
// {
|
461
|
+
// if (inBag.at(experiment->getExperimentIndex()))
|
462
|
+
// baggedSortedExperiments.push_back(experiment);
|
463
|
+
// }
|
464
|
+
// }
|
465
|
+
// return sortedExperiments;
|
466
|
+
// }
|
467
|
+
|
468
|
+
void GBMEstimator::initializeEstimator()
|
469
|
+
{
|
470
|
+
m_missingValueDefined = m_data->missingValueDefined();
|
471
|
+
if (m_missingValueDefined)
|
472
|
+
m_missingValue = m_data->getMissingValue();
|
473
|
+
|
474
|
+
constructFeatureIndices();
|
475
|
+
// sortTrainingExperiments();
|
476
|
+
|
477
|
+
if (m_parameters->distribution == GAUSSIAN)
|
478
|
+
m_gbmCalculator = make_shared<GaussianCalculator>();
|
479
|
+
else if (m_parameters->distribution == BERNOULLI)
|
480
|
+
m_gbmCalculator = make_shared<BernoulliCalculator>();
|
481
|
+
}
|
482
|
+
|
483
|
+
struct FeatureInteractionSorter
|
484
|
+
{
|
485
|
+
FeatureInteractionSorter()
|
486
|
+
{}
|
487
|
+
|
488
|
+
bool operator() (FeatureInteraction a, FeatureInteraction b)
|
489
|
+
{
|
490
|
+
return a.secondarySplitDefinition->getImprovement() > b.secondarySplitDefinition->getImprovement();
|
491
|
+
}
|
492
|
+
} featureInteractionSorter;
|
493
|
+
|
494
|
+
// Search for pairwise feature interactions: split the root once on every
// feature ("primary" splits), then within each resulting child evaluate a
// split on every other feature ("secondary" splits). Interactions are
// ranked by the secondary split's improvement; at most howMany are
// returned (fewer if fewer were found).
vector<FeatureInteraction> GBMEstimator::findInteractions(int howMany)
{
    vlcMessage.Write("Finding interactions!");

    vector<shared_ptr<DecisionTreeExperiment> >& experiments = m_decisionTreeExperiments;

    vlcMessage.Write("Initialising");
    initialiseGBMExperimentData();

    // The primary (top-level) splits fit the raw response, so set Z = Y.
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        experiment->setZ(experiment->getY());
    }

    // Weighted totals of the (now Y-valued) response for the root node.
    double sumZ = 0.0, sumW = 0.0;
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
    {
        double w = e->getWeight();
        sumW += w;
        sumZ += w * e->getZ();
    }

    vlcMessage.Write("Creating head");
    m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));

    // Categorical-vs-continuous handling is delegated to the splitter.
    set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();
    (void) categoricalFeatures;

    NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);

    // Children of the root split on each feature, keyed by feature index.
    map<int, vector<shared_ptr<DecisionTreeNode> > > topLevelSplits;

    vector<int> singleFeature;
    singleFeature.push_back(0);

    vlcMessage.Write("Creating top level splits");
    vector<string>& featureNames = m_data->getFeatures();
    BOOST_FOREACH(int& featureIndex, m_featureIndices)
    {
        vlcMessage.Write("Top level " + lexical_cast<string>(featureIndex));
        // Find the best split definition for this feature index alone.
        singleFeature.at(0) = featureIndex;
        topLevelSplits[featureIndex] = splitter.splitNode(m_decisionTreeHead, singleFeature);
        if (featureNames.at(featureIndex) == "Quan_4")
        {
            // Debug trace for a specific feature of interest (left as-is
            // from the original; emitted only when that feature exists).
            shared_ptr<SplitDefinition> def = topLevelSplits[featureIndex].at(0)->getParentSplitDefinition();
            vlcMessage.Write("Splitting Quan_4");
            vlcMessage.Write("Imp: " + lexical_cast<string>(def->getImprovement()));
            vlcMessage.Write("Split value " + lexical_cast<string>(def->getSplitValue()));
            vlcMessage.Write("LhsSumZ: " + lexical_cast<string>(def->getLhsSumZ()));
            vlcMessage.Write("LhsSumW: " + lexical_cast<string>(def->getLhsSumW()));
            vlcMessage.Write("RhsSumZ: " + lexical_cast<string>(def->getRhsSumZ()));
            vlcMessage.Write("RhsSumW: " + lexical_cast<string>(def->getRhsSumW()));
            vlcMessage.Write("MissingSumZ: " + lexical_cast<string>(def->getMissingSumZ()));
            vlcMessage.Write("MissingSumW: " + lexical_cast<string>(def->getMissingSumW()));
        }
    }

    vlcMessage.Write("Updating Z");
    // Secondary splits fit the boosting residuals, so reset Z.
    updateZ(m_decisionTreeExperiments);

    vlcMessage.Write("Allocating mem");
    vector<FeatureInteraction> featureInteractions;
    featureInteractions.reserve(m_featureIndices.size() * m_featureIndices.size() * 3);

    typedef pair<int, vector<shared_ptr<DecisionTreeNode> > > ElementType;
    BOOST_FOREACH(ElementType e, topLevelSplits)
    {
        int primaryFeatureIndex = e.first;
        vector<shared_ptr<DecisionTreeNode> > children = e.second;

        if (children.size() == 0)
            continue;   // the primary feature produced no valid split

        vlcMessage.Write("Secondary splits on " + lexical_cast<string>(primaryFeatureIndex));
        shared_ptr<SplitDefinition> primarySplitDefinition = children.at(0)->getParentSplitDefinition();
        BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
        {
            if (child->getSumW() == 0)
                continue;

            // Sums were computed against Y; refresh for the residual Z.
            child->updateSums();

            BOOST_FOREACH(int& secondaryFeatureIndex, m_featureIndices)
            {
                if (secondaryFeatureIndex == primaryFeatureIndex)
                    continue;

                shared_ptr<SplitDefinition> secondarySplitDefinition = splitter.createSplitDefinition(child, secondaryFeatureIndex);
                if (secondarySplitDefinition.get() == 0)
                    continue;

                FeatureInteraction interaction(primarySplitDefinition, secondarySplitDefinition, child->getPartition());
                featureInteractions.push_back(interaction);
            }
        }
    }
    vlcMessage.Write("Sorting...");

    sort(featureInteractions.begin(), featureInteractions.end(), featureInteractionSorter);

    // BUG FIX: clamp howMany — the original unconditionally returned
    // [begin, begin + howMany), which is undefined behaviour when fewer
    // than howMany interactions were found, or when howMany is negative.
    size_t resultCount = featureInteractions.size();
    if (howMany >= 0 && (size_t) howMany < resultCount)
        resultCount = (size_t) howMany;
    return vector<FeatureInteraction>(featureInteractions.begin(), featureInteractions.begin() + resultCount);
}
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#include "MachineLearning/GBM/GBMOutput.h"
|
2
|
+
#include "MachineLearning/GBM/BernoulliCalculator.h"
|
3
|
+
#include "MachineLearning/GBM/GaussianCalculator.h"
|
4
|
+
#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
|
5
|
+
#include "MachineLearning/DecisionTree/DecisionTreeNode.h"
|
6
|
+
|
7
|
+
#include <boost/foreach.hpp>
|
8
|
+
#include <boost/make_shared.hpp>
|
9
|
+
using boost::make_shared;
|
10
|
+
|
11
|
+
// Construct the output container for a GBM run, selecting the calculator
// that matches the configured distribution.
GBMOutput::GBMOutput(MLData* trainingData, vector<int> trainingExperimentIndicies, shared_ptr<GBMParameters> parameters)
    : MLOutput(trainingData, trainingExperimentIndicies), m_parameters(parameters), m_useMeanY(false)
{
    if (m_parameters->distribution == GAUSSIAN)
        m_gbmCalculator = make_shared<GaussianCalculator>();
    else if (m_parameters->distribution == BERNOULLI)
        m_gbmCalculator = make_shared<BernoulliCalculator>();
    // NOTE(review): any other distribution value leaves m_gbmCalculator
    // empty — presumably unreachable; worth a guard upstream. TODO confirm.
}
|
19
|
+
|
20
|
+
// Nothing to release explicitly: members are containers and shared_ptrs.
GBMOutput::~GBMOutput()
{
}
|
24
|
+
|
25
|
+
// Record the root of one fitted tree. Tree i pairs with the increment map
// pushed by the matching addFIncrements() call.
void GBMOutput::addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node)
{
    m_headNodes.push_back(node);
}
|
29
|
+
|
30
|
+
// Score a single experiment: wrap it as a DecisionTreeExperiment, run the
// full boosted ensemble over it, and return the resulting prediction.
double GBMOutput::predictForExperiment(shared_ptr<MLExperiment> experiment)
{
    shared_ptr<DecisionTreeExperiment> dtExperiment = make_shared<DecisionTreeExperiment>(experiment);
    setPredictionForDecisionTreeExperiment(dtExperiment);
    return dtExperiment->getPrediction();
}
|
36
|
+
|
37
|
+
// Record the per-terminal-node F increments for the most recent tree.
void GBMOutput::addFIncrements(map<shared_ptr<DecisionTreeNode>, double> fIncrements)
{
    m_fIncrements.push_back(fIncrements);
}
|
41
|
+
|
42
|
+
void GBMOutput::setMeanY(double y)
|
43
|
+
{
|
44
|
+
m_meanY = y;
|
45
|
+
m_useMeanY = true;
|
46
|
+
}
|
47
|
+
|
48
|
+
int GBMOutput::getNumTrees()
|
49
|
+
{
|
50
|
+
return (int) m_headNodes.size();
|
51
|
+
}
|
52
|
+
|
53
|
+
// Accessor for the parameters this output was produced with.
shared_ptr<GBMParameters> GBMOutput::getParameters()
{
    return m_parameters;
}
|
57
|
+
|
58
|
+
void GBMOutput::capTrees( int numTrees )
|
59
|
+
{
|
60
|
+
m_headNodes.resize(numTrees);
|
61
|
+
m_fIncrements.resize(numTrees);
|
62
|
+
}
|
63
|
+
|
64
|
+
// Run one experiment through every stored tree, accumulating the F
// increment of the terminal node it reaches in each, then convert the
// final F into a prediction. Throws if a reached node has no recorded
// increment (internal inconsistency between trees and increment maps).
void GBMOutput::setPredictionForDecisionTreeExperiment( shared_ptr<DecisionTreeExperiment> experiment )
{
    // Start F from the recorded baseline, or from the experiment's own
    // initial prediction when the training data defined one.
    double initialPrediction = (m_useMeanY ? m_meanY : experiment->getPrediction());
    experiment->setF(m_gbmCalculator->calculateF(initialPrediction));

    int index = -1;
    BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_headNodes)
    {
        ++index;

        // Route the experiment to a leaf; it can fall off the tree (e.g.
        // an unseen categorical value), in which case the root is used.
        shared_ptr<DecisionTreeNode> terminalNode =
            (node->isTerminalNode() ? node : node->getTerminalNodeForExperiment(experiment));
        if (terminalNode.get() == 0)
            terminalNode = node;

        if (m_fIncrements.at(index).find(terminalNode) == m_fIncrements.at(index).end())
            throw std::runtime_error("We have no increment for this terminal node!!");

        experiment->incrementF(m_fIncrements.at(index)[terminalNode]);
    }
    experiment->setPrediction(m_gbmCalculator->calculatePrediction(experiment->getF()));
}
|