ml4r 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33):
  1. data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
  9. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
  10. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
  11. data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
  13. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
  14. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
  15. data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
  16. data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
  17. data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
  18. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
  19. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
  20. data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
  21. data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
  22. data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
  23. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
  24. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
  25. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
  26. data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
  27. data/ext/ml4r/ml4r.cpp +34 -0
  28. data/ext/ml4r/ml4r_wrap.cpp +15727 -0
  29. data/ext/ml4r/utils/MathUtils.cpp +204 -0
  30. data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
  31. data/ext/ml4r/utils/Utils.cpp +14 -0
  32. data/ext/ml4r/utils/VlcMessage.cpp +3 -0
  33. metadata +33 -1
@@ -0,0 +1,601 @@
1
+ #include "MachineLearning/GBM/GBMEstimator.h"
2
+ #include "MachineLearning/GBM/GBMParameters.h"
3
+ #include "MachineLearning/GBM/GBMOutput.h"
4
+ #include "MachineLearning/GBM/BernoulliCalculator.h"
5
+ #include "MachineLearning/GBM/GaussianCalculator.h"
6
+ #include "MachineLearning/MLData/MLData.h"
7
+ #include "MachineLearning/MLUtils.h"
8
+ #include "MachineLearning/DecisionTree/SplitDefinition.h"
9
+ #include "MachineLearning/DecisionTree/NodeSplitterCategorical.h"
10
+ #include "MachineLearning/DecisionTree/NodeSplitterContinuous.h"
11
+ #include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
12
+ #include "MachineLearning/DecisionTree/DecisionTreeNode.h"
13
+ #include "MachineLearning/DecisionTree/FeatureInteraction.h"
14
+
15
+ #include <algorithm>
16
+ #include <boost/foreach.hpp>
17
+ #include <boost/make_shared.hpp>
18
+ #include <boost/lexical_cast.hpp>
19
+ using boost::lexical_cast;
20
+ using boost::make_shared;
21
+ using std::make_pair;
22
+
23
+ #include "utils/VlcMessage.h"
24
+
25
+ GBMEstimator::GBMEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<GBMParameters> parameters)
26
+ : MLEstimator(data, experiments), m_parameters(parameters)
27
+ {
28
+ m_decisionTreeExperiments.reserve(experiments.size());
29
+ BOOST_FOREACH(shared_ptr<MLExperiment>& experiment, experiments)
30
+ m_decisionTreeExperiments.push_back(make_shared<DecisionTreeExperiment>(experiment));
31
+
32
+ vector<int> experimentIndicies;
33
+ experimentIndicies.reserve(experiments.size());
34
+ BOOST_FOREACH(shared_ptr<MLExperiment>& experiment, experiments)
35
+ experimentIndicies.push_back(experiment->getExperimentIndex());
36
+
37
+ m_output = shared_ptr<GBMOutput>(new GBMOutput(m_data, experimentIndicies, m_parameters));
38
+
39
+ initializeEstimator();
40
+ }
41
+
42
// Nothing to release explicitly: all members are shared_ptr/RAII owned.
GBMEstimator::~GBMEstimator() {}
43
+
44
+ shared_ptr<MLOutput> GBMEstimator::estimate()
45
+ {
46
+ initialiseGBMExperimentData();
47
+
48
+ for (int iteration = 0; iteration < m_parameters->numIterations; ++iteration)
49
+ {
50
+ if (m_parameters->verbose)
51
+ vlcMessage.Begin((string("Iteration ") + lexical_cast<string>(iteration + 1)).c_str());
52
+
53
+ performIteration();
54
+
55
+ if (m_parameters->verbose)
56
+ vlcMessage.End();
57
+ }
58
+ return shared_ptr<MLOutput>(m_output);
59
+ }
60
+
61
+
62
+ void GBMEstimator::estimateMore(int numTrees)
63
+ {
64
+ int numberOfExistingTrees = m_output->getNumTrees();
65
+
66
+ for (int iteration = 0; iteration < numTrees; ++iteration)
67
+ {
68
+ if (m_parameters->verbose)
69
+ vlcMessage.Begin(string("Iteration ") + lexical_cast<string>(numberOfExistingTrees + iteration + 1));
70
+
71
+ performIteration();
72
+
73
+ if (m_parameters->verbose)
74
+ vlcMessage.End();
75
+ }
76
+ }
77
+
78
+
79
// One boosting iteration: bag the experiments, grow a tree on the in-bag set,
// compute and apply per-leaf F increments to ALL experiments, then refresh
// predictions and pseudo-residuals. Step order is load-bearing — each stage
// consumes the previous stage's state.
void GBMEstimator::performIteration()
{
    // update Z based on latest F
    vector<shared_ptr<DecisionTreeExperiment> >& experiments = m_decisionTreeExperiments;

    // Number of in-bag experiments; bagFraction is a double, so this truncates.
    long bagSize = m_parameters->bagFraction * m_decisionTreeExperiments.size();

    pair<vector<shared_ptr<DecisionTreeExperiment> >,vector<shared_ptr<DecisionTreeExperiment> > > inAndOutOfBagExperiments =
        MLUtils::bagObjectsWithoutReplacement<shared_ptr<DecisionTreeExperiment> >(m_decisionTreeExperiments, (int) bagSize);

    if (m_parameters->verbose)
        vlcMessage.Begin("Constructing decision tree");

    // "greedy" grows best-first; otherwise the first levels are grown
    // random-forest style before boosting (see constructGenerousDecisionTree).
    if (m_parameters->greedy)
        constructDecisionTree(inAndOutOfBagExperiments.first);
    else
        constructGenerousDecisionTree(inAndOutOfBagExperiments.first, m_parameters->rfToLevel);

    m_output->addHeadDecisionTreeNode(m_decisionTreeHead);

    if (m_parameters->verbose)
        vlcMessage.End();

    // update F
    calculateFIncrementPerDecisionTreeNode();
    m_output->addFIncrements(m_FIncrements);

    // Deliberately NOT applyFIncrementToInBagExperiments() — that variant can
    // double-increment records when bagging with replacement.
    // applyFIncrementToInBagExperiments();
    applyFIncrementToExperiments(experiments);

    // update predictions and Z
    updatePredictions(experiments);
    updateZ(experiments);
    reportDeviance(experiments);
    deleteRedundantData();
}
115
+
116
+ void GBMEstimator::constructFeatureIndices()
117
+ {
118
+ BOOST_FOREACH(string& feature, m_parameters->featuresToRun)
119
+ {
120
+ // note that in a given run, we may not "run" with all loaded variables.
121
+ m_featureIndices.push_back(m_data->getFeatureIndex(feature));
122
+ }
123
+ }
124
+
125
// Seeds every experiment's F with the distribution-specific starting value.
// When the data carries no user-supplied initial predictions the calculator
// gives all experiments the same baseline, so the first experiment's
// prediction is recorded as "mean Y" on the output for use at scoring time.
// NOTE(review): assumes m_decisionTreeExperiments is non-empty — front()
// would otherwise be undefined behaviour; confirm callers guarantee this.
void GBMEstimator::populateInitialF()
{
    m_gbmCalculator->populateInitialF(m_decisionTreeExperiments, m_data->initialPredictionsDefined());

    if (!m_data->initialPredictionsDefined())
        m_output->setMeanY(m_decisionTreeExperiments.front()->getPrediction());
}
132
+
133
// Delegates to the distribution calculator to recompute each experiment's
// working response Z (pseudo-residual) from its current prediction.
void GBMEstimator::updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    m_gbmCalculator->updateZ(experiments);
}
137
+
138
// Delegates to the distribution calculator to map each experiment's F back
// onto the prediction scale (e.g. through the link function for Bernoulli).
void GBMEstimator::updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    // convert from F to prediction
    m_gbmCalculator->updatePredictions(experiments);
}
143
+
144
// Grows a tree best-first ("greedy" mode): at each of growKDecisionTreeNodes
// steps, every current frontier node is evaluated against a fresh random
// feature subset and the single split with the largest improvement anywhere
// in the tree is applied. Split evaluations are memoised per (node, feature)
// so re-visited pairs are not recomputed. Results land in m_decisionTreeHead
// and m_terminalNodes.
void GBMEstimator::constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    // create a head DecisionTreeNode
    double sumZ = 0.0, sumW = 0.0;
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
    {
        double w = e->getWeight();
        sumW += w;
        sumZ += w * e->getZ();
    }
    m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));

    // m_decisionTreeHead->setSortedExperiments(bagSortedExperiments(experiments));

    m_terminalNodes.clear();
    m_terminalNodes.insert(m_decisionTreeHead);
    set<shared_ptr<DecisionTreeNode> > nodesToSplit;
    nodesToSplit.insert(m_decisionTreeHead);

    // map from a decision tree node and feature index to a potential split definition
    map<pair<shared_ptr<DecisionTreeNode>, int>, shared_ptr<SplitDefinition> > potentialSplitDefinitions;
    set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();

    NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);

    for (int k = 0; k < m_parameters->growKDecisionTreeNodes; ++k)
    {
        // choose M variables to test splitting on
        // find terminal node with best improvement for any of those variables
        vector<int> featuresToConsider = getRandomFeatureList();
        // pair<shared_ptr<DecisionTreeNode>, int> bestNodeFeature;
        shared_ptr<SplitDefinition> bestSplit;
        double bestImprovement = 0.0;

        BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, nodesToSplit)
        {
            // A zero-weight node has nothing to gain from splitting.
            if (node->getSumW() == 0)
                continue;

            BOOST_FOREACH(int featureIndex, featuresToConsider)
            {
                pair<shared_ptr<DecisionTreeNode>, int> e = make_pair(node, featureIndex);

                // Compute (and cache) the best split of this node on this feature.
                if (potentialSplitDefinitions.find(e) == potentialSplitDefinitions.end())
                {
                    if (Utils::hasElement(categoricalFeatures, featureIndex))
                        potentialSplitDefinitions[e] = splitter.createCategoricalSplitDefinition(node, featureIndex);
                    else
                        potentialSplitDefinitions[e] = splitter.createContinuousSplitDefinition(node, featureIndex);
                }

                shared_ptr<SplitDefinition> splitDefinition = potentialSplitDefinitions[e];

                if (!splitDefinition.get()) // it returned an invalid
                    continue;


                if (splitDefinition->getImprovement() > bestImprovement)
                {
                    bestImprovement = splitDefinition->getImprovement();
                    bestSplit = splitDefinition;
                    // bestNodeFeature = e;
                }
            }
        }

        if (bestImprovement == 0.0)
        {
            if (m_parameters->verbose)
                vlcMessage.Write("Can't split the tree any further.", 1);

            return; // we obviously didn't get any love out of our terminal nodes - probably means they
                    // can't split any further.
        }
        // we now have our best split, so do it!!!

        int featureIndex = bestSplit->getFeatureIndex();
        bool isCategorical = (categoricalFeatures.find(featureIndex) != categoricalFeatures.end());

        shared_ptr<DecisionTreeNode> lhsChild = splitter.createLhsChild(bestSplit);
        shared_ptr<DecisionTreeNode> rhsChild = splitter.createRhsChild(bestSplit);
        shared_ptr<DecisionTreeNode> missingChild = splitter.createMissingChild(bestSplit);

        shared_ptr<DecisionTreeNode> nodeToSplit = bestSplit->getNodeToSplit();

        nodeToSplit->defineSplit(bestSplit, lhsChild,rhsChild,missingChild);

        // finally, remove the node we just split from the terminal nodes, and add the children
        nodesToSplit.erase(nodeToSplit);
        nodesToSplit.insert(lhsChild);
        nodesToSplit.insert(rhsChild);
        nodesToSplit.insert(missingChild);

        // if it's categorical, there is a chance a new category will come along, and we won't be able to split on it.
        // which would make this a potential terminal node.
        // so only erase if it's continuous
        //if (!isCategorical)
        //    m_terminalNodes.erase(nodeToSplit);
        // NOTE(review): as written, split nodes are never erased from
        // m_terminalNodes, so interior nodes remain in the "terminal" set —
        // downstream code appears to rely on this fallback routing; confirm.

        m_terminalNodes.insert(lhsChild);
        m_terminalNodes.insert(rhsChild);
        m_terminalNodes.insert(missingChild);

        if (m_parameters->verbose)
        {
            vlcMessage.Write("Level " + lexical_cast<string>(k+1) + ": Split on feature "
                + m_data->getFeatures().at(featureIndex) + " at "
                + lexical_cast<string>(bestSplit->getSplitValue()) + ". Improvement: "
                + lexical_cast<string>(bestImprovement));
        }

    }
}
257
+
258
+
259
// Grows a "generous" tree: the first rfToLevel levels are split breadth-first
// random-forest style (fitting Y directly), after which Z is reset to the
// boosting residuals and the remaining frontier is split one more time in
// gradient-boosting mode. Results land in m_decisionTreeHead/m_terminalNodes.
void GBMEstimator::constructGenerousDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments, int rfToLevel)
{
    // set Z to Y for RF part
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        experiment->setZ(experiment->getY());
    }

    // Weighted totals for the root node.
    double sumZ = 0.0, sumW = 0.0;
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
    {
        double w = e->getWeight();
        sumW += w;
        sumZ += w * e->getZ();
    }

    m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));

    // m_decisionTreeHead->setSortedExperiments(bagSortedExperiments(experiments));

    m_terminalNodes.clear();
    m_terminalNodes.insert(m_decisionTreeHead);
    vector<shared_ptr<DecisionTreeNode> > nodesToSplit;
    nodesToSplit.push_back(m_decisionTreeHead);
    vector<shared_ptr<DecisionTreeNode> > nextNodesToSplit;

    // map from a decision tree node and feature index to a potential split definition
    set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();

    NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);

    // Breadth-first RF phase: split every frontier node at each level, each
    // with its own random feature subset.
    for (int level = 0; level < rfToLevel; ++level)
    {
        BOOST_FOREACH(shared_ptr<DecisionTreeNode>& nodeToSplit, nodesToSplit)
        {
            vector<int> featuresToConsider = getRandomFeatureList();
            vector<shared_ptr<DecisionTreeNode> > children = splitter.splitNode(nodeToSplit, featuresToConsider);
            BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
            {
                nextNodesToSplit.push_back(child);
                m_terminalNodes.insert(child);
            }
        }
        nodesToSplit = nextNodesToSplit;
        nextNodesToSplit.clear();
    }
    // have successfully built a random forest to depth rfLevels
    // reset Z to residuals
    updateZ(experiments);

    // now gradient boost on the nodesToSplit
    BOOST_FOREACH(shared_ptr<DecisionTreeNode>& nodeToSplit, nodesToSplit)
    {
        // Z changed above, so refresh this node's cached sumZ/sumW first.
        nodeToSplit->updateSums();
        vector<int> featuresToConsider = getRandomFeatureList();
        vector<shared_ptr<DecisionTreeNode> > children = splitter.splitNode(nodeToSplit, featuresToConsider);
        BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
            m_terminalNodes.insert(child);
    }
}
319
+
320
+ vector<int> GBMEstimator::getRandomFeatureList()
321
+ {
322
+ vector<int> randomlySelectedFeatures;
323
+ map<int, bool> featureChosen;
324
+
325
+ unsigned int numberToChoose = std::min((int)m_featureIndices.size(), m_parameters->tryMVariables);
326
+
327
+ while (randomlySelectedFeatures.size() < numberToChoose)
328
+ {
329
+ long r = rand();
330
+ long index = r * (1.0 / (RAND_MAX + 1L)) * m_featureIndices.size();
331
+ if (!featureChosen[index] == 1)
332
+ {
333
+ featureChosen[index] = 1;
334
+ randomlySelectedFeatures.push_back(m_featureIndices.at(index));
335
+ }
336
+
337
+ }
338
+ return randomlySelectedFeatures;
339
+ }
340
+
341
+
342
+
343
// For every terminal node of the tree just grown, asks the calculator for the
// optimal F-increment over the node's experiments and stores it — scaled by
// the shrinkage (learning-rate) factor — in m_FIncrements.
void GBMEstimator::calculateFIncrementPerDecisionTreeNode()
{
    m_FIncrements.clear();
    BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_terminalNodes)
    {
        // NOTE(review): this copies the node's experiment vector; a reference
        // may be cheaper if getExperiments() returns one — confirm signature.
        vector<shared_ptr<DecisionTreeExperiment> > experiments = node->getExperiments();

        double fIncrement = m_gbmCalculator->computeFIncrement(experiments);
        m_FIncrements[node] = fIncrement * m_parameters->shrinkageFactor;
    }
}
354
+
355
// Applies each terminal node's F-increment to the experiments stored on that
// node (the in-bag experiments only). Currently unused — performIteration()
// calls applyFIncrementToExperiments() instead, for the reason noted below.
void GBMEstimator::applyFIncrementToInBagExperiments()
{
    // THIS IS BAD, because when bagging with replacement, you can increment the same record twice!!
    // we know which experiments are in-bag because they're stored by the terminal nodes!
    BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_terminalNodes)
    {
        double increment = m_FIncrements[node];

        BOOST_FOREACH(shared_ptr<DecisionTreeExperiment> experiment, node->getExperiments())
        {
            // vlcMessage.Write("experiment->getExperimentIndex() => " + ToString(experiment->getExperimentIndex()));
            // vlcMessage.Write("increment => " + ToString(increment));
            experiment->incrementF(increment);
        }
    }
}
371
+
372
// Routes every experiment through the current tree to its terminal node and
// applies that node's F-increment. Unlike applyFIncrementToInBagExperiments,
// this touches ALL experiments exactly once (out-of-bag included).
// @throws std::runtime_error if a routed node has no stored increment.
void GBMEstimator::applyFIncrementToExperiments(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        // get appropriate node
        shared_ptr<DecisionTreeNode> terminalNode =
            (m_decisionTreeHead->isTerminalNode() ? m_decisionTreeHead : m_decisionTreeHead->getTerminalNodeForExperiment(experiment));

        // Fall back to the head node when routing fails (presumably an
        // unseen categorical level — confirm against DecisionTreeNode).
        if (terminalNode.get() == 0)
            terminalNode = m_decisionTreeHead;

        if (m_FIncrements.find(terminalNode) == m_FIncrements.end())
            throw std::runtime_error("We have no increment for this terminal node!!");

        double incrementF = m_FIncrements[terminalNode];
        experiment->incrementF(incrementF);
    }
}
390
+
391
// One-off setup before boosting: seed F, derive predictions from F, then
// compute the initial pseudo-residuals Z. Order matters — each step consumes
// the previous step's result.
void GBMEstimator::initialiseGBMExperimentData()
{
    populateInitialF();
    updatePredictions(m_decisionTreeExperiments);
    updateZ(m_decisionTreeExperiments);
}
397
+
398
// Logs the current model deviance (as defined by the distribution
// calculator) over the supplied experiments; no-op unless verbose.
void GBMEstimator::reportDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    if (m_parameters->verbose)
        vlcMessage.Write("Deviance: " + lexical_cast<string>(m_gbmCalculator->calculateDeviance(experiments)));
}
403
+
404
// Frees the per-node experiment lists once an iteration is finished; the
// tree structure itself (needed for scoring) is retained.
void GBMEstimator::deleteRedundantData()
{
    m_decisionTreeHead->clearExperimentsWithinTree();
}
408
+
409
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > >
410
+ // GBMEstimator::partitionSortedExperiments(shared_ptr<SplitDefinition> splitDefinition,
411
+ // Partition partition)
412
+ // {
413
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > >& sortedExperiments =
414
+ // splitDefinition->getNodeToSplit()->getSortedExperiments();
415
+ //
416
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > > partitionSortedExperiments;
417
+ //
418
+ // BOOST_FOREACH(auto& e, sortedExperiments)
419
+ // {
420
+ // int featureIndex = e.first;
421
+ // vector<shared_ptr<DecisionTreeExperiment> >& experimentsForFeature = e.second;
422
+ //
423
+ // if (experimentsForFeature.size() == 0)
424
+ // continue;
425
+ // partitionSortedExperiments[featureIndex] = partitionExperiments(experimentsForFeature, splitDefinition, partition);
426
+ // }
427
+ // return partitionSortedExperiments;
428
+ // }
429
+
430
+
431
+
432
+ // void GBMEstimator::sortTrainingExperiments()
433
+ // {
434
+ // BOOST_FOREACH(auto& featureIndex, m_featureIndices)
435
+ // {
436
+ // vector<shared_ptr<DecisionTreeExperiment> > experiments = m_trainingExperiments;
437
+ // featureSorter.featureIndexToSort = featureIndex;
438
+ //
439
+ // sort(experiments.begin(), experiments.end(), featureSorter);
440
+ // m_sortedTrainingExperiments[featureIndex] = experiments;
441
+ // }
442
+ // }
443
+
444
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > > GBMEstimator::bagSortedExperiments(vector<shared_ptr<DecisionTreeExperiment> >& baggedExperiments)
445
+ // {
446
+ // vector<bool> inBag(m_data->getExperiments().size(), false);
447
+ //
448
+ // BOOST_FOREACH(auto& experiment, baggedExperiments)
449
+ // inBag.at(experiment->getExperimentIndex()) = true;
450
+ //
451
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > > sortedExperiments;
452
+ // BOOST_FOREACH(auto& e, m_sortedTrainingExperiments)
453
+ // {
454
+ // int featureIndex = e.first;
455
+ // vector<shared_ptr<DecisionTreeExperiment> >& experiments = e.second;
456
+ //
457
+ // vector<shared_ptr<DecisionTreeExperiment> >& baggedSortedExperiments = sortedExperiments[featureIndex];
458
+ // baggedSortedExperiments.reserve(baggedExperiments.size());
459
+ // BOOST_FOREACH(auto& experiment, experiments)
460
+ // {
461
+ // if (inBag.at(experiment->getExperimentIndex()))
462
+ // baggedSortedExperiments.push_back(experiment);
463
+ // }
464
+ // }
465
+ // return sortedExperiments;
466
+ // }
467
+
468
// Shared constructor-time setup: caches the missing-value sentinel, maps the
// configured feature names to indices, and picks the distribution calculator.
// NOTE(review): if distribution is neither GAUSSIAN nor BERNOULLI,
// m_gbmCalculator stays null and later calls will dereference it — confirm
// that upstream validation guarantees one of the two.
void GBMEstimator::initializeEstimator()
{
    m_missingValueDefined = m_data->missingValueDefined();
    if (m_missingValueDefined)
        m_missingValue = m_data->getMissingValue();

    constructFeatureIndices();
    // sortTrainingExperiments();

    if (m_parameters->distribution == GAUSSIAN)
        m_gbmCalculator = make_shared<GaussianCalculator>();
    else if (m_parameters->distribution == BERNOULLI)
        m_gbmCalculator = make_shared<BernoulliCalculator>();
}
482
+
483
+ struct FeatureInteractionSorter
484
+ {
485
+ FeatureInteractionSorter()
486
+ {}
487
+
488
+ bool operator() (FeatureInteraction a, FeatureInteraction b)
489
+ {
490
+ return a.secondarySplitDefinition->getImprovement() > b.secondarySplitDefinition->getImprovement();
491
+ }
492
+ } featureInteractionSorter;
493
+
494
+ vector<FeatureInteraction> GBMEstimator::findInteractions(int howMany)
495
+ {
496
+ vlcMessage.Write("Finding interactions!");
497
+
498
+ vector<shared_ptr<DecisionTreeExperiment> >& experiments = m_decisionTreeExperiments;
499
+
500
+ vlcMessage.Write("Initialising");
501
+
502
+ initialiseGBMExperimentData();
503
+ // now reset Z to be Y
504
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
505
+ {
506
+ experiment->setZ(experiment->getY());
507
+ }
508
+
509
+ double sumZ = 0.0, sumW = 0.0;
510
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
511
+ {
512
+ double w = e->getWeight();
513
+ sumW += w;
514
+ sumZ += w * e->getZ();
515
+ }
516
+
517
+ vlcMessage.Write("Creating head");
518
+ m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));
519
+
520
+ // map from a decision tree node and feature index to a potential split definition
521
+ set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();
522
+
523
+ NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);
524
+
525
+ map<int, vector<shared_ptr<DecisionTreeNode> > > topLevelSplits;
526
+
527
+ vector<int> singleFeature;
528
+ singleFeature.push_back(0);
529
+
530
+ vlcMessage.Write("Creating top level splits");
531
+ vector<string>& featureNames = m_data->getFeatures();
532
+ BOOST_FOREACH(int& featureIndex, m_featureIndices)
533
+ {
534
+ vlcMessage.Write("Top level " + lexical_cast<string>(featureIndex));
535
+ // find the best split definition for this feature index
536
+ singleFeature.at(0) = featureIndex;
537
+ topLevelSplits[featureIndex] = splitter.splitNode(m_decisionTreeHead, singleFeature);
538
+ if (featureNames.at(featureIndex) == "Quan_4")
539
+ {
540
+ shared_ptr<SplitDefinition> def = topLevelSplits[featureIndex].at(0)->getParentSplitDefinition();
541
+ vlcMessage.Write("Splitting Quan_4");
542
+ vlcMessage.Write("Imp: " + lexical_cast<string>(def->getImprovement()));
543
+ vlcMessage.Write("Split value " + lexical_cast<string>(def->getSplitValue()));
544
+ vlcMessage.Write("LhsSumZ: " + lexical_cast<string>(def->getLhsSumZ()));
545
+ vlcMessage.Write("LhsSumW: " + lexical_cast<string>(def->getLhsSumW()));
546
+ vlcMessage.Write("RhsSumZ: " + lexical_cast<string>(def->getRhsSumZ()));
547
+ vlcMessage.Write("RhsSumW: " + lexical_cast<string>(def->getRhsSumW()));
548
+ vlcMessage.Write("MissingSumZ: " + lexical_cast<string>(def->getMissingSumZ()));
549
+ vlcMessage.Write("MissingSumW: " + lexical_cast<string>(def->getMissingSumW()));
550
+ }
551
+ }
552
+
553
+ vlcMessage.Write("Updating Z");
554
+ // reset Z to residuals
555
+ updateZ(m_decisionTreeExperiments);
556
+
557
+ vlcMessage.Write("Allocating mem");
558
+ vector<FeatureInteraction> featureInteractions;
559
+ featureInteractions.reserve(m_featureIndices.size() * m_featureIndices.size() * 3);
560
+
561
+
562
+ typedef pair<int, vector<shared_ptr<DecisionTreeNode> > > ElementType;
563
+ BOOST_FOREACH(ElementType e, topLevelSplits)
564
+ {
565
+ int primaryFeatureIndex = e.first;
566
+ vector<shared_ptr<DecisionTreeNode> > children = e.second;
567
+
568
+ if (children.size() == 0)
569
+ continue;
570
+
571
+ vlcMessage.Write("Secondary splits on " + lexical_cast<string>(primaryFeatureIndex));
572
+ shared_ptr<SplitDefinition> primarySplitDefinition = children.at(0)->getParentSplitDefinition();
573
+ BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
574
+ {
575
+ if (child->getSumW() == 0)
576
+ continue;
577
+
578
+ // update sumZ / sumW
579
+ child->updateSums();
580
+
581
+ BOOST_FOREACH(int& secondaryFeatureIndex, m_featureIndices)
582
+ {
583
+ if (secondaryFeatureIndex == primaryFeatureIndex)
584
+ continue;
585
+
586
+ shared_ptr<SplitDefinition> secondarySplitDefinition = splitter.createSplitDefinition(child,secondaryFeatureIndex);
587
+ if (secondarySplitDefinition.get() == 0)
588
+ continue;
589
+
590
+ // vlcMessage.Write("Secondary split on with imp " + ToString(secondarySplitDefinition->getImprovement()) + " lhsSumZ: " + ToString(secondarySplitDefinition->getLhsSumZ()) + " lhsSumW: " + ToString(secondarySplitDefinition->getLhsSumW()) + " rhsSumZ: " + ToString(secondarySplitDefinition->getRhsSumZ()) + " rhsSumW: " + ToString(secondarySplitDefinition->getRhsSumW()) + " missingSumZ: " + ToString(secondarySplitDefinition->getMissingSumZ()) + " missingSumZ: " + ToString(secondarySplitDefinition->getMissingSumW()));
591
+ FeatureInteraction interaction(primarySplitDefinition,secondarySplitDefinition,child->getPartition());
592
+ featureInteractions.push_back(interaction);
593
+ }
594
+ }
595
+ }
596
+ vlcMessage.Write("Sorting...");
597
+
598
+ sort(featureInteractions.begin(), featureInteractions.end(), featureInteractionSorter);
599
+
600
+ return vector<FeatureInteraction>(featureInteractions.begin(), featureInteractions.begin() + howMany);
601
+ }
@@ -0,0 +1,86 @@
1
+ #include "MachineLearning/GBM/GBMOutput.h"
2
+ #include "MachineLearning/GBM/BernoulliCalculator.h"
3
+ #include "MachineLearning/GBM/GaussianCalculator.h"
4
+ #include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
5
+ #include "MachineLearning/DecisionTree/DecisionTreeNode.h"
6
+
7
+ #include <boost/foreach.hpp>
8
+ #include <boost/make_shared.hpp>
9
+ using boost::make_shared;
10
+
11
// Builds the output container for a GBM run. The calculator mirrors the one
// chosen by GBMEstimator so scoring uses the same link/inverse-link.
// NOTE(review): an unrecognised distribution leaves m_gbmCalculator null,
// which setPredictionForDecisionTreeExperiment would then dereference.
GBMOutput::GBMOutput(MLData* trainingData, vector<int> trainingExperimentIndicies, shared_ptr<GBMParameters> parameters)
: MLOutput(trainingData, trainingExperimentIndicies), m_parameters(parameters), m_useMeanY(false)
{
    if (m_parameters->distribution == GAUSSIAN)
        m_gbmCalculator = make_shared<GaussianCalculator>();
    else if (m_parameters->distribution == BERNOULLI)
        m_gbmCalculator = make_shared<BernoulliCalculator>();
}
19
+
20
// Nothing to release explicitly: all members are shared_ptr/value owned.
GBMOutput::~GBMOutput()
{

}
24
+
25
// Appends the root of a newly grown tree; kept index-aligned with the
// per-tree increment maps added via addFIncrements.
void GBMOutput::addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node)
{
    m_headNodes.push_back(node);
}
29
+
30
+ double GBMOutput::predictForExperiment(shared_ptr<MLExperiment> experiment)
31
+ {
32
+ shared_ptr<DecisionTreeExperiment> dtExperiment = make_shared<DecisionTreeExperiment>(experiment);
33
+ setPredictionForDecisionTreeExperiment(dtExperiment);
34
+ return dtExperiment->getPrediction();
35
+ }
36
+
37
// Stores the per-terminal-node F-increments for the most recently added tree;
// m_fIncrements stays index-aligned with m_headNodes.
// NOTE(review): the map parameter is taken by value, copying it on every
// call — switching to const& would require a matching header change.
void GBMOutput::addFIncrements(map<shared_ptr<DecisionTreeNode>, double> fIncrements)
{
    m_fIncrements.push_back(fIncrements);
}
41
+
42
// Records the training baseline prediction and switches scoring to use it
// as the starting point (see setPredictionForDecisionTreeExperiment).
void GBMOutput::setMeanY(double y)
{
    m_meanY = y;
    m_useMeanY = true;
}
47
+
48
// Number of trees currently held (one head node per boosting iteration).
int GBMOutput::getNumTrees()
{
    return (int) m_headNodes.size();
}
52
+
53
// Returns the parameter set this output was trained with (shared ownership).
shared_ptr<GBMParameters> GBMOutput::getParameters()
{
    return m_parameters;
}
57
+
58
+ void GBMOutput::capTrees( int numTrees )
59
+ {
60
+ m_headNodes.resize(numTrees);
61
+ m_fIncrements.resize(numTrees);
62
+ }
63
+
64
// Replays the whole ensemble on one experiment: start F at the link of the
// baseline prediction (the recorded mean Y when no initial predictions were
// supplied), add each tree's increment for the terminal node the experiment
// lands in, then map the final F back to the prediction scale.
// @throws std::runtime_error if a routed node has no stored increment.
void GBMOutput::setPredictionForDecisionTreeExperiment( shared_ptr<DecisionTreeExperiment> experiment )
{
    // determine initial F
    double initialPrediction = (m_useMeanY ? m_meanY : experiment->getPrediction());

    experiment->setF(m_gbmCalculator->calculateF(initialPrediction));

    int index = -1;
    BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_headNodes)
    {
        ++index;
        // Route to a terminal node; fall back to the tree's head when routing
        // fails (presumably a categorical level unseen in training — confirm).
        shared_ptr<DecisionTreeNode> terminalNode = (node->isTerminalNode() ? node : node->getTerminalNodeForExperiment(experiment));
        if (terminalNode.get() == 0)
            terminalNode = node;

        if (m_fIncrements.at(index).find(terminalNode) == m_fIncrements.at(index).end())
            throw std::runtime_error("We have no increment for this terminal node!!");

        double incrementF = m_fIncrements.at(index)[terminalNode];
        experiment->incrementF(incrementF);
    }
    experiment->setPrediction(m_gbmCalculator->calculatePrediction(experiment->getF()));
}