ml4r 0.1.4 → 0.1.5

Files changed (33)
  1. data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
  9. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
  10. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
  11. data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
  13. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
  14. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
  15. data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
  16. data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
  17. data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
  18. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
  19. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
  20. data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
  21. data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
  22. data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
  23. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
  24. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
  25. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
  26. data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
  27. data/ext/ml4r/ml4r.cpp +34 -0
  28. data/ext/ml4r/ml4r_wrap.cpp +15727 -0
  29. data/ext/ml4r/utils/MathUtils.cpp +204 -0
  30. data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
  31. data/ext/ml4r/utils/Utils.cpp +14 -0
  32. data/ext/ml4r/utils/VlcMessage.cpp +3 -0
  33. metadata +33 -1
data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp
@@ -0,0 +1,601 @@
+ #include "MachineLearning/GBM/GBMEstimator.h"
+ #include "MachineLearning/GBM/GBMParameters.h"
+ #include "MachineLearning/GBM/GBMOutput.h"
+ #include "MachineLearning/GBM/BernoulliCalculator.h"
+ #include "MachineLearning/GBM/GaussianCalculator.h"
+ #include "MachineLearning/MLData/MLData.h"
+ #include "MachineLearning/MLUtils.h"
+ #include "MachineLearning/DecisionTree/SplitDefinition.h"
+ #include "MachineLearning/DecisionTree/NodeSplitterCategorical.h"
+ #include "MachineLearning/DecisionTree/NodeSplitterContinuous.h"
+ #include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
+ #include "MachineLearning/DecisionTree/DecisionTreeNode.h"
+ #include "MachineLearning/DecisionTree/FeatureInteraction.h"
+
+ #include <algorithm>
+ #include <boost/foreach.hpp>
+ #include <boost/make_shared.hpp>
+ #include <boost/lexical_cast.hpp>
+ using boost::lexical_cast;
+ using boost::make_shared;
+ using std::make_pair;
+
+ #include "utils/VlcMessage.h"
+
+ GBMEstimator::GBMEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<GBMParameters> parameters)
+     : MLEstimator(data, experiments), m_parameters(parameters)
+ {
+     m_decisionTreeExperiments.reserve(experiments.size());
+     BOOST_FOREACH(shared_ptr<MLExperiment>& experiment, experiments)
+         m_decisionTreeExperiments.push_back(make_shared<DecisionTreeExperiment>(experiment));
+
+     vector<int> experimentIndicies;
+     experimentIndicies.reserve(experiments.size());
+     BOOST_FOREACH(shared_ptr<MLExperiment>& experiment, experiments)
+         experimentIndicies.push_back(experiment->getExperimentIndex());
+
+     m_output = shared_ptr<GBMOutput>(new GBMOutput(m_data, experimentIndicies, m_parameters));
+
+     initializeEstimator();
+ }
+
+ GBMEstimator::~GBMEstimator() {}
+
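+ // estimate() runs numIterations boosting rounds; each round adds one
+ // decision tree (and its per-leaf F increments) to the GBMOutput.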
44
+ shared_ptr<MLOutput> GBMEstimator::estimate()
+ {
+     initialiseGBMExperimentData();
+
+     for (int iteration = 0; iteration < m_parameters->numIterations; ++iteration)
+     {
+         if (m_parameters->verbose)
+             vlcMessage.Begin((string("Iteration ") + lexical_cast<string>(iteration + 1)).c_str());
+
+         performIteration();
+
+         if (m_parameters->verbose)
+             vlcMessage.End();
+     }
+     return shared_ptr<MLOutput>(m_output);
+ }
+
+
+ void GBMEstimator::estimateMore(int numTrees)
+ {
+     int numberOfExistingTrees = m_output->getNumTrees();
+
+     for (int iteration = 0; iteration < numTrees; ++iteration)
+     {
+         if (m_parameters->verbose)
+             vlcMessage.Begin(string("Iteration ") + lexical_cast<string>(numberOfExistingTrees + iteration + 1));
+
+         performIteration();
+
+         if (m_parameters->verbose)
+             vlcMessage.End();
+     }
+ }
+
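+ // One boosting iteration: draw a bag of experiments without replacement,
+ // fit a decision tree to the current pseudo-residuals Z, compute a shrunken
+ // F-increment for each terminal node, apply the increments to all
+ // experiments, then refresh predictions, Z, and the reported deviance.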
79
+ void GBMEstimator::performIteration()
+ {
+     // update Z based on latest F
+     vector<shared_ptr<DecisionTreeExperiment> >& experiments = m_decisionTreeExperiments;
+
+     long bagSize = m_parameters->bagFraction * m_decisionTreeExperiments.size();
+
+     pair<vector<shared_ptr<DecisionTreeExperiment> >,vector<shared_ptr<DecisionTreeExperiment> > > inAndOutOfBagExperiments =
+         MLUtils::bagObjectsWithoutReplacement<shared_ptr<DecisionTreeExperiment> >(m_decisionTreeExperiments, (int) bagSize);
+
+     if (m_parameters->verbose)
+         vlcMessage.Begin("Constructing decision tree");
+
+     if (m_parameters->greedy)
+         constructDecisionTree(inAndOutOfBagExperiments.first);
+     else
+         constructGenerousDecisionTree(inAndOutOfBagExperiments.first, m_parameters->rfToLevel);
+
+     m_output->addHeadDecisionTreeNode(m_decisionTreeHead);
+
+     if (m_parameters->verbose)
+         vlcMessage.End();
+
+     // update F
+     calculateFIncrementPerDecisionTreeNode();
+     m_output->addFIncrements(m_FIncrements);
+
+     // applyFIncrementToInBagExperiments();
+     applyFIncrementToExperiments(experiments);
+
+     // update predictions and Z
+     updatePredictions(experiments);
+     updateZ(experiments);
+     reportDeviance(experiments);
+     deleteRedundantData();
+ }
+
+ void GBMEstimator::constructFeatureIndices()
+ {
+     BOOST_FOREACH(string& feature, m_parameters->featuresToRun)
+     {
+         // note that in a given run, we may not "run" with all loaded variables.
+         m_featureIndices.push_back(m_data->getFeatureIndex(feature));
+     }
+ }
+
+ void GBMEstimator::populateInitialF()
+ {
+     m_gbmCalculator->populateInitialF(m_decisionTreeExperiments, m_data->initialPredictionsDefined());
+
+     if (!m_data->initialPredictionsDefined())
+         m_output->setMeanY(m_decisionTreeExperiments.front()->getPrediction());
+ }
+
+ void GBMEstimator::updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+ {
+     m_gbmCalculator->updateZ(experiments);
+ }
+
+ void GBMEstimator::updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+ {
+     // convert from F to prediction
+     m_gbmCalculator->updatePredictions(experiments);
+ }
+
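+ // Greedy, best-first tree growth: for each of growKDecisionTreeNodes steps,
+ // sample a random feature subset, score a candidate split for every
+ // (frontier node, feature) pair (cached in potentialSplitDefinitions), and
+ // apply only the single split with the best improvement; stop early when no
+ // candidate improves on zero.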
144
+ void GBMEstimator::constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+ {
+     // create a head DecisionTreeNode
+     double sumZ = 0.0, sumW = 0.0;
+     BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
+     {
+         double w = e->getWeight();
+         sumW += w;
+         sumZ += w * e->getZ();
+     }
+     m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));
+
+     // m_decisionTreeHead->setSortedExperiments(bagSortedExperiments(experiments));
+
+     m_terminalNodes.clear();
+     m_terminalNodes.insert(m_decisionTreeHead);
+     set<shared_ptr<DecisionTreeNode> > nodesToSplit;
+     nodesToSplit.insert(m_decisionTreeHead);
+
+     // map from a decision tree node and feature index to a potential split definition
+     map<pair<shared_ptr<DecisionTreeNode>, int>, shared_ptr<SplitDefinition> > potentialSplitDefinitions;
+     set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();
+
+     NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);
+
+     for (int k = 0; k < m_parameters->growKDecisionTreeNodes; ++k)
+     {
+         // choose M variables to test splitting on
+         // find terminal node with best improvement for any of those variables
+         vector<int> featuresToConsider = getRandomFeatureList();
+         // pair<shared_ptr<DecisionTreeNode>, int> bestNodeFeature;
+         shared_ptr<SplitDefinition> bestSplit;
+         double bestImprovement = 0.0;
+
+         BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, nodesToSplit)
+         {
+             if (node->getSumW() == 0)
+                 continue;
+
+             BOOST_FOREACH(int featureIndex, featuresToConsider)
+             {
+                 pair<shared_ptr<DecisionTreeNode>, int> e = make_pair(node, featureIndex);
+
+                 if (potentialSplitDefinitions.find(e) == potentialSplitDefinitions.end())
+                 {
+                     if (Utils::hasElement(categoricalFeatures, featureIndex))
+                         potentialSplitDefinitions[e] = splitter.createCategoricalSplitDefinition(node, featureIndex);
+                     else
+                         potentialSplitDefinitions[e] = splitter.createContinuousSplitDefinition(node, featureIndex);
+                 }
+
+                 shared_ptr<SplitDefinition> splitDefinition = potentialSplitDefinitions[e];
+
+                 if (!splitDefinition.get()) // the splitter returned an invalid (null) split
+                     continue;
+
+                 if (splitDefinition->getImprovement() > bestImprovement)
+                 {
+                     bestImprovement = splitDefinition->getImprovement();
+                     bestSplit = splitDefinition;
+                     // bestNodeFeature = e;
+                 }
+             }
+         }
+
+         if (bestImprovement == 0.0)
+         {
+             if (m_parameters->verbose)
+                 vlcMessage.Write("Can't split the tree any further.", 1);
+
+             return; // we obviously didn't get any love out of our terminal nodes - probably means they can't split any further.
+         }
+         // we now have our best split, so do it!!!
+
+         int featureIndex = bestSplit->getFeatureIndex();
+         bool isCategorical = (categoricalFeatures.find(featureIndex) != categoricalFeatures.end());
+
+         shared_ptr<DecisionTreeNode> lhsChild = splitter.createLhsChild(bestSplit);
+         shared_ptr<DecisionTreeNode> rhsChild = splitter.createRhsChild(bestSplit);
+         shared_ptr<DecisionTreeNode> missingChild = splitter.createMissingChild(bestSplit);
+
+         shared_ptr<DecisionTreeNode> nodeToSplit = bestSplit->getNodeToSplit();
+
+         nodeToSplit->defineSplit(bestSplit, lhsChild, rhsChild, missingChild);
+
+         // finally, remove the node we just split from the terminal nodes, and add the children
+         nodesToSplit.erase(nodeToSplit);
+         nodesToSplit.insert(lhsChild);
+         nodesToSplit.insert(rhsChild);
+         nodesToSplit.insert(missingChild);
+
+         // if it's categorical, there is a chance a new category will come along, and we won't
+         // be able to split on it, which would make this a potential terminal node.
+         // so only erase if it's continuous
+         //if (!isCategorical)
+         //    m_terminalNodes.erase(nodeToSplit);
+
+         m_terminalNodes.insert(lhsChild);
+         m_terminalNodes.insert(rhsChild);
+         m_terminalNodes.insert(missingChild);
+
+         if (m_parameters->verbose)
+         {
+             vlcMessage.Write("Level " + lexical_cast<string>(k+1) + ": Split on feature "
+                 + m_data->getFeatures().at(featureIndex) + " at "
+                 + lexical_cast<string>(bestSplit->getSplitValue()) + ". Improvement: "
+                 + lexical_cast<string>(bestImprovement));
+         }
+     }
+ }
+
+
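+ // "Generous" (non-greedy) growth: the first rfToLevel levels are built like
+ // a random-forest tree against Z = Y; Z is then reset to the boosting
+ // residuals and each remaining frontier node is split once more in
+ // gradient-boosting fashion.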
259
+ void GBMEstimator::constructGenerousDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments, int rfToLevel)
+ {
+     // set Z to Y for RF part
+     BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
+     {
+         experiment->setZ(experiment->getY());
+     }
+
+     double sumZ = 0.0, sumW = 0.0;
+     BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
+     {
+         double w = e->getWeight();
+         sumW += w;
+         sumZ += w * e->getZ();
+     }
+
+     m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));
+
+     // m_decisionTreeHead->setSortedExperiments(bagSortedExperiments(experiments));
+
+     m_terminalNodes.clear();
+     m_terminalNodes.insert(m_decisionTreeHead);
+     vector<shared_ptr<DecisionTreeNode> > nodesToSplit;
+     nodesToSplit.push_back(m_decisionTreeHead);
+     vector<shared_ptr<DecisionTreeNode> > nextNodesToSplit;
+
+     set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();
+
+     NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);
+
+     for (int level = 0; level < rfToLevel; ++level)
+     {
+         BOOST_FOREACH(shared_ptr<DecisionTreeNode>& nodeToSplit, nodesToSplit)
+         {
+             vector<int> featuresToConsider = getRandomFeatureList();
+             vector<shared_ptr<DecisionTreeNode> > children = splitter.splitNode(nodeToSplit, featuresToConsider);
+             BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
+             {
+                 nextNodesToSplit.push_back(child);
+                 m_terminalNodes.insert(child);
+             }
+         }
+         nodesToSplit = nextNodesToSplit;
+         nextNodesToSplit.clear();
+     }
+     // we have successfully built a random forest to depth rfToLevel,
+     // so reset Z to the boosting residuals
+     updateZ(experiments);
+
+     // now gradient boost on the nodesToSplit
+     BOOST_FOREACH(shared_ptr<DecisionTreeNode>& nodeToSplit, nodesToSplit)
+     {
+         nodeToSplit->updateSums();
+         vector<int> featuresToConsider = getRandomFeatureList();
+         vector<shared_ptr<DecisionTreeNode> > children = splitter.splitNode(nodeToSplit, featuresToConsider);
+         BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
+             m_terminalNodes.insert(child);
+     }
+ }
+
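+ // Samples up to tryMVariables distinct features, without replacement, from
+ // the features configured for this run; rand() is scaled into an index in
+ // [0, m_featureIndices.size()).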
320
+ vector<int> GBMEstimator::getRandomFeatureList()
+ {
+     vector<int> randomlySelectedFeatures;
+     map<int, bool> featureChosen;
+
+     unsigned int numberToChoose = std::min((int)m_featureIndices.size(), m_parameters->tryMVariables);
+
+     while (randomlySelectedFeatures.size() < numberToChoose)
+     {
+         long r = rand();
+         long index = r * (1.0 / (RAND_MAX + 1L)) * m_featureIndices.size();
+         if (!featureChosen[index])
+         {
+             featureChosen[index] = true;
+             randomlySelectedFeatures.push_back(m_featureIndices.at(index));
+         }
+     }
+     return randomlySelectedFeatures;
+ }
+
+
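+ // Each terminal node's raw increment comes from the distribution calculator
+ // and is damped by shrinkageFactor (the learning rate). computeFIncrement
+ // itself is not part of this diff; conventionally it is the loss-optimal
+ // constant for the node, e.g. the weighted mean of Z under a Gaussian loss.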
343
+ void GBMEstimator::calculateFIncrementPerDecisionTreeNode()
+ {
+     m_FIncrements.clear();
+     BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_terminalNodes)
+     {
+         vector<shared_ptr<DecisionTreeExperiment> > experiments = node->getExperiments();
+
+         double fIncrement = m_gbmCalculator->computeFIncrement(experiments);
+         m_FIncrements[node] = fIncrement * m_parameters->shrinkageFactor;
+     }
+ }
+
+ void GBMEstimator::applyFIncrementToInBagExperiments()
+ {
+     // THIS IS BAD, because when bagging with replacement, you can increment the same record twice!!
+     // we know which experiments are in-bag because they're stored by the terminal nodes!
+     BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_terminalNodes)
+     {
+         double increment = m_FIncrements[node];
+
+         BOOST_FOREACH(shared_ptr<DecisionTreeExperiment> experiment, node->getExperiments())
+         {
+             // vlcMessage.Write("experiment->getExperimentIndex() => " + ToString(experiment->getExperimentIndex()));
+             // vlcMessage.Write("increment => " + ToString(increment));
+             experiment->incrementF(increment);
+         }
+     }
+ }
+
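+ // Routes every experiment (in-bag and out-of-bag) to its terminal node and
+ // applies that node's increment, falling back to the head node when the
+ // tree cannot route an experiment (e.g. an unseen categorical value).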
372
+ void GBMEstimator::applyFIncrementToExperiments(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+ {
+     BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
+     {
+         // get appropriate node
+         shared_ptr<DecisionTreeNode> terminalNode =
+             (m_decisionTreeHead->isTerminalNode() ? m_decisionTreeHead : m_decisionTreeHead->getTerminalNodeForExperiment(experiment));
+
+         if (terminalNode.get() == 0)
+             terminalNode = m_decisionTreeHead;
+
+         if (m_FIncrements.find(terminalNode) == m_FIncrements.end())
+             throw std::runtime_error("We have no increment for this terminal node!!");
+
+         double incrementF = m_FIncrements[terminalNode];
+         experiment->incrementF(incrementF);
+     }
+ }
+
+ void GBMEstimator::initialiseGBMExperimentData()
+ {
+     populateInitialF();
+     updatePredictions(m_decisionTreeExperiments);
+     updateZ(m_decisionTreeExperiments);
+ }
+
+ void GBMEstimator::reportDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
+ {
+     if (m_parameters->verbose)
+         vlcMessage.Write("Deviance: " + lexical_cast<string>(m_gbmCalculator->calculateDeviance(experiments)));
+ }
+
+ void GBMEstimator::deleteRedundantData()
+ {
+     m_decisionTreeHead->clearExperimentsWithinTree();
+ }
+
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > >
+ //     GBMEstimator::partitionSortedExperiments(shared_ptr<SplitDefinition> splitDefinition,
+ //                                              Partition partition)
+ // {
+ //     map<int, vector<shared_ptr<DecisionTreeExperiment> > >& sortedExperiments =
+ //         splitDefinition->getNodeToSplit()->getSortedExperiments();
+ //
+ //     map<int, vector<shared_ptr<DecisionTreeExperiment> > > partitionSortedExperiments;
+ //
+ //     BOOST_FOREACH(auto& e, sortedExperiments)
+ //     {
+ //         int featureIndex = e.first;
+ //         vector<shared_ptr<DecisionTreeExperiment> >& experimentsForFeature = e.second;
+ //
+ //         if (experimentsForFeature.size() == 0)
+ //             continue;
+ //         partitionSortedExperiments[featureIndex] = partitionExperiments(experimentsForFeature, splitDefinition, partition);
+ //     }
+ //     return partitionSortedExperiments;
+ // }
+
+
+ // void GBMEstimator::sortTrainingExperiments()
+ // {
+ //     BOOST_FOREACH(auto& featureIndex, m_featureIndices)
+ //     {
+ //         vector<shared_ptr<DecisionTreeExperiment> > experiments = m_trainingExperiments;
+ //         featureSorter.featureIndexToSort = featureIndex;
+ //
+ //         sort(experiments.begin(), experiments.end(), featureSorter);
+ //         m_sortedTrainingExperiments[featureIndex] = experiments;
+ //     }
+ // }
+
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > > GBMEstimator::bagSortedExperiments(vector<shared_ptr<DecisionTreeExperiment> >& baggedExperiments)
+ // {
+ //     vector<bool> inBag(m_data->getExperiments().size(), false);
+ //
+ //     BOOST_FOREACH(auto& experiment, baggedExperiments)
+ //         inBag.at(experiment->getExperimentIndex()) = true;
+ //
+ //     map<int, vector<shared_ptr<DecisionTreeExperiment> > > sortedExperiments;
+ //     BOOST_FOREACH(auto& e, m_sortedTrainingExperiments)
+ //     {
+ //         int featureIndex = e.first;
+ //         vector<shared_ptr<DecisionTreeExperiment> >& experiments = e.second;
+ //
+ //         vector<shared_ptr<DecisionTreeExperiment> >& baggedSortedExperiments = sortedExperiments[featureIndex];
+ //         baggedSortedExperiments.reserve(baggedExperiments.size());
+ //         BOOST_FOREACH(auto& experiment, experiments)
+ //         {
+ //             if (inBag.at(experiment->getExperimentIndex()))
+ //                 baggedSortedExperiments.push_back(experiment);
+ //         }
+ //     }
+ //     return sortedExperiments;
+ // }
+
+ void GBMEstimator::initializeEstimator()
+ {
+     m_missingValueDefined = m_data->missingValueDefined();
+     if (m_missingValueDefined)
+         m_missingValue = m_data->getMissingValue();
+
+     constructFeatureIndices();
+     // sortTrainingExperiments();
+
+     if (m_parameters->distribution == GAUSSIAN)
+         m_gbmCalculator = make_shared<GaussianCalculator>();
+     else if (m_parameters->distribution == BERNOULLI)
+         m_gbmCalculator = make_shared<BernoulliCalculator>();
+ }
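+ // The two calculators are not part of this diff. Under the conventional GBM
+ // formulation (Friedman, 2001) the Gaussian case uses the identity link,
+ // F = prediction and Z = y - F, while the Bernoulli case works on log-odds,
+ // F = log(p / (1 - p)) and p = 1 / (1 + exp(-F)).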
482
+
+ struct FeatureInteractionSorter
+ {
+     FeatureInteractionSorter()
+     {}
+
+     bool operator() (FeatureInteraction a, FeatureInteraction b)
+     {
+         return a.secondarySplitDefinition->getImprovement() > b.secondarySplitDefinition->getImprovement();
+     }
+ } featureInteractionSorter;
+
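+ // Ranks pairwise feature interactions: the root is split once on each
+ // feature, then every resulting child is test-split on every other feature,
+ // and the (primary, secondary) pairs are sorted by the improvement of the
+ // secondary split. Callers should keep howMany within the number of
+ // interactions found. The hard-coded "Quan_4" branch below is
+ // dataset-specific debug logging.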
494
+ vector<FeatureInteraction> GBMEstimator::findInteractions(int howMany)
+ {
+     vlcMessage.Write("Finding interactions!");
+
+     vector<shared_ptr<DecisionTreeExperiment> >& experiments = m_decisionTreeExperiments;
+
+     vlcMessage.Write("Initialising");
+
+     initialiseGBMExperimentData();
+     // now reset Z to be Y
+     BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
+     {
+         experiment->setZ(experiment->getY());
+     }
+
+     double sumZ = 0.0, sumW = 0.0;
+     BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
+     {
+         double w = e->getWeight();
+         sumW += w;
+         sumZ += w * e->getZ();
+     }
+
+     vlcMessage.Write("Creating head");
+     m_decisionTreeHead = shared_ptr<DecisionTreeNode>(new DecisionTreeNode(experiments, sumZ, sumW, ROOT, shared_ptr<SplitDefinition>()));
+
+     set<int>& categoricalFeatures = m_data->getCategoricalFeatureIndices();
+
+     NodeSplitter splitter(m_data, m_parameters->minObservations, m_parameters->scale);
+
+     map<int, vector<shared_ptr<DecisionTreeNode> > > topLevelSplits;
+
+     vector<int> singleFeature;
+     singleFeature.push_back(0);
+
+     vlcMessage.Write("Creating top level splits");
+     vector<string>& featureNames = m_data->getFeatures();
+     BOOST_FOREACH(int& featureIndex, m_featureIndices)
+     {
+         vlcMessage.Write("Top level " + lexical_cast<string>(featureIndex));
+         // find the best split definition for this feature index
+         singleFeature.at(0) = featureIndex;
+         topLevelSplits[featureIndex] = splitter.splitNode(m_decisionTreeHead, singleFeature);
+         if (featureNames.at(featureIndex) == "Quan_4")
+         {
+             shared_ptr<SplitDefinition> def = topLevelSplits[featureIndex].at(0)->getParentSplitDefinition();
+             vlcMessage.Write("Splitting Quan_4");
+             vlcMessage.Write("Imp: " + lexical_cast<string>(def->getImprovement()));
+             vlcMessage.Write("Split value " + lexical_cast<string>(def->getSplitValue()));
+             vlcMessage.Write("LhsSumZ: " + lexical_cast<string>(def->getLhsSumZ()));
+             vlcMessage.Write("LhsSumW: " + lexical_cast<string>(def->getLhsSumW()));
+             vlcMessage.Write("RhsSumZ: " + lexical_cast<string>(def->getRhsSumZ()));
+             vlcMessage.Write("RhsSumW: " + lexical_cast<string>(def->getRhsSumW()));
+             vlcMessage.Write("MissingSumZ: " + lexical_cast<string>(def->getMissingSumZ()));
+             vlcMessage.Write("MissingSumW: " + lexical_cast<string>(def->getMissingSumW()));
+         }
+     }
+
+     vlcMessage.Write("Updating Z");
+     // reset Z to residuals
+     updateZ(m_decisionTreeExperiments);
+
+     vlcMessage.Write("Allocating mem");
+     vector<FeatureInteraction> featureInteractions;
+     featureInteractions.reserve(m_featureIndices.size() * m_featureIndices.size() * 3);
+
+     typedef pair<int, vector<shared_ptr<DecisionTreeNode> > > ElementType;
+     BOOST_FOREACH(ElementType e, topLevelSplits)
+     {
+         int primaryFeatureIndex = e.first;
+         vector<shared_ptr<DecisionTreeNode> > children = e.second;
+
+         if (children.size() == 0)
+             continue;
+
+         vlcMessage.Write("Secondary splits on " + lexical_cast<string>(primaryFeatureIndex));
+         shared_ptr<SplitDefinition> primarySplitDefinition = children.at(0)->getParentSplitDefinition();
+         BOOST_FOREACH(shared_ptr<DecisionTreeNode>& child, children)
+         {
+             if (child->getSumW() == 0)
+                 continue;
+
+             // update sumZ / sumW
+             child->updateSums();
+
+             BOOST_FOREACH(int& secondaryFeatureIndex, m_featureIndices)
+             {
+                 if (secondaryFeatureIndex == primaryFeatureIndex)
+                     continue;
+
+                 shared_ptr<SplitDefinition> secondarySplitDefinition = splitter.createSplitDefinition(child, secondaryFeatureIndex);
+                 if (secondarySplitDefinition.get() == 0)
+                     continue;
+
+                 // vlcMessage.Write("Secondary split on with imp " + ToString(secondarySplitDefinition->getImprovement()) + " lhsSumZ: " + ToString(secondarySplitDefinition->getLhsSumZ()) + " lhsSumW: " + ToString(secondarySplitDefinition->getLhsSumW()) + " rhsSumZ: " + ToString(secondarySplitDefinition->getRhsSumZ()) + " rhsSumW: " + ToString(secondarySplitDefinition->getRhsSumW()) + " missingSumZ: " + ToString(secondarySplitDefinition->getMissingSumZ()) + " missingSumW: " + ToString(secondarySplitDefinition->getMissingSumW()));
+                 FeatureInteraction interaction(primarySplitDefinition, secondarySplitDefinition, child->getPartition());
+                 featureInteractions.push_back(interaction);
+             }
+         }
+     }
+     vlcMessage.Write("Sorting...");
+
+     sort(featureInteractions.begin(), featureInteractions.end(), featureInteractionSorter);
+
+     return vector<FeatureInteraction>(featureInteractions.begin(), featureInteractions.begin() + howMany);
+ }
data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp
@@ -0,0 +1,86 @@
+ #include "MachineLearning/GBM/GBMOutput.h"
+ #include "MachineLearning/GBM/BernoulliCalculator.h"
+ #include "MachineLearning/GBM/GaussianCalculator.h"
+ #include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
+ #include "MachineLearning/DecisionTree/DecisionTreeNode.h"
+
+ #include <boost/foreach.hpp>
+ #include <boost/make_shared.hpp>
+ using boost::make_shared;
+
+ GBMOutput::GBMOutput(MLData* trainingData, vector<int> trainingExperimentIndicies, shared_ptr<GBMParameters> parameters)
+     : MLOutput(trainingData, trainingExperimentIndicies), m_parameters(parameters), m_useMeanY(false)
+ {
+     if (m_parameters->distribution == GAUSSIAN)
+         m_gbmCalculator = make_shared<GaussianCalculator>();
+     else if (m_parameters->distribution == BERNOULLI)
+         m_gbmCalculator = make_shared<BernoulliCalculator>();
+ }
+
+ GBMOutput::~GBMOutput()
+ {
+ }
+
+ void GBMOutput::addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node)
+ {
+     m_headNodes.push_back(node);
+ }
+
+ double GBMOutput::predictForExperiment(shared_ptr<MLExperiment> experiment)
+ {
+     shared_ptr<DecisionTreeExperiment> dtExperiment = make_shared<DecisionTreeExperiment>(experiment);
+     setPredictionForDecisionTreeExperiment(dtExperiment);
+     return dtExperiment->getPrediction();
+ }
+
+ void GBMOutput::addFIncrements(map<shared_ptr<DecisionTreeNode>, double> fIncrements)
+ {
+     m_fIncrements.push_back(fIncrements);
+ }
+
+ void GBMOutput::setMeanY(double y)
+ {
+     m_meanY = y;
+     m_useMeanY = true;
+ }
+
+ int GBMOutput::getNumTrees()
+ {
+     return (int) m_headNodes.size();
+ }
+
+ shared_ptr<GBMParameters> GBMOutput::getParameters()
+ {
+     return m_parameters;
+ }
+
+ void GBMOutput::capTrees(int numTrees)
+ {
+     m_headNodes.resize(numTrees);
+     m_fIncrements.resize(numTrees);
+ }
+
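+ // Replays the ensemble for one experiment: start from the stored mean of Y
+ // (or the experiment's own initial prediction), map it through the link
+ // function to an initial F, add each tree's stored terminal-node increment,
+ // and invert the link to obtain the final prediction.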
64
+ void GBMOutput::setPredictionForDecisionTreeExperiment(shared_ptr<DecisionTreeExperiment> experiment)
+ {
+     // determine initial F
+     double initialPrediction = (m_useMeanY ? m_meanY : experiment->getPrediction());
+
+     experiment->setF(m_gbmCalculator->calculateF(initialPrediction));
+
+     int index = -1;
+     BOOST_FOREACH(shared_ptr<DecisionTreeNode> node, m_headNodes)
+     {
+         ++index;
+         shared_ptr<DecisionTreeNode> terminalNode = (node->isTerminalNode() ? node : node->getTerminalNodeForExperiment(experiment));
+         if (terminalNode.get() == 0)
+             terminalNode = node;
+
+         if (m_fIncrements.at(index).find(terminalNode) == m_fIncrements.at(index).end())
+             throw std::runtime_error("We have no increment for this terminal node!!");
+
+         double incrementF = m_fIncrements.at(index)[terminalNode];
+         experiment->incrementF(incrementF);
+     }
+     experiment->setPrediction(m_gbmCalculator->calculatePrediction(experiment->getF()));
+ }
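
For orientation, a minimal usage sketch of how the two classes above fit together. It is not part of the gem: the default-constructibility of GBMParameters and the availability of predictForExperiment on MLOutput are assumptions based only on the signatures visible in this diff, and the MLData/MLExperiment setup is elided.

#include "MachineLearning/GBM/GBMEstimator.h"
#include "MachineLearning/GBM/GBMOutput.h"
#include "MachineLearning/GBM/GBMParameters.h"
#include <boost/make_shared.hpp>

// Hypothetical driver; `data` and `experiments` are assumed to have been
// built elsewhere (e.g. via the MLData reader classes, not shown here).
shared_ptr<MLOutput> runGbm(MLData* data, vector<shared_ptr<MLExperiment> > experiments)
{
    // Only fields actually referenced in GBMEstimator.cpp above are set here.
    shared_ptr<GBMParameters> params = boost::make_shared<GBMParameters>();
    params->distribution    = GAUSSIAN;  // or BERNOULLI for 0/1 targets
    params->numIterations   = 100;       // trees in the initial fit
    params->bagFraction     = 0.5;       // fraction of experiments bagged per tree
    params->shrinkageFactor = 0.05;      // learning rate applied to each leaf increment

    GBMEstimator estimator(data, experiments, params);
    shared_ptr<MLOutput> output = estimator.estimate(); // runs the boosting loop
    estimator.estimateMore(50);                         // optionally extends the same ensemble

    // Score one experiment (assumes predictForExperiment is exposed on MLOutput).
    double prediction = output->predictForExperiment(experiments.front());
    return output;
}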