ml4r 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
  9. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
  10. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
  11. data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
  13. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
  14. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
  15. data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
  16. data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
  17. data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
  18. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
  19. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
  20. data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
  21. data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
  22. data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
  23. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
  24. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
  25. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
  26. data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
  27. data/ext/ml4r/ml4r.cpp +34 -0
  28. data/ext/ml4r/ml4r_wrap.cpp +15727 -0
  29. data/ext/ml4r/utils/MathUtils.cpp +204 -0
  30. data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
  31. data/ext/ml4r/utils/Utils.cpp +14 -0
  32. data/ext/ml4r/utils/VlcMessage.cpp +3 -0
  33. metadata +33 -1
@@ -0,0 +1,142 @@
1
+ #include "MachineLearning/DecisionTree/SplitDefinition.h"
2
+ #include <stdexcept>
3
+ using std::runtime_error;
4
+
5
+ SplitDefinition::SplitDefinition( shared_ptr<DecisionTreeNode> nodeToSplit,
6
+ int featureIndex,
7
+ set<double>& lhsCategories,
8
+ set<double>& rhsCategories,
9
+ double lhsSumZ,
10
+ double lhsSumW,
11
+ int lhsCount,
12
+ double rhsSumZ,
13
+ double rhsSumW,
14
+ int rhsCount,
15
+ double missingSumZ,
16
+ double missingSumW,
17
+ int missingCount,
18
+ double improvement)
19
+ : m_nodeToSplit(nodeToSplit), m_splitFeatureIndex(featureIndex), m_lhsCategories(lhsCategories), m_rhsCategories(rhsCategories),
20
+ m_lhsSumZ(lhsSumZ), m_lhsSumW(lhsSumW), m_lhsCount(lhsCount),
21
+ m_rhsSumZ(rhsSumZ), m_rhsSumW(rhsSumW), m_rhsCount(rhsCount),
22
+ m_missingSumZ(missingSumZ), m_missingSumW(missingSumW), m_missingCount(missingCount),
23
+ m_improvement(improvement)
24
+ {
25
+ m_splitValue = -1;
26
+ m_featureIsCategorical = true;
27
+ }
28
+
29
+ SplitDefinition::SplitDefinition( shared_ptr<DecisionTreeNode> nodeToSplit,
30
+ int featureIndex,
31
+ double splitValue,
32
+ double lhsSumZ,
33
+ double lhsSumW,
34
+ int lhsCount,
35
+ double rhsSumZ,
36
+ double rhsSumW,
37
+ int rhsCount,
38
+ double missingSumZ,
39
+ double missingSumW,
40
+ int missingCount,
41
+ double improvement)
42
+ : m_nodeToSplit(nodeToSplit), m_splitFeatureIndex(featureIndex), m_splitValue(splitValue),
43
+ m_lhsSumZ(lhsSumZ), m_lhsSumW(lhsSumW), m_lhsCount(lhsCount),
44
+ m_rhsSumZ(rhsSumZ), m_rhsSumW(rhsSumW), m_rhsCount(rhsCount),
45
+ m_missingSumZ(missingSumZ), m_missingSumW(missingSumW), m_missingCount(missingCount),
46
+ m_improvement(improvement)
47
+ {
48
+ m_featureIsCategorical = false;
49
+ }
50
+
51
+ SplitDefinition::~SplitDefinition()
52
+ {
53
+
54
+ }
55
+
56
+ double SplitDefinition::getImprovement()
57
+ {
58
+ return m_improvement;
59
+ }
60
+
61
+ shared_ptr<DecisionTreeNode> SplitDefinition::getNodeToSplit()
62
+ {
63
+ return m_nodeToSplit;
64
+ }
65
+
66
+ int SplitDefinition::getFeatureIndex()
67
+ {
68
+ return m_splitFeatureIndex;
69
+ }
70
+
71
+ set<double>& SplitDefinition::getLhsCategories()
72
+ {
73
+ if (m_lhsCategories.empty() && m_missingCount == 0)
74
+ throw std::runtime_error("LhsCategories are empty!");
75
+
76
+ return m_lhsCategories;
77
+ }
78
+
79
+ set<double>& SplitDefinition::getRhsCategories()
80
+ {
81
+ if (m_rhsCategories.empty() && m_missingCount == 0)
82
+ throw std::runtime_error("RhsCategories are empty!");
83
+
84
+ return m_rhsCategories;
85
+ }
86
+
87
+ double SplitDefinition::getSplitValue()
88
+ {
89
+ return m_splitValue;
90
+ }
91
+
92
+ int SplitDefinition::getLhsExperimentCount()
93
+ {
94
+ return m_lhsCount;
95
+ }
96
+
97
+ bool SplitDefinition::isCategorical()
98
+ {
99
+ return m_featureIsCategorical;
100
+ }
101
+
102
+ int SplitDefinition::getRhsExperimentCount()
103
+ {
104
+ return m_rhsCount;
105
+ }
106
+
107
+ int SplitDefinition::getMissingExperimentCount()
108
+ {
109
+ return m_missingCount;
110
+ }
111
+
112
+ double SplitDefinition::getLhsSumZ()
113
+ {
114
+ return m_lhsSumZ;
115
+ }
116
+
117
+ double SplitDefinition::getLhsSumW()
118
+ {
119
+ return m_lhsSumW;
120
+ }
121
+
122
+ double SplitDefinition::getRhsSumZ()
123
+ {
124
+ return m_rhsSumZ;
125
+ }
126
+
127
+ double SplitDefinition::getRhsSumW()
128
+ {
129
+ return m_rhsSumW;
130
+ }
131
+
132
+ double SplitDefinition::getMissingSumZ()
133
+ {
134
+ return m_missingSumZ;
135
+ }
136
+
137
+ double SplitDefinition::getMissingSumW()
138
+ {
139
+ return m_missingSumW;
140
+ }
141
+
142
+
@@ -0,0 +1,95 @@
1
+ #include "MachineLearning/GBM/BernoulliCalculator.h"
2
+ #include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
3
+
4
+ #include <cmath>
5
+ #include <boost/foreach.hpp>
6
+
7
+ BernoulliCalculator::BernoulliCalculator()
8
+ {}
9
+
10
+ BernoulliCalculator::~BernoulliCalculator()
11
+ {}
12
+
13
+ double BernoulliCalculator::calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
14
+ {
15
+ double sumL = 0.0;
16
+ double sumW = 0.0;
17
+
18
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
19
+ {
20
+ double f = e->getF();
21
+ sumL += e->getWeight() * (e->getY() * f - log(1.0 + exp(f)));
22
+ sumW += e->getWeight();
23
+ }
24
+ return -2.0 * sumL / sumW;
25
+ }
26
+
27
+ void BernoulliCalculator::populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions)
28
+ {
29
+ if (!useInitialPredictions)
30
+ {
31
+ double sumY = 0.0, sumWeight = 0.0;
32
+
33
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
34
+ {
35
+ sumY += e->getY() * e->getWeight();
36
+ sumWeight += e->getWeight();
37
+ }
38
+
39
+ double meanY = sumY / sumWeight;
40
+
41
+ // the output object needs to know this value for applying to new experiments
42
+ // m_output->setMeanY(meanY);
43
+
44
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
45
+ e->setPrediction(meanY);
46
+ }
47
+
48
+ // now, update F for all our experiments
49
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
50
+ {
51
+ double utility = calculateF(e->getPrediction());
52
+ e->setF(utility);
53
+ }
54
+ }
55
+
56
+ void BernoulliCalculator::updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
57
+ {
58
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
59
+ {
60
+ experiment->setZ(experiment->getY() - experiment->getPrediction());
61
+ }
62
+ }
63
+
64
+ double BernoulliCalculator::computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
65
+ {
66
+ // TV - I don't know much about this maths except that it's the standard for bernoulli (logit)
67
+ double numerator = 0.0, denominator = 0.0;
68
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
69
+ {
70
+ numerator += experiment->getWeight() * experiment->getZ();
71
+ double p = experiment->getPrediction();
72
+
73
+ denominator += experiment->getWeight() * p * (1.0 - p);
74
+ }
75
+
76
+ return numerator / denominator;
77
+ }
78
+
79
+ void BernoulliCalculator::updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
80
+ {
81
+ BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
82
+ {
83
+ experiment->setPrediction(calculatePrediction(experiment->getF()));
84
+ }
85
+ }
86
+
87
+ double BernoulliCalculator::calculatePrediction(double f)
88
+ {
89
+ return 1.0 / (1.0 + exp(-f));
90
+ }
91
+
92
+ double BernoulliCalculator::calculateF(double prediction)
93
+ {
94
+ return log(prediction / (1.0 - prediction));
95
+ }