ml4r 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
- data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
- data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
- data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
- data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
- data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
- data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
- data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
- data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
- data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
- data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
- data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
- data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
- data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
- data/ext/ml4r/ml4r.cpp +34 -0
- data/ext/ml4r/ml4r_wrap.cpp +15727 -0
- data/ext/ml4r/utils/MathUtils.cpp +204 -0
- data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
- data/ext/ml4r/utils/Utils.cpp +14 -0
- data/ext/ml4r/utils/VlcMessage.cpp +3 -0
- metadata +33 -1
@@ -0,0 +1,142 @@
|
|
1
|
+
#include "MachineLearning/DecisionTree/SplitDefinition.h"
|
2
|
+
#include <stdexcept>
|
3
|
+
using std::runtime_error;
|
4
|
+
|
5
|
+
// Categorical-feature constructor.
//
// Records a proposed split of `nodeToSplit` on categorical feature
// `featureIndex`: categories in `lhsCategories` go to the left child and
// those in `rhsCategories` to the right.  The remaining arguments are the
// per-partition aggregates (sum of working response Z, sum of weights,
// experiment count) for the left, right and missing-value partitions,
// plus the split's improvement score.
SplitDefinition::SplitDefinition( shared_ptr<DecisionTreeNode> nodeToSplit,
                                  int featureIndex,
                                  set<double>& lhsCategories,
                                  set<double>& rhsCategories,
                                  double lhsSumZ, double lhsSumW, int lhsCount,
                                  double rhsSumZ, double rhsSumW, int rhsCount,
                                  double missingSumZ, double missingSumW, int missingCount,
                                  double improvement)
    : m_nodeToSplit(nodeToSplit),
      m_splitFeatureIndex(featureIndex),
      m_lhsCategories(lhsCategories),
      m_rhsCategories(rhsCategories),
      m_lhsSumZ(lhsSumZ), m_lhsSumW(lhsSumW), m_lhsCount(lhsCount),
      m_rhsSumZ(rhsSumZ), m_rhsSumW(rhsSumW), m_rhsCount(rhsCount),
      m_missingSumZ(missingSumZ), m_missingSumW(missingSumW), m_missingCount(missingCount),
      m_improvement(improvement)
{
    // This overload is only for categorical features; the continuous split
    // value gets a sentinel so accidental use is easier to spot.
    m_splitValue = -1;
    m_featureIsCategorical = true;
}
|
28
|
+
|
29
|
+
// Continuous-feature constructor.
//
// Records a proposed split of `nodeToSplit` on continuous feature
// `featureIndex` at threshold `splitValue`.  The category sets are left
// default-constructed (empty) since they are meaningless for a continuous
// split.  The remaining arguments are the per-partition aggregates
// (sum of Z, sum of weights, experiment count) for the left, right and
// missing-value partitions, plus the split's improvement score.
SplitDefinition::SplitDefinition( shared_ptr<DecisionTreeNode> nodeToSplit,
                                  int featureIndex,
                                  double splitValue,
                                  double lhsSumZ, double lhsSumW, int lhsCount,
                                  double rhsSumZ, double rhsSumW, int rhsCount,
                                  double missingSumZ, double missingSumW, int missingCount,
                                  double improvement)
    : m_nodeToSplit(nodeToSplit),
      m_splitFeatureIndex(featureIndex),
      m_splitValue(splitValue),
      m_lhsSumZ(lhsSumZ), m_lhsSumW(lhsSumW), m_lhsCount(lhsCount),
      m_rhsSumZ(rhsSumZ), m_rhsSumW(rhsSumW), m_rhsCount(rhsCount),
      m_missingSumZ(missingSumZ), m_missingSumW(missingSumW), m_missingCount(missingCount),
      m_improvement(improvement)
{
    m_featureIsCategorical = false;
}
|
50
|
+
|
51
|
+
// Nothing to release explicitly: all members clean up via their own
// destructors (the node is held through a shared_ptr).
SplitDefinition::~SplitDefinition()
{
}
|
55
|
+
|
56
|
+
double SplitDefinition::getImprovement()
|
57
|
+
{
|
58
|
+
return m_improvement;
|
59
|
+
}
|
60
|
+
|
61
|
+
shared_ptr<DecisionTreeNode> SplitDefinition::getNodeToSplit()
|
62
|
+
{
|
63
|
+
return m_nodeToSplit;
|
64
|
+
}
|
65
|
+
|
66
|
+
int SplitDefinition::getFeatureIndex()
|
67
|
+
{
|
68
|
+
return m_splitFeatureIndex;
|
69
|
+
}
|
70
|
+
|
71
|
+
/// Categories routed to the left-hand child.
///
/// Throws if the set is empty while there are no missing-value experiments
/// that could account for the emptiness (i.e. the definition is unusable).
set<double>& SplitDefinition::getLhsCategories()
{
    bool unusable = m_lhsCategories.empty() && m_missingCount == 0;
    if (unusable)
        throw std::runtime_error("LhsCategories are empty!");

    return m_lhsCategories;
}

/// Categories routed to the right-hand child.
///
/// Same emptiness check as getLhsCategories, on the right-hand set.
set<double>& SplitDefinition::getRhsCategories()
{
    bool unusable = m_rhsCategories.empty() && m_missingCount == 0;
    if (unusable)
        throw std::runtime_error("RhsCategories are empty!");

    return m_rhsCategories;
}
|
86
|
+
|
87
|
+
double SplitDefinition::getSplitValue()
|
88
|
+
{
|
89
|
+
return m_splitValue;
|
90
|
+
}
|
91
|
+
|
92
|
+
int SplitDefinition::getLhsExperimentCount()
|
93
|
+
{
|
94
|
+
return m_lhsCount;
|
95
|
+
}
|
96
|
+
|
97
|
+
bool SplitDefinition::isCategorical()
|
98
|
+
{
|
99
|
+
return m_featureIsCategorical;
|
100
|
+
}
|
101
|
+
|
102
|
+
int SplitDefinition::getRhsExperimentCount()
|
103
|
+
{
|
104
|
+
return m_rhsCount;
|
105
|
+
}
|
106
|
+
|
107
|
+
int SplitDefinition::getMissingExperimentCount()
|
108
|
+
{
|
109
|
+
return m_missingCount;
|
110
|
+
}
|
111
|
+
|
112
|
+
double SplitDefinition::getLhsSumZ()
|
113
|
+
{
|
114
|
+
return m_lhsSumZ;
|
115
|
+
}
|
116
|
+
|
117
|
+
double SplitDefinition::getLhsSumW()
|
118
|
+
{
|
119
|
+
return m_lhsSumW;
|
120
|
+
}
|
121
|
+
|
122
|
+
double SplitDefinition::getRhsSumZ()
|
123
|
+
{
|
124
|
+
return m_rhsSumZ;
|
125
|
+
}
|
126
|
+
|
127
|
+
double SplitDefinition::getRhsSumW()
|
128
|
+
{
|
129
|
+
return m_rhsSumW;
|
130
|
+
}
|
131
|
+
|
132
|
+
double SplitDefinition::getMissingSumZ()
|
133
|
+
{
|
134
|
+
return m_missingSumZ;
|
135
|
+
}
|
136
|
+
|
137
|
+
double SplitDefinition::getMissingSumW()
|
138
|
+
{
|
139
|
+
return m_missingSumW;
|
140
|
+
}
|
141
|
+
|
142
|
+
|
@@ -0,0 +1,95 @@
|
|
1
|
+
#include "MachineLearning/GBM/BernoulliCalculator.h"
|
2
|
+
#include "MachineLearning/DecisionTree/DecisionTreeExperiment.h"
|
3
|
+
|
4
|
+
#include <cmath>
|
5
|
+
#include <boost/foreach.hpp>
|
6
|
+
|
7
|
+
// Stateless calculator: nothing to initialise or tear down.
BernoulliCalculator::BernoulliCalculator()
{
}

BernoulliCalculator::~BernoulliCalculator()
{
}
|
12
|
+
|
13
|
+
// Weighted Bernoulli deviance over the sample:
//   -2 * sum_i w_i * (y_i * f_i - log(1 + exp(f_i))) / sum_i w_i
//
// The log-partition term log(1 + exp(f)) is evaluated in a numerically
// stable form: the original expression overflows exp(f) to +inf for large
// positive f (roughly f > 709 in double precision), turning the whole
// deviance into inf/NaN.  Rewriting as f + log(1 + exp(-f)) for f > 0
// keeps the exponent non-positive, so exp() can never overflow.
double BernoulliCalculator::calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    double sumL = 0.0;
    double sumW = 0.0;

    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
    {
        double f = e->getF();
        // Stable log(1 + exp(f)); both branches are algebraically equal.
        double logOnePlusExpF = (f > 0.0) ? f + log(1.0 + exp(-f))
                                          : log(1.0 + exp(f));
        sumL += e->getWeight() * (e->getY() * f - logOnePlusExpF);
        sumW += e->getWeight();
    }

    // An empty (or zero-total-weight) sample would otherwise yield 0/0 = NaN.
    if (sumW == 0.0)
        return 0.0;

    return -2.0 * sumL / sumW;
}
|
26
|
+
|
27
|
+
// Initialises each experiment's F (log-odds utility).
//
// When useInitialPredictions is false, every experiment's prediction is
// first reset to the weighted mean response of the sample; otherwise the
// predictions already stored on the experiments are kept.  In both cases
// F is then derived from the prediction via the logit transform
// (calculateF).
void BernoulliCalculator::populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions)
{
    if (!useInitialPredictions)
    {
        // Weighted mean of Y over the whole sample.
        double weightedSumY = 0.0;
        double totalWeight  = 0.0;

        BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
        {
            weightedSumY += experiment->getY() * experiment->getWeight();
            totalWeight  += experiment->getWeight();
        }

        double meanY = weightedSumY / totalWeight;

        // the output object needs to know this value for applying to new experiments
        // m_output->setMeanY(meanY);

        BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
            experiment->setPrediction(meanY);
    }

    // Recompute F from each experiment's (possibly just reset) prediction.
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        double logOdds = calculateF(experiment->getPrediction());
        experiment->setF(logOdds);
    }
}
|
55
|
+
|
56
|
+
// Refreshes the working response Z for every experiment: the residual
// between the observed outcome Y and the current predicted probability.
void BernoulliCalculator::updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
    {
        double residual = e->getY() - e->getPrediction();
        e->setZ(residual);
    }
}
|
63
|
+
|
64
|
+
// One Newton-step increment for F over a group of experiments:
//
//   sum_i(w_i * z_i) / sum_i(w_i * p_i * (1 - p_i))
//
// the standard terminal-node update for the Bernoulli (logit) loss.
// TV - I don't know much about this maths except that it's the standard for bernoulli (logit)
double BernoulliCalculator::computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    double numerator = 0.0, denominator = 0.0;
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& experiment, experiments)
    {
        numerator += experiment->getWeight() * experiment->getZ();
        double p = experiment->getPrediction();

        denominator += experiment->getWeight() * p * (1.0 - p);
    }

    // If every prediction sits exactly at 0 or 1 (or the group is empty /
    // zero-weight) the curvature term vanishes; return no increment instead
    // of propagating NaN/infinity into the model's F values.
    if (denominator == 0.0)
        return 0.0;

    return numerator / denominator;
}
|
78
|
+
|
79
|
+
// Pushes each experiment's current F through the logistic transform and
// stores the result as its predicted probability.
void BernoulliCalculator::updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments)
{
    BOOST_FOREACH(shared_ptr<DecisionTreeExperiment>& e, experiments)
    {
        double probability = calculatePrediction(e->getF());
        e->setPrediction(probability);
    }
}
|
86
|
+
|
87
|
+
// Logistic (inverse-logit) transform: maps log-odds f to a probability
// in (0, 1).  Inverse of calculateF.
double BernoulliCalculator::calculatePrediction(double f)
{
    double expNegF = exp(-f);
    return 1.0 / (1.0 + expNegF);
}
|
91
|
+
|
92
|
+
// Logit transform: maps a probability to log-odds.  Inverse of
// calculatePrediction.  Note: produces +/-infinity (or NaN) when the
// prediction sits exactly at 1 or 0.
double BernoulliCalculator::calculateF(double prediction)
{
    double odds = prediction / (1.0 - prediction);
    return log(odds);
}
|