ml4r 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
- data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
- data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
- data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
- data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
- data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
- data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
- data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
- data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
- data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
- data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
- data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
- data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
- data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
- data/ext/ml4r/ml4r.cpp +34 -0
- data/ext/ml4r/ml4r_wrap.cpp +15727 -0
- data/ext/ml4r/utils/MathUtils.cpp +204 -0
- data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
- data/ext/ml4r/utils/Utils.cpp +14 -0
- data/ext/ml4r/utils/VlcMessage.cpp +3 -0
- metadata +33 -1
@@ -0,0 +1,204 @@
|
|
1
|
+
#include "MathUtils.h"
#include "Utils.h"

#include <stdlib.h>

#include <cmath>
#include <limits>
#include <map>
#include <stdexcept>

#include <boost/foreach.hpp>
#include <boost/lexical_cast.hpp>

using std::map;
using std::runtime_error;
|
11
|
+
|
12
|
+
// Validates the dimensions of the linear system A.x = b:
// b must have as many rows as A, and A must be square.
// Throws std::runtime_error when either condition fails.
void MathUtils::checkSystemDimensions(vector<vector<double> >& a, vector<double>& b)
{
    if (b.size() != a.size())
        throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations]: A and b must have the same number of rows");

    unsigned long n = a.size();
    BOOST_FOREACH(vector<double>& row, a)
    {
        // BUG FIX: the original constructed the runtime_error without the
        // `throw` keyword, so a non-square matrix silently passed validation.
        if (row.size() != n)
            throw runtime_error("[MathUtils::solveSystemOfLinearEquations]: A must be a square matrix");
    }
}
|
21
|
+
|
22
|
+
// Solves the linear system A.x = b, where A is an n x n matrix and
// x, b are n x 1 vectors. A and b are taken by value because Gaussian
// elimination modifies them in place.
// Throws std::runtime_error on empty input, dimension mismatch, or when
// the system has no unique solution (zero on the diagonal after elimination).
vector<double> MathUtils::solveSystemOfLinearEquations(vector<vector<double> > a, vector<double> b)
{
    if (!a.size()) throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] called with empty input");

    MathUtils::checkSystemDimensions(a, b);
    unsigned long n = a.size();

    // Forward pass: zero out everything below the diagonal, one column at a time.
    for (unsigned long pivot = 0; pivot + 1 < n; ++pivot)
        MathUtils::eliminate(a, b, pivot);

    // Backward pass: back-substitute from the last row up.
    vector<double> solution(n);
    for (long row = static_cast<long>(n) - 1; row >= 0; --row)
    {
        // accumulate the contribution of the unknowns already solved
        double knownTerms = 0.0;
        for (unsigned long col = row + 1; col < n; ++col)
            knownTerms += a.at(row).at(col) * solution.at(col);

        double remainder = b.at(row) - knownTerms;
        double pivotValue = a.at(row).at(row);
        if (!pivotValue)
            throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] Cannot divide by zero ==> no unique solution. Is system identified? Have you included too many variables?");

        solution.at(row) = remainder / pivotValue;
    }
    return solution;
}
|
61
|
+
|
62
|
+
// void MathUtils::eliminate(TOtMatrix& a, TOtMatrix& b, int index)
|
63
|
+
// One step of Gaussian elimination: zeroes out column `index` in every
// row below row `index`. If the pivot a[index][index] is zero, a lower
// row with a non-zero entry in that column is added to the pivot row
// first; throws std::runtime_error when no such row exists.
void MathUtils::eliminate(vector<vector<double> >& a, vector<double>& b, unsigned long index)
{
    unsigned long n = a.size();

    if (a.at(index).at(index) == 0)
    {
        // Repair the zero pivot by adding a lower row whose entry in this
        // column is non-zero (addition, rather than a swap, keeps earlier
        // columns zeroed since those entries are already eliminated).
        bool repaired = false;
        for (unsigned long donor = index + 1; donor < n; ++donor)
        {
            if (a.at(donor).at(index) == 0)
                continue;

            for (unsigned long column = index; column < n; ++column)
                a.at(index).at(column) += a.at(donor).at(column);
            b.at(index) += b.at(donor);

            repaired = true;
            break;
        }
        if (!repaired)
            throw std::runtime_error("Could not eliminate on index " + boost::lexical_cast<std::string>(index));
    }

    // Subtract the appropriate multiple of the pivot row from each row below.
    for (unsigned long row = index + 1; row < n; ++row)
    {
        double factor = a.at(row).at(index) / a.at(index).at(index);
        if (factor == 0)
            continue; // already zero in this column

        for (unsigned long column = index; column < n; ++column)
            a.at(row).at(column) -= factor * a.at(index).at(column);
        b.at(row) -= factor * b.at(index);
    }
}
|
102
|
+
|
103
|
+
// Identifies rows which need to be removed to achieve full row rank.
// Rows which can be fully eliminated (as a linear combination of other
// rows) are what we're looking for. The matrix is modified in place by
// the elimination. Returns the indices of the dependent rows.
// Throws std::runtime_error when the matrix is not rectangular.
vector<int> MathUtils::identifyLinearlyDependentMatrixRows(vector<vector<double> >& matrix)
{
    // BUG FIX: guard empty input — the original dereferenced
    // matrix.front() unconditionally.
    if (matrix.empty())
        return vector<int>();

    unsigned long rows = matrix.size();
    unsigned long cols = matrix.front().size();

    BOOST_FOREACH(vector<double>& row, matrix)
    {
        if (row.size() != cols)
            throw runtime_error("[MathUtils::identifyLinearlyDependentMatrixRows] Matrix is not rectangular");
    }

    unsigned long numberOfColumnsEliminated = 0;
    map<int, int> rowHasBeenUsed; // row index -> 1 once used as a pivot row

    // BUG FIX: the original initialised an *unsigned* column counter to -1
    // (i.e. ULONG_MAX), making the while-condition false on entry, so the
    // elimination loop never ran at all. Iterate columns 0..cols-1 directly.
    for (unsigned long columnToEliminateWith = 0;
         columnToEliminateWith < cols && numberOfColumnsEliminated + 1 < rows;
         ++columnToEliminateWith)
    {
        // Find an unused row with a usable (clearly non-zero) pivot in this
        // column.
        // NOTE(review): only *positive* pivot values are ever selected —
        // presumably the caller's matrices are non-negative; confirm.
        unsigned long rowToEliminateWith = 0;
        double eliminationCellValue = 0.0;
        for (; rowToEliminateWith < rows; ++rowToEliminateWith)
        {
            if (rowHasBeenUsed[rowToEliminateWith] > 0)
                continue;

            eliminationCellValue = matrix.at(rowToEliminateWith).at(columnToEliminateWith);

            if (eliminationCellValue > 0.001)
                break;
            else if (eliminationCellValue > 0)
            {
                // first check it's not TINY and we're dealing with precision issues
                double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(rowToEliminateWith)));
                if (rowAbsSum > 0.01)
                    break; // if it's less than 0.001 it may as well be zero, so it's linearly dependent
            }
        }

        if (rowToEliminateWith == rows)
            continue; // didn't find a non-zero value in the column to eliminate, so keep going!

        // Eliminate this column from every other not-yet-used row.
        for (unsigned long rowToEliminate = 0; rowToEliminate < rows; ++rowToEliminate)
        {
            if (rowToEliminate == rowToEliminateWith || rowHasBeenUsed[rowToEliminate] > 0)
                continue;

            double cellValueToEliminate = matrix.at(rowToEliminate).at(columnToEliminateWith);
            if (cellValueToEliminate == 0.0)
                continue; // nothing to do!

            double ratio = cellValueToEliminate / eliminationCellValue;
            for (unsigned long columnToEliminate = columnToEliminateWith; columnToEliminate < cols; ++columnToEliminate)
                matrix.at(rowToEliminate).at(columnToEliminate) -= ratio * matrix.at(rowToEliminateWith).at(columnToEliminate);
        }
        rowHasBeenUsed[rowToEliminateWith] = 1;
        numberOfColumnsEliminated += 1;
    }

    // Rows reduced to (essentially) zero are linear combinations of the others.
    vector<int> linearlyDependentRows;
    for (unsigned long row = 0; row < rows; ++row)
    {
        double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(row)));
        if (rowAbsSum < 0.001)
            linearlyDependentRows.push_back(row);
    }

    return linearlyDependentRows;
}
|
197
|
+
|
198
|
+
// Maps a uniform draw on (0, 1) onto an extreme-value variate via the
// inverse CDF transform -log(-log(u)).
// Throws std::runtime_error when the draw is outside the open interval
// (within double epsilon of 0 or 1), where the transform is undefined.
double MathUtils::convertRandomNumberToExtremeValue(long double random_number)
{
    const long double eps = std::numeric_limits<double>::epsilon();
    if (random_number < eps || random_number > 1.0 - eps)
        throw std::runtime_error("Can't have a random number not on the range (0.0, 1.0)");

    return -::log(-::log(random_number));
}
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#include "utils/StochasticUtils.h"
|
2
|
+
|
3
|
+
// Converts a (not necessarily normalised) PDF into a cumulative
// distribution, normalised so the final entry is exactly 1.0.
// Returns an empty vector for an empty PDF; throws runtime_error when
// the PDF sums to zero (normalisation would divide by zero).
vector<double> StochasticUtils::convertPdfToCumulativeSum(std::vector<double> pdf)
{
    // BUG FIX: accumulate in double (the original used a float accumulator
    // for a vector<double>, silently losing precision).
    double cumulativeSum = 0.0;
    vector<double> returnVal(pdf.size());
    for (unsigned int i = 0; i < pdf.size(); ++i)
    {
        cumulativeSum += pdf[i];
        returnVal[i] = cumulativeSum;
    }

    // BUG FIX: the original indexed returnVal[size()-1] even when the PDF
    // was empty (out-of-bounds), and divided by zero for a zero-sum PDF
    // (silently producing NaNs).
    if (returnVal.empty())
        return returnVal;
    if (cumulativeSum == 0.0)
        throw runtime_error("[StochasticUtils::convertPdfToCumulativeSum] PDF sums to zero");

    // normalise to unity sum
    if (cumulativeSum != 1.0)
    {
        for (unsigned int i = 0; i < pdf.size(); ++i)
        {
            returnVal[i] /= cumulativeSum;
        }
    }

    // force the last entry to exactly 1.0 so a uniform draw can never
    // overrun the CDF due to floating-point rounding
    if (returnVal[returnVal.size()-1] != 1.0)
        returnVal[returnVal.size()-1] = 1.0;

    return returnVal;
}
|
27
|
+
|
28
|
+
int StochasticUtils::chooseCategoryFromCdf( float * cumulativeProbabilities, int N )
|
29
|
+
{
|
30
|
+
float selection = rand() / (RAND_MAX_FLOAT+1.0f);
|
31
|
+
int i=0;
|
32
|
+
while ((i<N) && (selection > cumulativeProbabilities[i]))
|
33
|
+
i++;
|
34
|
+
return i;
|
35
|
+
}
|
36
|
+
|
37
|
+
// Draws a random category from a CDF: makes one uniform draw on [0, 1)
// and delegates to the deterministic overload below.
int StochasticUtils::chooseCategoryFromCdf( vector<float>& cumulativeProbabilities )
{
    float draw = rand() / (RAND_MAX_FLOAT+1.0f);
    return chooseCategoryFromCdf(draw, cumulativeProbabilities);
}
|
42
|
+
|
43
|
+
int StochasticUtils::chooseCategoryFromCdf( float qot, vector<float>& cumulativeProbabilities )
|
44
|
+
{
|
45
|
+
unsigned int i=0;
|
46
|
+
while ((i<cumulativeProbabilities.size()) && (qot > cumulativeProbabilities[i]))
|
47
|
+
i++;
|
48
|
+
return i;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Draws a random category from a PDF: obtains one uniform draw via
// getQot() and delegates to the deterministic overload below.
// categoryType is only used to label error messages.
int StochasticUtils::chooseCategoryFromPdf(vector<float>& probabilities, string categoryType)
{
    double draw = getQot();
    return chooseCategoryFromPdf(draw, probabilities, categoryType);
}
|
55
|
+
|
56
|
+
int StochasticUtils::chooseCategoryFromPdf(double qot, vector<float>& probabilities, string categoryType)
|
57
|
+
{
|
58
|
+
if (!probabilities.size())
|
59
|
+
throw runtime_error("There was a problem selecting a " + categoryType + " from an empty PDF!");
|
60
|
+
|
61
|
+
double summative_usage = 0.0;
|
62
|
+
unsigned int chosen_index = 0;
|
63
|
+
for (; chosen_index < probabilities.size(); ++chosen_index)
|
64
|
+
{
|
65
|
+
summative_usage += probabilities[chosen_index];
|
66
|
+
if (summative_usage > qot) break;
|
67
|
+
}
|
68
|
+
if (chosen_index >= probabilities.size())
|
69
|
+
{
|
70
|
+
throw runtime_error("There was a problem selecting a " + categoryType + " from the PDF!");
|
71
|
+
}
|
72
|
+
return chosen_index;
|
73
|
+
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#include "utils/Utils.h"
|
2
|
+
|
3
|
+
#include <vector>
|
4
|
+
using std::vector;
|
5
|
+
|
6
|
+
// Returns a vector of `length` pseudo-random ints drawn from rand().
// A non-positive length yields an empty vector.
vector<int> Utils::vectorOfRandomInt(int length)
{
    vector<int> returnValue;

    // BUG FIX: the original passed a possibly-negative length straight to
    // reserve(), which converts it to a huge size_t and throws/fails.
    if (length <= 0)
        return returnValue;

    returnValue.reserve(length);
    for (int i = 0; i < length; ++i)
        returnValue.push_back(rand());

    return returnValue;
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ml4r
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -24,6 +24,38 @@ files:
|
|
24
24
|
- lib/ml4r/linear_regression.rb
|
25
25
|
- lib/ml4r.rb
|
26
26
|
- lib/test_cpp_extension.rb
|
27
|
+
- ext/ml4r/LinearRegression/LinearRegression.cpp
|
28
|
+
- ext/ml4r/LinearRegression/OLSLinearRegression.cpp
|
29
|
+
- ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp
|
30
|
+
- ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp
|
31
|
+
- ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp
|
32
|
+
- ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp
|
33
|
+
- ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp
|
34
|
+
- ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp
|
35
|
+
- ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp
|
36
|
+
- ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp
|
37
|
+
- ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp
|
38
|
+
- ext/ml4r/MachineLearning/GBM/GBMOutput.cpp
|
39
|
+
- ext/ml4r/MachineLearning/GBM/GBMRunner.cpp
|
40
|
+
- ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp
|
41
|
+
- ext/ml4r/MachineLearning/MLData/MLData.cpp
|
42
|
+
- ext/ml4r/MachineLearning/MLData/MLDataFields.cpp
|
43
|
+
- ext/ml4r/MachineLearning/MLData/MLDataReader.cpp
|
44
|
+
- ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp
|
45
|
+
- ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp
|
46
|
+
- ext/ml4r/MachineLearning/MLExperiment.cpp
|
47
|
+
- ext/ml4r/MachineLearning/MLRunner.cpp
|
48
|
+
- ext/ml4r/MachineLearning/MLUtils.cpp
|
49
|
+
- ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp
|
50
|
+
- ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp
|
51
|
+
- ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp
|
52
|
+
- ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp
|
53
|
+
- ext/ml4r/ml4r.cpp
|
54
|
+
- ext/ml4r/ml4r_wrap.cpp
|
55
|
+
- ext/ml4r/utils/MathUtils.cpp
|
56
|
+
- ext/ml4r/utils/StochasticUtils.cpp
|
57
|
+
- ext/ml4r/utils/Utils.cpp
|
58
|
+
- ext/ml4r/utils/VlcMessage.cpp
|
27
59
|
- ext/ml4r/LinearRegression/LinearRegression.h
|
28
60
|
- ext/ml4r/LinearRegression/OLSLinearRegression.h
|
29
61
|
- ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h
|