ml4r 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
- data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
- data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
- data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
- data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
- data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
- data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
- data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
- data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
- data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
- data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
- data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
- data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
- data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
- data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
- data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
- data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
- data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
- data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
- data/ext/ml4r/ml4r.cpp +34 -0
- data/ext/ml4r/ml4r_wrap.cpp +15727 -0
- data/ext/ml4r/utils/MathUtils.cpp +204 -0
- data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
- data/ext/ml4r/utils/Utils.cpp +14 -0
- data/ext/ml4r/utils/VlcMessage.cpp +3 -0
- metadata +33 -1
@@ -0,0 +1,204 @@
|
|
1
|
+
#include "MathUtils.h"
|
2
|
+
#include <boost/lexical_cast.hpp>
|
3
|
+
#include <boost/foreach.hpp>
|
4
|
+
#include "Utils.h"
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <cmath>
|
7
|
+
#include <stdexcept>
|
8
|
+
#include <map>
|
9
|
+
using std::map;
|
10
|
+
using std::runtime_error;
|
11
|
+
|
12
|
+
/// Validates the dimensions of a linear system A.x = b before solving.
/// @param a candidate coefficient matrix; must be square (n x n)
/// @param b right-hand-side vector; must have exactly n entries
/// @throws std::runtime_error when b's length differs from A's row count,
///         or when any row of A has a length other than n (non-square A)
void MathUtils::checkSystemDimensions(vector<vector<double> >& a, vector<double>& b)
{
    if (b.size() != a.size())
        throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations]: A and b must have the same number of rows");
    unsigned long n = a.size();
    BOOST_FOREACH(vector<double>& row, a)
        if (row.size() != n)
            // BUG FIX: the original constructed the runtime_error but never
            // threw it, so a non-square matrix passed this check unnoticed.
            throw runtime_error("[MathUtils::solveSystemOfLinearEquations]: A must be a square matrix");
}
|
21
|
+
|
22
|
+
// Solves the linear system A.x = b by Gaussian elimination followed by
// back-substitution.
//   A is an n x n matrix; x and b are n x 1 vectors.
// Both arguments are taken by value because elimination mutates them.
// Throws std::runtime_error when called with an empty system, when the
// dimensions are inconsistent, or when a zero pivot remains after
// elimination (no unique solution).
vector<double> MathUtils::solveSystemOfLinearEquations(vector<vector<double> > a, vector<double> b)
{
    if (!a.size()) throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] called with empty input");

    MathUtils::checkSystemDimensions(a, b);
    unsigned long n = a.size();

    // Forward pass: clear everything below the diagonal, one column at a time.
    for (unsigned long pivot = 0; pivot + 1 < n; ++pivot)
        MathUtils::eliminate(a, b, pivot);

    vector<double> solution(n);

    // Back-substitution, starting from the last row and working upward.
    for (long row = n - 1; row >= 0; --row)
    {
        double knownTerms = 0.0;
        for (unsigned long col = row + 1; col < n; ++col)
            knownTerms += a.at(row).at(col) * solution.at(col);

        double residual = b.at(row) - knownTerms;
        double pivotValue = a.at(row).at(row);
        if (!pivotValue)
            throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] Cannot divide by zero ==> no unique solution. Is system identified? Have you included too many variables?");

        solution.at(row) = residual / pivotValue;
    }
    return solution;
}
|
61
|
+
|
62
|
+
// void MathUtils::eliminate(TOtMatrix& a, TOtMatrix& b, int index)
|
63
|
+
// One step of Gaussian elimination: zeroes out column `index` in every row
// below row `index`, mutating both A and b in place.
// If the pivot a[index][index] is zero, it is first repaired by adding a
// lower row with a non-zero entry in that column onto the pivot row (a row
// addition keeps the system equivalent, much like a swap would).
// Throws std::runtime_error when no lower row can repair a zero pivot.
void MathUtils::eliminate(vector<vector<double> >& a, vector<double>& b, unsigned long index)
{
    unsigned long n = a.size();

    if (a.at(index).at(index) == 0)
    {
        bool repaired = false;
        for (unsigned long donor = index + 1; donor < n; ++donor)
        {
            if (a.at(donor).at(index) == 0)
                continue;

            // Found a usable donor row: fold it into the pivot row.
            repaired = true;
            for (unsigned long col = index; col < n; ++col)
                a.at(index).at(col) += a.at(donor).at(col);
            b.at(index) += b.at(donor);
            break;
        }
        if (!repaired)
            throw std::runtime_error("Could not eliminate on index " + boost::lexical_cast<std::string>(index));
    }

    // Subtract the appropriate multiple of the pivot row from each lower row.
    for (unsigned long row = index + 1; row < n; ++row)
    {
        double factor = a.at(row).at(index) / a.at(index).at(index);
        if (factor == 0)
            continue;

        for (unsigned long col = index; col < n; ++col)
            a.at(row).at(col) -= factor * a.at(index).at(col);
        b.at(row) -= factor * b.at(index);
    }
}
|
102
|
+
|
103
|
+
/// Identifies rows which need to be removed to achieve full row rank:
/// rows that can be fully eliminated (as a linear combination of other
/// rows) are reduced to ~zero by the Gaussian sweep below and reported.
/// NOTE: the matrix is mutated in place by the elimination.
/// @param matrix rectangular matrix; mutated by this call
/// @return indices of rows whose absolute sum fell below 0.001 after
///         elimination (i.e. linearly dependent rows)
/// @throws std::runtime_error when the matrix is not rectangular
vector<int> MathUtils::identifyLinearlyDependentMatrixRows(vector<vector<double> >& matrix)
{
    // ROBUSTNESS FIX: an empty matrix previously hit undefined behaviour on
    // matrix.front(); there are trivially no dependent rows to report.
    if (matrix.empty())
        return vector<int>();

    unsigned long rows = matrix.size();
    unsigned long cols = matrix.front().size();

    BOOST_FOREACH(vector<double>& row, matrix)
    {
        if (row.size() != cols)
            throw runtime_error("[MathUtils::identifyLinearlyDependentMatrixRows] Matrix is not rectangular");
    }

    // (A leftover debug loop that computed each row's absolute sum and
    // discarded the result has been removed - it had no effect.)

    unsigned long numberOfColumnsEliminated = 0;
    map<int, int> rowHasBeenUsed;

    // Deliberate unsigned wrap-around: -1 becomes ULONG_MAX, and the
    // "+= 1" at the top of the loop brings it back to column 0.
    unsigned long columnToEliminateWith = -1;
    while (numberOfColumnsEliminated < rows - 1 && columnToEliminateWith < cols - 1)
    {
        columnToEliminateWith += 1;

        // Find an unused row with a usable entry in this column.
        // NOTE(review): entries with large NEGATIVE values are skipped by
        // these > comparisons - this appears to assume non-negative pivot
        // candidates; confirm against callers before changing to fabs().
        unsigned long rowToEliminateWith = -1;
        double eliminationCellValue;
        for (rowToEliminateWith = 0; rowToEliminateWith < rows; ++rowToEliminateWith)
        {
            if (rowHasBeenUsed[rowToEliminateWith] > 0)
                continue;

            eliminationCellValue = matrix.at(rowToEliminateWith).at(columnToEliminateWith);

            if (eliminationCellValue > 0.001)
                break;
            else if (eliminationCellValue > 0)
            {
                // first check it's not TINY and we're dealing with precision issues
                double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(rowToEliminateWith)));
                if (rowAbsSum > 0.01)
                    break; // if it's less than 0.001 it may as well be zero, so it's linearly dependent
            }
        }

        if (rowToEliminateWith == rows)
            continue; // didn't find a non-zero value in the column to eliminate, so keep going!

        // Use the chosen pivot row to wipe this column out of every other
        // not-yet-used row.
        for (unsigned long rowToEliminate = 0; rowToEliminate < rows; ++rowToEliminate)
        {
            if (rowToEliminate == rowToEliminateWith || rowHasBeenUsed[rowToEliminate] > 0)
                continue;

            double cellValueToEliminate = matrix.at(rowToEliminate).at(columnToEliminateWith);
            if (cellValueToEliminate == 0.0)
                continue; // nothing to do!

            double ratio = cellValueToEliminate / eliminationCellValue;

            for (unsigned long columnToEliminate = columnToEliminateWith; columnToEliminate < cols; ++columnToEliminate)
            {
                double increment = ratio * matrix.at(rowToEliminateWith).at(columnToEliminate);
                matrix.at(rowToEliminate).at(columnToEliminate) -= increment;
            }
        }
        rowHasBeenUsed[rowToEliminateWith] = 1;
        numberOfColumnsEliminated += 1;
    }

    vector<int> linearlyDependentRows;

    // Any row whose absolute sum is now ~zero was a linear combination of
    // the pivot rows.
    for (unsigned long row = 0; row < rows; ++row)
    {
        double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(row)));
        if (rowAbsSum < 0.001)
            linearlyDependentRows.push_back(row);
    }

    return linearlyDependentRows;
}
|
197
|
+
|
198
|
+
// Maps a uniform random draw on the open interval (0, 1) through the
// double-log transform -log(-log(u)).
// Throws std::runtime_error when the draw is within machine epsilon of 0
// or 1, where the transform is undefined.
double MathUtils::convertRandomNumberToExtremeValue(long double random_number)
{
    bool tooSmall = random_number < std::numeric_limits<double>::epsilon();
    bool tooLarge = random_number > 1.0 - std::numeric_limits<double>::epsilon();
    if (tooSmall || tooLarge)
        throw std::runtime_error("Can't have a random number not on the range (0.0, 1.0)");

    return -::log(-::log(random_number));
}
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#include "utils/StochasticUtils.h"
|
2
|
+
|
3
|
+
/// Converts a pdf into its cumulative sum, normalised to unity, with the
/// final entry forced to exactly 1.0 so CDF-based sampling always
/// terminates within the table.
/// @param pdf non-negative probability masses (need not sum to 1)
/// @return cumulative sums scaled so the last entry is 1.0; empty in => empty out
vector<double> StochasticUtils::convertPdfToCumulativeSum(std::vector<double> pdf)
{
    vector<double> returnVal(pdf.size());

    // ROBUSTNESS FIX: an empty pdf previously reached
    // returnVal[returnVal.size()-1] below, wrapping size()-1 to a huge
    // unsigned index (out-of-bounds write).
    if (pdf.empty())
        return returnVal;

    // BUG FIX: the accumulator was a float, silently truncating double
    // probabilities; it is now a double.
    double cumulativeSum = 0;
    for (unsigned int i = 0; i < pdf.size(); ++i)
    {
        cumulativeSum += pdf[i];
        returnVal[i] = cumulativeSum;
    }

    // normalise to unity sum
    // (a zero-sum pdf is skipped: dividing by zero would fill the table
    // with NaNs rather than anything usable)
    if (cumulativeSum != 1.0 && cumulativeSum != 0.0)
    {
        for (unsigned int i = 0; i < pdf.size(); ++i)
        {
            returnVal[i] /= cumulativeSum;
        }
    }

    // guarantee the last entry is exactly 1.0 despite rounding
    if (returnVal[returnVal.size() - 1] != 1.0)
        returnVal[returnVal.size() - 1] = 1.0;

    return returnVal;
}
|
27
|
+
|
28
|
+
int StochasticUtils::chooseCategoryFromCdf( float * cumulativeProbabilities, int N )
|
29
|
+
{
|
30
|
+
float selection = rand() / (RAND_MAX_FLOAT+1.0f);
|
31
|
+
int i=0;
|
32
|
+
while ((i<N) && (selection > cumulativeProbabilities[i]))
|
33
|
+
i++;
|
34
|
+
return i;
|
35
|
+
}
|
36
|
+
|
37
|
+
// Convenience overload: draws its own uniform random number and delegates
// to the (draw, cdf) overload.
int StochasticUtils::chooseCategoryFromCdf( vector<float>& cumulativeProbabilities )
{
    float draw = rand() / (RAND_MAX_FLOAT+1.0f);
    return chooseCategoryFromCdf(draw, cumulativeProbabilities);
}
|
42
|
+
|
43
|
+
int StochasticUtils::chooseCategoryFromCdf( float qot, vector<float>& cumulativeProbabilities )
|
44
|
+
{
|
45
|
+
unsigned int i=0;
|
46
|
+
while ((i<cumulativeProbabilities.size()) && (qot > cumulativeProbabilities[i]))
|
47
|
+
i++;
|
48
|
+
return i;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Convenience overload: obtains a quantile from getQot() and delegates to
// the (qot, pdf, categoryType) overload.
int StochasticUtils::chooseCategoryFromPdf(vector<float>& probabilities, string categoryType)
{
    double quantile = getQot();
    return chooseCategoryFromPdf(quantile, probabilities, categoryType);
}
|
55
|
+
|
56
|
+
int StochasticUtils::chooseCategoryFromPdf(double qot, vector<float>& probabilities, string categoryType)
|
57
|
+
{
|
58
|
+
if (!probabilities.size())
|
59
|
+
throw runtime_error("There was a problem selecting a " + categoryType + " from an empty PDF!");
|
60
|
+
|
61
|
+
double summative_usage = 0.0;
|
62
|
+
unsigned int chosen_index = 0;
|
63
|
+
for (; chosen_index < probabilities.size(); ++chosen_index)
|
64
|
+
{
|
65
|
+
summative_usage += probabilities[chosen_index];
|
66
|
+
if (summative_usage > qot) break;
|
67
|
+
}
|
68
|
+
if (chosen_index >= probabilities.size())
|
69
|
+
{
|
70
|
+
throw runtime_error("There was a problem selecting a " + categoryType + " from the PDF!");
|
71
|
+
}
|
72
|
+
return chosen_index;
|
73
|
+
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#include "utils/Utils.h"
|
2
|
+
|
3
|
+
#include <vector>
|
4
|
+
using std::vector;
|
5
|
+
|
6
|
+
// Produces `length` values from successive rand() calls, in call order.
// @param length number of random integers to generate
// @return vector of raw rand() outputs
vector<int> Utils::vectorOfRandomInt(int length)
{
    vector<int> result;
    result.reserve(length);
    while ((int)result.size() < length)
        result.push_back(rand());

    return result;
}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ml4r
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -24,6 +24,38 @@ files:
|
|
24
24
|
- lib/ml4r/linear_regression.rb
|
25
25
|
- lib/ml4r.rb
|
26
26
|
- lib/test_cpp_extension.rb
|
27
|
+
- ext/ml4r/LinearRegression/LinearRegression.cpp
|
28
|
+
- ext/ml4r/LinearRegression/OLSLinearRegression.cpp
|
29
|
+
- ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp
|
30
|
+
- ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp
|
31
|
+
- ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp
|
32
|
+
- ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp
|
33
|
+
- ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp
|
34
|
+
- ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp
|
35
|
+
- ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp
|
36
|
+
- ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp
|
37
|
+
- ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp
|
38
|
+
- ext/ml4r/MachineLearning/GBM/GBMOutput.cpp
|
39
|
+
- ext/ml4r/MachineLearning/GBM/GBMRunner.cpp
|
40
|
+
- ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp
|
41
|
+
- ext/ml4r/MachineLearning/MLData/MLData.cpp
|
42
|
+
- ext/ml4r/MachineLearning/MLData/MLDataFields.cpp
|
43
|
+
- ext/ml4r/MachineLearning/MLData/MLDataReader.cpp
|
44
|
+
- ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp
|
45
|
+
- ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp
|
46
|
+
- ext/ml4r/MachineLearning/MLExperiment.cpp
|
47
|
+
- ext/ml4r/MachineLearning/MLRunner.cpp
|
48
|
+
- ext/ml4r/MachineLearning/MLUtils.cpp
|
49
|
+
- ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp
|
50
|
+
- ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp
|
51
|
+
- ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp
|
52
|
+
- ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp
|
53
|
+
- ext/ml4r/ml4r.cpp
|
54
|
+
- ext/ml4r/ml4r_wrap.cpp
|
55
|
+
- ext/ml4r/utils/MathUtils.cpp
|
56
|
+
- ext/ml4r/utils/StochasticUtils.cpp
|
57
|
+
- ext/ml4r/utils/Utils.cpp
|
58
|
+
- ext/ml4r/utils/VlcMessage.cpp
|
27
59
|
- ext/ml4r/LinearRegression/LinearRegression.h
|
28
60
|
- ext/ml4r/LinearRegression/OLSLinearRegression.h
|
29
61
|
- ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h
|