ml4r 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
  9. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
  10. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
  11. data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
  13. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
  14. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
  15. data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
  16. data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
  17. data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
  18. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
  19. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
  20. data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
  21. data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
  22. data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
  23. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
  24. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
  25. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
  26. data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
  27. data/ext/ml4r/ml4r.cpp +34 -0
  28. data/ext/ml4r/ml4r_wrap.cpp +15727 -0
  29. data/ext/ml4r/utils/MathUtils.cpp +204 -0
  30. data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
  31. data/ext/ml4r/utils/Utils.cpp +14 -0
  32. data/ext/ml4r/utils/VlcMessage.cpp +3 -0
  33. metadata +33 -1
@@ -0,0 +1,204 @@
1
+ #include "MathUtils.h"
2
+ #include <boost/lexical_cast.hpp>
3
+ #include <boost/foreach.hpp>
4
+ #include "Utils.h"
5
+ #include <stdlib.h>
6
+ #include <cmath>
7
+ #include <stdexcept>
8
+ #include <map>
9
+ using std::map;
10
+ using std::runtime_error;
11
+
12
+ void MathUtils::checkSystemDimensions(vector<vector<double> >& a, vector<double>& b)
13
+ {
14
+ if (b.size() != a.size())
15
+ throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations]: A and b must have the same number of rows");
16
+ unsigned long n = a.size();
17
+ BOOST_FOREACH(vector<double>& row, a)
18
+ if (row.size() != n)
19
+ runtime_error("[MathUtils::solveSystemOfLinearEquations]: A must be a square matrix");
20
+ }
21
+
22
+ vector<double> MathUtils::solveSystemOfLinearEquations(vector<vector<double> > a, vector<double> b)
23
+ {
24
+ if (!a.size()) throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] called with empty input");
25
+
26
+ // used to solve
27
+ // A.x = b
28
+ // where:
29
+ // A is an n x n matrix
30
+ // x and b are n x 1 vectors
31
+ MathUtils::checkSystemDimensions(a,b);
32
+ unsigned long n = a.size();
33
+
34
+ // perform Gaussian elimination
35
+ for (unsigned long i = 0; i < n - 1; ++i)
36
+ {
37
+ // eliminate everything in column i, below row i
38
+ MathUtils::eliminate(a, b, i);
39
+ }
40
+
41
+ vector<double> x(n);
42
+
43
+ // now solve, from the bottom up
44
+ for (long index = n - 1; index >= 0; --index)
45
+ {
46
+ double otherVarSum = 0.0;
47
+ for (unsigned long column = index +1; column < n; ++column)
48
+ {
49
+ otherVarSum += a.at(index).at(column) * x.at(column);
50
+ }
51
+ double val = b.at(index) - otherVarSum;
52
+ double denominator = a.at(index).at(index);
53
+ if (!denominator)
54
+ throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] Cannot divide by zero ==> no unique solution. Is system identified? Have you included too many variables?");
55
+
56
+ double coefficient = val / denominator;
57
+ x.at(index) = coefficient;
58
+ }
59
+ return x;
60
+ }
61
+
62
+ // void MathUtils::eliminate(TOtMatrix& a, TOtMatrix& b, int index)
63
+ void MathUtils::eliminate(vector<vector<double> >& a, vector<double>& b, unsigned long index)
64
+ {
65
+ unsigned long n = a.size();
66
+ if (a.at(index).at(index) == 0)
67
+ {
68
+ // find a non-zero value in the rows below.
69
+ bool found = false;
70
+ for (unsigned long row = index + 1; row < n; ++row)
71
+ {
72
+ if (a.at(row).at(index) != 0)
73
+ {
74
+ // found a non-zero value
75
+ found = true;
76
+ for (unsigned long column = index; column < n; ++column)
77
+ {
78
+ a.at(index).at(column) = a.at(index).at(column) + a.at(row).at(column);
79
+ }
80
+ b.at(index) = b.at(index) + b.at(row);
81
+ break;
82
+ }
83
+ }
84
+ if (!found)
85
+ throw std::runtime_error("Could not eliminate on index " + boost::lexical_cast<std::string>(index));
86
+ }
87
+
88
+ // now eliminate in all the rows below index
89
+ for (unsigned long row = index + 1; row < n; ++row)
90
+ {
91
+ double factor = a.at(row).at(index) / a.at(index).at(index);
92
+ if (factor != 0)
93
+ {
94
+ for (unsigned long column = index; column < n; ++column)
95
+ {
96
+ a.at(row).at(column) = a.at(row).at(column) - factor * a.at(index).at(column);
97
+ }
98
+ b.at(row) = b.at(row) - factor * b.at(index);
99
+ }
100
+ }
101
+ }
102
+
103
+ vector<int> MathUtils::identifyLinearlyDependentMatrixRows(vector<vector<double> >& matrix)
104
+ {
105
+ // this method identifies rows which need to be removed to achieve full row rank.
106
+ // rows which can be fully eliminated (as a linear combination of other rows) are what we're looking for.
107
+
108
+ unsigned long rows = matrix.size();
109
+ unsigned long cols = matrix.front().size();
110
+ // vlcMessage.Write(string("We have " + ToString(rows) + " rows.").c_str());
111
+ // vlcMessage.Write(string("We have " + ToString(cols) + " cols.").c_str());
112
+
113
+ BOOST_FOREACH(vector<double>& row, matrix)
114
+ {
115
+ if (row.size() != cols)
116
+ throw runtime_error("[MathUtils::identifyLinearlyDependentMatrixRows] Matrix is not rectangular");
117
+ }
118
+
119
+ unsigned long numberOfColumnsEliminated = 0;
120
+ map<int, int> rowHasBeenUsed;
121
+
122
+ for (unsigned long row = 0; row < rows; ++row)
123
+ {
124
+ double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(row)));
125
+ // vlcMessage.Write(string("We have initial row abs sum " + ToString(rowAbsSum) + " for row " + ToString(row)).c_str());
126
+ }
127
+
128
+ unsigned long columnToEliminateWith = -1;
129
+ while (numberOfColumnsEliminated < rows-1 && columnToEliminateWith < cols - 1)
130
+ {
131
+ columnToEliminateWith += 1;
132
+
133
+ unsigned long rowToEliminateWith = -1;
134
+ double eliminationCellValue;
135
+ for (rowToEliminateWith = 0; rowToEliminateWith < rows; ++rowToEliminateWith)
136
+ {
137
+ if (rowHasBeenUsed[rowToEliminateWith] > 0)
138
+ continue;
139
+
140
+ eliminationCellValue = matrix.at(rowToEliminateWith).at(columnToEliminateWith);
141
+
142
+ if (eliminationCellValue > 0.001)
143
+ break;
144
+ else if (eliminationCellValue > 0)
145
+ {
146
+ // first check it's not TINY and we're dealing with precision issues
147
+ double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(rowToEliminateWith)));
148
+ if (rowAbsSum > 0.01)
149
+ break; // if it's less than 0.001 it may as well be zero, so it's linearly dependent
150
+ }
151
+ }
152
+
153
+ if (rowToEliminateWith == rows)
154
+ continue; // didn't find a non-zero value in the column to eliminate, so keep going!
155
+
156
+ // vlcMessage.Write(string("Eliminating column " + ToString(columnToEliminateWith)).c_str());
157
+ // vlcMessage.Write(string("Eliminating using row " + ToString(rowToEliminateWith)).c_str());
158
+
159
+ for (unsigned long rowToEliminate = 0; rowToEliminate < rows; ++rowToEliminate)
160
+ {
161
+ if (rowToEliminate == rowToEliminateWith || rowHasBeenUsed[rowToEliminate] > 0)
162
+ continue;
163
+
164
+ double cellValueToEliminate = matrix.at(rowToEliminate).at(columnToEliminateWith);
165
+ if (cellValueToEliminate == 0.0)
166
+ continue; // nothing to do!
167
+
168
+ // vlcMessage.Write(string("Eliminating row " + ToString(rowToEliminate)).c_str());
169
+
170
+ double ratio = cellValueToEliminate / eliminationCellValue;
171
+ // vlcMessage.Write(ToString(ratio).c_str());
172
+
173
+ for (unsigned long columnToEliminate = columnToEliminateWith; columnToEliminate < cols; ++columnToEliminate)
174
+ {
175
+ double increment = ratio * matrix.at(rowToEliminateWith).at(columnToEliminate);
176
+ //vlcMessage.Write(string("Subtracting " + ToString(increment) + " from value " + ToString(matrix.at(rowToEliminate).at(columnToEliminate))).c_str());
177
+ matrix.at(rowToEliminate).at(columnToEliminate) -= increment;
178
+ }
179
+ }
180
+ rowHasBeenUsed[rowToEliminateWith] = 1;
181
+ numberOfColumnsEliminated += 1;
182
+ }
183
+
184
+ vector<int> linearlyDependentRows;
185
+
186
+ // now check which rows are *basically* zero!
187
+ for (unsigned long row = 0; row < rows; ++row)
188
+ {
189
+ double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(row)));
190
+ // vlcMessage.Write(string("We have row abs sum " + ToString(rowAbsSum) + " for row " + ToString(row)).c_str());
191
+ if (rowAbsSum < 0.001)
192
+ linearlyDependentRows.push_back(row);
193
+ }
194
+
195
+ return linearlyDependentRows;
196
+ }
197
+
198
+ double MathUtils::convertRandomNumberToExtremeValue(long double random_number)
199
+ {
200
+ if (random_number < std::numeric_limits<double>::epsilon() || random_number > 1.0 - std::numeric_limits<double>::epsilon())
201
+ throw std::runtime_error("Can't have a random number not on the range (0.0, 1.0)");
202
+
203
+ return -::log(-::log(random_number));
204
+ }
@@ -0,0 +1,73 @@
1
+ #include "utils/StochasticUtils.h"
2
+
3
+ vector<double> StochasticUtils::convertPdfToCumulativeSum(std::vector<double> pdf)
4
+ {
5
+ float cumulativeSum = 0;
6
+ vector<double> returnVal(pdf.size());
7
+ for (unsigned int i=0; i<pdf.size(); ++i)
8
+ {
9
+ returnVal[i] = cumulativeSum + pdf[i];
10
+ cumulativeSum += pdf[i];
11
+ }
12
+
13
+ // normalise to unity sum
14
+ if (cumulativeSum != 1.0)
15
+ {
16
+ for (unsigned int i=0; i<pdf.size(); ++i)
17
+ {
18
+ returnVal[i] /= cumulativeSum;
19
+ }
20
+ }
21
+
22
+ if (returnVal[returnVal.size()-1] != 1.0)
23
+ returnVal[returnVal.size()-1] = 1.0;
24
+
25
+ return returnVal;
26
+ }
27
+
28
+ int StochasticUtils::chooseCategoryFromCdf( float * cumulativeProbabilities, int N )
29
+ {
30
+ float selection = rand() / (RAND_MAX_FLOAT+1.0f);
31
+ int i=0;
32
+ while ((i<N) && (selection > cumulativeProbabilities[i]))
33
+ i++;
34
+ return i;
35
+ }
36
+
37
+ int StochasticUtils::chooseCategoryFromCdf( vector<float>& cumulativeProbabilities )
38
+ {
39
+ float selection = rand() / (RAND_MAX_FLOAT+1.0f);
40
+ return chooseCategoryFromCdf(selection, cumulativeProbabilities);
41
+ }
42
+
43
+ int StochasticUtils::chooseCategoryFromCdf( float qot, vector<float>& cumulativeProbabilities )
44
+ {
45
+ unsigned int i=0;
46
+ while ((i<cumulativeProbabilities.size()) && (qot > cumulativeProbabilities[i]))
47
+ i++;
48
+ return i;
49
+ }
50
+
51
+ int StochasticUtils::chooseCategoryFromPdf(vector<float>& probabilities, string categoryType)
52
+ {
53
+ return chooseCategoryFromPdf(getQot(), probabilities, categoryType);
54
+ }
55
+
56
+ int StochasticUtils::chooseCategoryFromPdf(double qot, vector<float>& probabilities, string categoryType)
57
+ {
58
+ if (!probabilities.size())
59
+ throw runtime_error("There was a problem selecting a " + categoryType + " from an empty PDF!");
60
+
61
+ double summative_usage = 0.0;
62
+ unsigned int chosen_index = 0;
63
+ for (; chosen_index < probabilities.size(); ++chosen_index)
64
+ {
65
+ summative_usage += probabilities[chosen_index];
66
+ if (summative_usage > qot) break;
67
+ }
68
+ if (chosen_index >= probabilities.size())
69
+ {
70
+ throw runtime_error("There was a problem selecting a " + categoryType + " from the PDF!");
71
+ }
72
+ return chosen_index;
73
+ }
@@ -0,0 +1,14 @@
1
+ #include "utils/Utils.h"
2
+
3
+ #include <vector>
4
+ using std::vector;
5
+
6
+ vector<int> Utils::vectorOfRandomInt(int length)
7
+ {
8
+ vector<int> returnValue;
9
+ returnValue.reserve(length);
10
+ for (int i = 0; i < length; ++i)
11
+ returnValue.push_back(rand());
12
+
13
+ return returnValue;
14
+ }
@@ -0,0 +1,3 @@
1
+ #include "VlcMessage.h"
2
+
3
+ VlcMessage vlcMessage; // file-scope definition of the VlcMessage object named vlcMessage
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ml4r
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -24,6 +24,38 @@ files:
24
24
  - lib/ml4r/linear_regression.rb
25
25
  - lib/ml4r.rb
26
26
  - lib/test_cpp_extension.rb
27
+ - ext/ml4r/LinearRegression/LinearRegression.cpp
28
+ - ext/ml4r/LinearRegression/OLSLinearRegression.cpp
29
+ - ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp
30
+ - ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp
31
+ - ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp
32
+ - ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp
33
+ - ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp
34
+ - ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp
35
+ - ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp
36
+ - ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp
37
+ - ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp
38
+ - ext/ml4r/MachineLearning/GBM/GBMOutput.cpp
39
+ - ext/ml4r/MachineLearning/GBM/GBMRunner.cpp
40
+ - ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp
41
+ - ext/ml4r/MachineLearning/MLData/MLData.cpp
42
+ - ext/ml4r/MachineLearning/MLData/MLDataFields.cpp
43
+ - ext/ml4r/MachineLearning/MLData/MLDataReader.cpp
44
+ - ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp
45
+ - ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp
46
+ - ext/ml4r/MachineLearning/MLExperiment.cpp
47
+ - ext/ml4r/MachineLearning/MLRunner.cpp
48
+ - ext/ml4r/MachineLearning/MLUtils.cpp
49
+ - ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp
50
+ - ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp
51
+ - ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp
52
+ - ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp
53
+ - ext/ml4r/ml4r.cpp
54
+ - ext/ml4r/ml4r_wrap.cpp
55
+ - ext/ml4r/utils/MathUtils.cpp
56
+ - ext/ml4r/utils/StochasticUtils.cpp
57
+ - ext/ml4r/utils/Utils.cpp
58
+ - ext/ml4r/utils/VlcMessage.cpp
27
59
  - ext/ml4r/LinearRegression/LinearRegression.h
28
60
  - ext/ml4r/LinearRegression/OLSLinearRegression.h
29
61
  - ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h