ml4r 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. data/ext/ml4r/LinearRegression/LinearRegression.cpp +305 -0
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.cpp +75 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp +50 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp +195 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp +551 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp +22 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp +21 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp +142 -0
  9. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp +95 -0
  10. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp +601 -0
  11. data/ext/ml4r/MachineLearning/GBM/GBMOutput.cpp +86 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBMRunner.cpp +117 -0
  13. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp +94 -0
  14. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp +317 -0
  15. data/ext/ml4r/MachineLearning/MLData/MLData.cpp +232 -0
  16. data/ext/ml4r/MachineLearning/MLData/MLDataFields.cpp +1 -0
  17. data/ext/ml4r/MachineLearning/MLData/MLDataReader.cpp +139 -0
  18. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp +96 -0
  19. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp +113 -0
  20. data/ext/ml4r/MachineLearning/MLExperiment.cpp +69 -0
  21. data/ext/ml4r/MachineLearning/MLRunner.cpp +183 -0
  22. data/ext/ml4r/MachineLearning/MLUtils.cpp +15 -0
  23. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp +172 -0
  24. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp +66 -0
  25. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp +84 -0
  26. data/ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp +184 -0
  27. data/ext/ml4r/ml4r.cpp +34 -0
  28. data/ext/ml4r/ml4r_wrap.cpp +15727 -0
  29. data/ext/ml4r/utils/MathUtils.cpp +204 -0
  30. data/ext/ml4r/utils/StochasticUtils.cpp +73 -0
  31. data/ext/ml4r/utils/Utils.cpp +14 -0
  32. data/ext/ml4r/utils/VlcMessage.cpp +3 -0
  33. metadata +33 -1
@@ -0,0 +1,204 @@
1
+ #include "MathUtils.h"
2
+ #include <boost/lexical_cast.hpp>
3
+ #include <boost/foreach.hpp>
4
+ #include "Utils.h"
5
+ #include <stdlib.h>
6
+ #include <cmath>
7
+ #include <stdexcept>
8
+ #include <map>
9
+ using std::map;
10
+ using std::runtime_error;
11
+
12
+ void MathUtils::checkSystemDimensions(vector<vector<double> >& a, vector<double>& b)
13
+ {
14
+ if (b.size() != a.size())
15
+ throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations]: A and b must have the same number of rows");
16
+ unsigned long n = a.size();
17
+ BOOST_FOREACH(vector<double>& row, a)
18
+ if (row.size() != n)
19
+ runtime_error("[MathUtils::solveSystemOfLinearEquations]: A must be a square matrix");
20
+ }
21
+
22
+ vector<double> MathUtils::solveSystemOfLinearEquations(vector<vector<double> > a, vector<double> b)
23
+ {
24
+ if (!a.size()) throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] called with empty input");
25
+
26
+ // used to solve
27
+ // A.x = b
28
+ // where:
29
+ // A is an n x n matrix
30
+ // x and b are n x 1 vectors
31
+ MathUtils::checkSystemDimensions(a,b);
32
+ unsigned long n = a.size();
33
+
34
+ // perform Gaussian elimination
35
+ for (unsigned long i = 0; i < n - 1; ++i)
36
+ {
37
+ // eliminate everything in column i, below row i
38
+ MathUtils::eliminate(a, b, i);
39
+ }
40
+
41
+ vector<double> x(n);
42
+
43
+ // now solve, from the bottom up
44
+ for (long index = n - 1; index >= 0; --index)
45
+ {
46
+ double otherVarSum = 0.0;
47
+ for (unsigned long column = index +1; column < n; ++column)
48
+ {
49
+ otherVarSum += a.at(index).at(column) * x.at(column);
50
+ }
51
+ double val = b.at(index) - otherVarSum;
52
+ double denominator = a.at(index).at(index);
53
+ if (!denominator)
54
+ throw std::runtime_error("[MathUtils::solveSystemOfLinearEquations] Cannot divide by zero ==> no unique solution. Is system identified? Have you included too many variables?");
55
+
56
+ double coefficient = val / denominator;
57
+ x.at(index) = coefficient;
58
+ }
59
+ return x;
60
+ }
61
+
62
+ // void MathUtils::eliminate(TOtMatrix& a, TOtMatrix& b, int index)
63
+ void MathUtils::eliminate(vector<vector<double> >& a, vector<double>& b, unsigned long index)
64
+ {
65
+ unsigned long n = a.size();
66
+ if (a.at(index).at(index) == 0)
67
+ {
68
+ // find a non-zero value in the rows below.
69
+ bool found = false;
70
+ for (unsigned long row = index + 1; row < n; ++row)
71
+ {
72
+ if (a.at(row).at(index) != 0)
73
+ {
74
+ // found a non-zero value
75
+ found = true;
76
+ for (unsigned long column = index; column < n; ++column)
77
+ {
78
+ a.at(index).at(column) = a.at(index).at(column) + a.at(row).at(column);
79
+ }
80
+ b.at(index) = b.at(index) + b.at(row);
81
+ break;
82
+ }
83
+ }
84
+ if (!found)
85
+ throw std::runtime_error("Could not eliminate on index " + boost::lexical_cast<std::string>(index));
86
+ }
87
+
88
+ // now eliminate in all the rows below index
89
+ for (unsigned long row = index + 1; row < n; ++row)
90
+ {
91
+ double factor = a.at(row).at(index) / a.at(index).at(index);
92
+ if (factor != 0)
93
+ {
94
+ for (unsigned long column = index; column < n; ++column)
95
+ {
96
+ a.at(row).at(column) = a.at(row).at(column) - factor * a.at(index).at(column);
97
+ }
98
+ b.at(row) = b.at(row) - factor * b.at(index);
99
+ }
100
+ }
101
+ }
102
+
103
+ vector<int> MathUtils::identifyLinearlyDependentMatrixRows(vector<vector<double> >& matrix)
104
+ {
105
+ // this method identifies rows which need to be removed to achieve full row rank.
106
+ // rows which can be fully eliminated (as a linear combination of other rows) are what we're looking for.
107
+
108
+ unsigned long rows = matrix.size();
109
+ unsigned long cols = matrix.front().size();
110
+ // vlcMessage.Write(string("We have " + ToString(rows) + " rows.").c_str());
111
+ // vlcMessage.Write(string("We have " + ToString(cols) + " cols.").c_str());
112
+
113
+ BOOST_FOREACH(vector<double>& row, matrix)
114
+ {
115
+ if (row.size() != cols)
116
+ throw runtime_error("[MathUtils::identifyLinearlyDependentMatrixRows] Matrix is not rectangular");
117
+ }
118
+
119
+ unsigned long numberOfColumnsEliminated = 0;
120
+ map<int, int> rowHasBeenUsed;
121
+
122
+ for (unsigned long row = 0; row < rows; ++row)
123
+ {
124
+ double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(row)));
125
+ // vlcMessage.Write(string("We have initial row abs sum " + ToString(rowAbsSum) + " for row " + ToString(row)).c_str());
126
+ }
127
+
128
+ unsigned long columnToEliminateWith = -1;
129
+ while (numberOfColumnsEliminated < rows-1 && columnToEliminateWith < cols - 1)
130
+ {
131
+ columnToEliminateWith += 1;
132
+
133
+ unsigned long rowToEliminateWith = -1;
134
+ double eliminationCellValue;
135
+ for (rowToEliminateWith = 0; rowToEliminateWith < rows; ++rowToEliminateWith)
136
+ {
137
+ if (rowHasBeenUsed[rowToEliminateWith] > 0)
138
+ continue;
139
+
140
+ eliminationCellValue = matrix.at(rowToEliminateWith).at(columnToEliminateWith);
141
+
142
+ if (eliminationCellValue > 0.001)
143
+ break;
144
+ else if (eliminationCellValue > 0)
145
+ {
146
+ // first check it's not TINY and we're dealing with precision issues
147
+ double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(rowToEliminateWith)));
148
+ if (rowAbsSum > 0.01)
149
+ break; // if it's less than 0.001 it may as well be zero, so it's linearly dependent
150
+ }
151
+ }
152
+
153
+ if (rowToEliminateWith == rows)
154
+ continue; // didn't find a non-zero value in the column to eliminate, so keep going!
155
+
156
+ // vlcMessage.Write(string("Eliminating column " + ToString(columnToEliminateWith)).c_str());
157
+ // vlcMessage.Write(string("Eliminating using row " + ToString(rowToEliminateWith)).c_str());
158
+
159
+ for (unsigned long rowToEliminate = 0; rowToEliminate < rows; ++rowToEliminate)
160
+ {
161
+ if (rowToEliminate == rowToEliminateWith || rowHasBeenUsed[rowToEliminate] > 0)
162
+ continue;
163
+
164
+ double cellValueToEliminate = matrix.at(rowToEliminate).at(columnToEliminateWith);
165
+ if (cellValueToEliminate == 0.0)
166
+ continue; // nothing to do!
167
+
168
+ // vlcMessage.Write(string("Eliminating row " + ToString(rowToEliminate)).c_str());
169
+
170
+ double ratio = cellValueToEliminate / eliminationCellValue;
171
+ // vlcMessage.Write(ToString(ratio).c_str());
172
+
173
+ for (unsigned long columnToEliminate = columnToEliminateWith; columnToEliminate < cols; ++columnToEliminate)
174
+ {
175
+ double increment = ratio * matrix.at(rowToEliminateWith).at(columnToEliminate);
176
+ //vlcMessage.Write(string("Subtracting " + ToString(increment) + " from value " + ToString(matrix.at(rowToEliminate).at(columnToEliminate))).c_str());
177
+ matrix.at(rowToEliminate).at(columnToEliminate) -= increment;
178
+ }
179
+ }
180
+ rowHasBeenUsed[rowToEliminateWith] = 1;
181
+ numberOfColumnsEliminated += 1;
182
+ }
183
+
184
+ vector<int> linearlyDependentRows;
185
+
186
+ // now check which rows are *basically* zero!
187
+ for (unsigned long row = 0; row < rows; ++row)
188
+ {
189
+ double rowAbsSum = Utils::vectorSum(Utils::vectorAbs(matrix.at(row)));
190
+ // vlcMessage.Write(string("We have row abs sum " + ToString(rowAbsSum) + " for row " + ToString(row)).c_str());
191
+ if (rowAbsSum < 0.001)
192
+ linearlyDependentRows.push_back(row);
193
+ }
194
+
195
+ return linearlyDependentRows;
196
+ }
197
+
198
+ double MathUtils::convertRandomNumberToExtremeValue(long double random_number)
199
+ {
200
+ if (random_number < std::numeric_limits<double>::epsilon() || random_number > 1.0 - std::numeric_limits<double>::epsilon())
201
+ throw std::runtime_error("Can't have a random number not on the range (0.0, 1.0)");
202
+
203
+ return -::log(-::log(random_number));
204
+ }
@@ -0,0 +1,73 @@
1
+ #include "utils/StochasticUtils.h"
2
+
3
+ vector<double> StochasticUtils::convertPdfToCumulativeSum(std::vector<double> pdf)
4
+ {
5
+ float cumulativeSum = 0;
6
+ vector<double> returnVal(pdf.size());
7
+ for (unsigned int i=0; i<pdf.size(); ++i)
8
+ {
9
+ returnVal[i] = cumulativeSum + pdf[i];
10
+ cumulativeSum += pdf[i];
11
+ }
12
+
13
+ // normalise to unity sum
14
+ if (cumulativeSum != 1.0)
15
+ {
16
+ for (unsigned int i=0; i<pdf.size(); ++i)
17
+ {
18
+ returnVal[i] /= cumulativeSum;
19
+ }
20
+ }
21
+
22
+ if (returnVal[returnVal.size()-1] != 1.0)
23
+ returnVal[returnVal.size()-1] = 1.0;
24
+
25
+ return returnVal;
26
+ }
27
+
28
+ int StochasticUtils::chooseCategoryFromCdf( float * cumulativeProbabilities, int N )
29
+ {
30
+ float selection = rand() / (RAND_MAX_FLOAT+1.0f);
31
+ int i=0;
32
+ while ((i<N) && (selection > cumulativeProbabilities[i]))
33
+ i++;
34
+ return i;
35
+ }
36
+
37
+ int StochasticUtils::chooseCategoryFromCdf( vector<float>& cumulativeProbabilities )
38
+ {
39
+ float selection = rand() / (RAND_MAX_FLOAT+1.0f);
40
+ return chooseCategoryFromCdf(selection, cumulativeProbabilities);
41
+ }
42
+
43
+ int StochasticUtils::chooseCategoryFromCdf( float qot, vector<float>& cumulativeProbabilities )
44
+ {
45
+ unsigned int i=0;
46
+ while ((i<cumulativeProbabilities.size()) && (qot > cumulativeProbabilities[i]))
47
+ i++;
48
+ return i;
49
+ }
50
+
51
+ int StochasticUtils::chooseCategoryFromPdf(vector<float>& probabilities, string categoryType)
52
+ {
53
+ return chooseCategoryFromPdf(getQot(), probabilities, categoryType);
54
+ }
55
+
56
+ int StochasticUtils::chooseCategoryFromPdf(double qot, vector<float>& probabilities, string categoryType)
57
+ {
58
+ if (!probabilities.size())
59
+ throw runtime_error("There was a problem selecting a " + categoryType + " from an empty PDF!");
60
+
61
+ double summative_usage = 0.0;
62
+ unsigned int chosen_index = 0;
63
+ for (; chosen_index < probabilities.size(); ++chosen_index)
64
+ {
65
+ summative_usage += probabilities[chosen_index];
66
+ if (summative_usage > qot) break;
67
+ }
68
+ if (chosen_index >= probabilities.size())
69
+ {
70
+ throw runtime_error("There was a problem selecting a " + categoryType + " from the PDF!");
71
+ }
72
+ return chosen_index;
73
+ }
@@ -0,0 +1,14 @@
1
+ #include "utils/Utils.h"
2
+
3
+ #include <vector>
4
+ using std::vector;
5
+
6
+ vector<int> Utils::vectorOfRandomInt(int length)
7
+ {
8
+ vector<int> returnValue;
9
+ returnValue.reserve(length);
10
+ for (int i = 0; i < length; ++i)
11
+ returnValue.push_back(rand());
12
+
13
+ return returnValue;
14
+ }
@@ -0,0 +1,3 @@
1
+ #include "VlcMessage.h"
2
+
3
+ VlcMessage vlcMessage; // Definition of the file-scope VlcMessage object. NOTE(review): presumably declared extern in VlcMessage.h so other translation units can call vlcMessage.Write(...) - confirm.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ml4r
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -24,6 +24,38 @@ files:
24
24
  - lib/ml4r/linear_regression.rb
25
25
  - lib/ml4r.rb
26
26
  - lib/test_cpp_extension.rb
27
+ - ext/ml4r/LinearRegression/LinearRegression.cpp
28
+ - ext/ml4r/LinearRegression/OLSLinearRegression.cpp
29
+ - ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.cpp
30
+ - ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.cpp
31
+ - ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.cpp
32
+ - ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.cpp
33
+ - ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.cpp
34
+ - ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.cpp
35
+ - ext/ml4r/MachineLearning/GBM/BernoulliCalculator.cpp
36
+ - ext/ml4r/MachineLearning/GBM/GaussianCalculator.cpp
37
+ - ext/ml4r/MachineLearning/GBM/GBMEstimator.cpp
38
+ - ext/ml4r/MachineLearning/GBM/GBMOutput.cpp
39
+ - ext/ml4r/MachineLearning/GBM/GBMRunner.cpp
40
+ - ext/ml4r/MachineLearning/GBM/ZenithGBM.cpp
41
+ - ext/ml4r/MachineLearning/MLData/MLData.cpp
42
+ - ext/ml4r/MachineLearning/MLData/MLDataFields.cpp
43
+ - ext/ml4r/MachineLearning/MLData/MLDataReader.cpp
44
+ - ext/ml4r/MachineLearning/MLData/ZenithMLData.cpp
45
+ - ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.cpp
46
+ - ext/ml4r/MachineLearning/MLExperiment.cpp
47
+ - ext/ml4r/MachineLearning/MLRunner.cpp
48
+ - ext/ml4r/MachineLearning/MLUtils.cpp
49
+ - ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.cpp
50
+ - ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.cpp
51
+ - ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.cpp
52
+ - ext/ml4r/MachineLearning/RandomForest/ZenithRandomForest.cpp
53
+ - ext/ml4r/ml4r.cpp
54
+ - ext/ml4r/ml4r_wrap.cpp
55
+ - ext/ml4r/utils/MathUtils.cpp
56
+ - ext/ml4r/utils/StochasticUtils.cpp
57
+ - ext/ml4r/utils/Utils.cpp
58
+ - ext/ml4r/utils/VlcMessage.cpp
27
59
  - ext/ml4r/LinearRegression/LinearRegression.h
28
60
  - ext/ml4r/LinearRegression/OLSLinearRegression.h
29
61
  - ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h