ml4r 0.1.2 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51)
  1. data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
  9. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
  10. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
  11. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
  13. data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
  14. data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
  15. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
  16. data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
  17. data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
  18. data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
  19. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
  20. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
  21. data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
  22. data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
  23. data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
  24. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
  25. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
  26. data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
  27. data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
  28. data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
  29. data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
  30. data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
  31. data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
  32. data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
  33. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
  34. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
  35. data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
  36. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
  37. data/ext/ml4r/extconf.rb +16 -3
  38. data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
  39. data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
  40. data/ext/ml4r/utils/StochasticUtils.h +33 -0
  41. data/ext/ml4r/utils/Utils.h +147 -0
  42. data/ext/ml4r/utils/VlcMessage.h +44 -0
  43. data/lib/ml4r/linear_regression.rb +7 -0
  44. metadata +45 -13
  45. data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
  46. data/ext/ml4r/OLSLinearRegression.h +0 -23
  47. data/ext/ml4r/Utils.h +0 -53
  48. data/ext/ml4r/example.h +0 -18
  49. data/ext/ml4r/swig/example.h +0 -13
  50. data/ext/ml4r/swig/example_wrap.c +0 -2093
  51. data/ext/ml4r/utils/RubyUtils.h +0 -174
@@ -8,37 +8,43 @@ using boost::numeric::ublas::matrix;
8
8
  #include <utility>
9
9
  using std::pair;
10
10
 
11
- //#include "ZenithBase.h"
12
- //#include "boost/MatrixInverse.h"
13
-
14
11
  class LinearRegression
15
12
  {
16
13
  public:
17
- LinearRegression() : m_constantIsFixed(false), m_paramsAreValid(false) {}
14
+ LinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys,
15
+ std::vector<double> weights = std::vector<double>())
16
+ : m_xs(xs), m_ys(ys), m_ws(weights), m_constantIsFixed(false), m_paramsAreValid(false) {}
17
+
18
+ LinearRegression(std::vector<double> xs, std::vector<double> ys,
19
+ std::vector<double> weights = std::vector<double>())
20
+ : m_ys(ys), m_ws(weights), m_constantIsFixed(false), m_paramsAreValid(false)
21
+ {
22
+ m_xs.resize(xs.size());
23
+ for (unsigned int i=0; i<xs.size(); ++i) m_xs.at(i).resize(1, xs.at(i));
24
+ }
25
+
26
+ LinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys, double fixedConstant,
27
+ std::vector<double> weights = std::vector<double>())
28
+ : m_xs(xs), m_ys(ys), m_ws(weights), m_constantIsFixed(true), m_constant(fixedConstant), m_paramsAreValid(false) {}
18
29
  ~LinearRegression() {}
19
30
 
20
- void setXs(std::vector<std::vector<double> > xs);
21
- void setYs(std::vector<double> ys);
22
- void setWeights(std::vector<double> weights);
31
+
23
32
  void setFixedConstant(double val);
24
33
 
25
34
 
26
35
  pair<std::vector<double>,double> getParameterEstimates();
27
36
  std::vector<double>& getFittedYs();
28
37
  std::vector<double>& getPredictedYs();
29
- // double GetConstant();
30
- double getRSquared();
31
- double getSSquared();
32
- double getFstatistic();
38
+ // double GetConstant();
39
+ double getRSquared();
40
+ double getSSquared();
41
+ double getFstatistic();
33
42
  std::vector<double>& getTstatistics();
34
43
  std::vector<double>& getStandardErrors();
35
- double getPressStatistic();
36
- double getPresarStatistic();
37
- double getAdjustedRSquared();
38
- double getRSquaredPrediction();
39
-
40
- // BOOM THIS IS THE PROBLEM HERE - CAN'T INSTANTIATE A PURE VIRTUAL CLASS
41
- virtual void Execute() = 0;
44
+ double getPressStatistic();
45
+ double getPresarStatistic();
46
+ double getAdjustedRSquared();
47
+ double getRSquaredPrediction();
42
48
 
43
49
  protected:
44
50
 
@@ -75,7 +81,7 @@ protected:
75
81
  boost::numeric::ublas::matrix<double> m_Y;
76
82
  boost::numeric::ublas::matrix<double> m_B; // m_B = prod(m_A, m_Y)
77
83
  boost::numeric::ublas::matrix<double> m_A; // m_A = (X'WX)-1 X'W
78
- std::vector<double> m_h_diagonal; // hat ublas::matrix = XA. This is the diagonal of it.
84
+ std::vector<double> m_h_diagonal; // hat ublas::matrix = XA. This is the diagonal of it.
79
85
  boost::numeric::ublas::matrix<double> m_Xtranspose;
80
86
  boost::numeric::ublas::matrix<double> m_Xtranspose_W;
81
87
  boost::numeric::ublas::matrix<double> m_Xtranspose_W_X;
@@ -0,0 +1,29 @@
1
+ #ifndef OLSLinearRegression_h__
2
+ #define OLSLinearRegression_h__
3
+
4
+ #include "LinearRegression.h"
5
+
6
+ class OLSLinearRegression : public LinearRegression
7
+ {
8
+ public:
9
+
10
+ OLSLinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys,
11
+ std::vector<double> weights = std::vector<double>());
12
+ OLSLinearRegression(std::vector<double> xs, std::vector<double> ys,
13
+ std::vector<double> weights = std::vector<double>());
14
+ OLSLinearRegression(std::vector<std::vector<double> > xs, std::vector<double> ys, double fixedConstant,
15
+ std::vector<double> weights = std::vector<double>());
16
+ ~OLSLinearRegression();
17
+
18
+ void calculate();
19
+
20
+ void EstimateBs();
21
+
22
+ protected:
23
+
24
+ private:
25
+ };
26
+
27
+
28
+
29
+ #endif // OLSLinearRegression_h__
@@ -0,0 +1,32 @@
1
+ #ifndef __CATEGORY_INFO_H__
2
+ #define __CATEGORY_INFO_H__
3
+
4
+ struct CategoryInfo
5
+ {
6
+ CategoryInfo()
7
+ : sumZ(0), sumW(0), countN(0), meanZ(0), category(-1)
8
+ {
9
+
10
+ }
11
+
12
+ vector<shared_ptr<DecisionTreeExperiment> > experiments;
13
+ double sumW;
14
+ double sumZ;
15
+ int countN;
16
+ double meanZ;
17
+
18
+ double category;
19
+ void addExperiment(shared_ptr<DecisionTreeExperiment> experiment)
20
+ {
21
+ experiments.push_back(experiment);
22
+ sumW += experiment->getWeight();
23
+ sumZ += experiment->getWeight() * experiment->getZ();
24
+ countN += 1;
25
+ meanZ = sumZ / sumW;
26
+ }
27
+
28
+ // define operator to sort on meanZ
29
+ bool operator<(const CategoryInfo& rhs) const { return this->meanZ < rhs.meanZ; }
30
+ };
31
+
32
+ #endif // __CATEGORY_INFO_H__
#ifndef DecisionTreeExperiment_h__
#define DecisionTreeExperiment_h__

#include "MachineLearning/MLExperiment.h"
#include <boost/shared_ptr.hpp>
using boost::shared_ptr;

// An MLExperiment specialised for tree-based learners (decision trees, GBM):
// it carries, in addition to the base experiment's data, the model estimate F
// and the partitioning target Z that the tree machinery reads and updates.
class DecisionTreeExperiment : public MLExperiment
{
public:
    DecisionTreeExperiment();
    // Constructs from an existing experiment — presumably copying its base
    // state; TODO confirm against the .cpp definition.
    DecisionTreeExperiment(shared_ptr<MLExperiment> mlExperiment);
    ~DecisionTreeExperiment();

    void setF(double f);                // set the current model estimate
    void setZ(double z);                // set the partitioning target
    double getF();                      // current model estimate
    double getY();                      // observed response (from base experiment)
    double getZ();                      // current partitioning target
    void incrementF(double increment);  // F += increment (boosting-style update)

protected:
    double m_F; // modelled estimate (utility in the case of bernoulli)
    double m_Z; // the thing which decision trees attempt to partition
private:
};



#endif // DecisionTreeExperiment_h__
#ifndef __DecisionTreeNode_h__
#define __DecisionTreeNode_h__

#include <vector>
#include <boost/shared_ptr.hpp>
#include <set>
#include <map>
using std::map;
using std::set;
using boost::shared_ptr;
using std::vector;

class DecisionTreeExperiment;
class SplitDefinition;

// Which portion of a parent split a node represents.
enum Partition
{
    ROOT,    // the tree's root; has no parent split
    LHS,     // left-hand side of the parent split
    RHS,     // right-hand side of the parent split
    MISSING, // experiments whose split feature is the missing value
};

// One node of a decision tree. A node starts as a leaf holding a set of
// experiments plus their weighted sums (sumZ, sumW); defineSplit() turns it
// into an internal node with LHS / RHS / MISSING children.
class DecisionTreeNode
{
public:
    // experiments: the training rows routed to this node.
    // sumZ / sumW: weighted sum of the partitioning target and of the weights.
    // partition: which side of parentSplitDefinition this node represents.
    DecisionTreeNode( vector<shared_ptr<DecisionTreeExperiment> > experiments,
                      double sumZ,
                      double sumW,
                      Partition partition,
                      shared_ptr<SplitDefinition> parentSplitDefinition);
    ~DecisionTreeNode();



    // Routes an experiment down through the children until a leaf is reached.
    shared_ptr<DecisionTreeNode> getTerminalNodeForExperiment(shared_ptr<DecisionTreeExperiment> experiment);

    // Converts this leaf into an internal node: records the split and wires
    // up the three children.
    void defineSplit(shared_ptr<SplitDefinition> splitDefinition,
                     shared_ptr<DecisionTreeNode> lhsChild,
                     shared_ptr<DecisionTreeNode> rhsChild,
                     shared_ptr<DecisionTreeNode> missingChild);

    vector<shared_ptr<DecisionTreeExperiment> > getExperiments();

    bool isTerminalNode(); // true while the node has no children
    // Releases experiment references held by this node and its subtree
    // (frees memory once training no longer needs them).
    void clearExperimentsWithinTree();

    double getSumZ();
    double getSumW();
    void setSumZ(double sumZ);
    void setSumW(double sumW);
    // Recomputes sumZ / sumW from the currently held experiments.
    void updateSums();

    shared_ptr<SplitDefinition> getSplitDefinition();       // split made AT this node (null for leaves)
    shared_ptr<SplitDefinition> getParentSplitDefinition(); // split that produced this node
    Partition getPartition();

    // Tree-wide sentinel for "feature value is missing"; shared via statics.
    static void setMissingValue(double missingValue);
protected:
    // Picks the LHS / RHS / MISSING child appropriate for the experiment.
    shared_ptr<DecisionTreeNode> chooseChild(shared_ptr<DecisionTreeExperiment> experiment);
    void setChildren(shared_ptr<DecisionTreeNode> lhsChild,
                     shared_ptr<DecisionTreeNode> rhsChild,
                     shared_ptr<DecisionTreeNode> missingChild);

    // if this decision tree node has been further split, the following variables will be populated, otherwise they will be null!
    bool m_nodeHasChildren;

    shared_ptr<DecisionTreeNode> m_lhsChild;
    shared_ptr<DecisionTreeNode> m_rhsChild;
    shared_ptr<DecisionTreeNode> m_missingChild;

    vector<shared_ptr<DecisionTreeExperiment> > m_experiments;

    static bool m_missingValueDefined;
    static double m_missingValue;

    double m_sumZ;
    double m_sumW;
    Partition m_whichPartitionAmI;
    shared_ptr<SplitDefinition> m_parentSplitDefinition;
    shared_ptr<SplitDefinition> m_splitDefinition;
};



#endif // __DecisionTreeNode_h__
#ifndef FeatureInteraction_h__
#define FeatureInteraction_h__

#include <boost/shared_ptr.hpp>
using boost::shared_ptr;

#include "DecisionTreeNode.h"

class SplitDefinition;

// Records that a secondary split occurred inside one partition (LHS/RHS/
// MISSING) of a primary split — i.e. one observed interaction between two
// features during tree growth. Plain data holder; members are public by
// design. Presumably consumed by interaction-reporting code — verify against
// the GBM/tree output classes.
class FeatureInteraction
{
public:
    // primaryPartition: which side of the primary split the secondary split
    // was found on.
    FeatureInteraction(shared_ptr<SplitDefinition> primarySplitDefinition,shared_ptr<SplitDefinition> secondarySplitDefinition,Partition primaryPartition)
        : primarySplitDefinition(primarySplitDefinition), secondarySplitDefinition(secondarySplitDefinition),
          primaryPartition(primaryPartition)
    {

    }
    ~FeatureInteraction() {};

    shared_ptr<SplitDefinition> primarySplitDefinition;   // the enclosing (earlier) split
    shared_ptr<SplitDefinition> secondarySplitDefinition; // the split found within it
    Partition primaryPartition;                           // side of the primary split


protected:

private:
};
#endif // FeatureInteraction_h__
@@ -0,0 +1,45 @@
1
+ #ifndef __NodeSplitter_h__
2
+ #define __NodeSplitter_h__
3
+
4
+ #include "MachineLearning/DecisionTree/DecisionTreeNode.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ using boost::shared_ptr;
8
+
9
+ class SplitDefinition;
10
+ class DecisionTreeNode;
11
+ class DecisionTreeExperiment;
12
+ class MLData;
13
+
14
+ // pure virtual base class for NodeSplitterContinuous and NodeSplitterCategorical
15
+
16
+ class NodeSplitter
17
+ {
18
+ public:
19
+ NodeSplitter(MLData* data, int minObservations, double scale);
20
+ ~NodeSplitter();
21
+
22
+ shared_ptr<SplitDefinition> createSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
23
+ shared_ptr<SplitDefinition> createContinuousSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
24
+ shared_ptr<SplitDefinition> createCategoricalSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
25
+
26
+ double calculateImprovement(double lhsSumW, double lhsSumZ, double rhsSumW, double rhsSumZ, double missingSumW, double missingSumZ);
27
+
28
+ shared_ptr<DecisionTreeNode> createLhsChild(shared_ptr<SplitDefinition> splitDefinition);
29
+ shared_ptr<DecisionTreeNode> createRhsChild(shared_ptr<SplitDefinition> splitDefinition);
30
+ shared_ptr<DecisionTreeNode> createMissingChild(shared_ptr<SplitDefinition> splitDefinition);
31
+ shared_ptr<DecisionTreeNode> createChild(shared_ptr<SplitDefinition> splitDefinition, Partition partition);
32
+
33
+ vector<shared_ptr<DecisionTreeExperiment> > partitionExperiments(vector<shared_ptr<DecisionTreeExperiment> >& experiments,
34
+ shared_ptr<SplitDefinition> splitDefinition, Partition partition);
35
+
36
+ vector<shared_ptr<DecisionTreeNode> > splitNode(shared_ptr<DecisionTreeNode> nodeToSplit, vector<int> featuresToConsider);
37
+ protected:
38
+ MLData* m_data;
39
+ double m_missingValue;
40
+ int m_minObservations;
41
+ bool m_missingValueDefined;
42
+ double m_scale;
43
+ };
44
+
45
+ #endif // NodeSplitter_h__
#ifndef __NodeSplitterCategorical_h__
#define __NodeSplitterCategorical_h__

#include "MachineLearning/DecisionTree/NodeSplitter.h"

// NodeSplitter specialisation for categorical features: its
// createSplitDefinition chooses a split over sets of category values.
class NodeSplitterCategorical : public NodeSplitter
{
public:
    NodeSplitterCategorical(MLData* data, int minObservations, double scale);
    ~NodeSplitterCategorical();

    // Builds the best categorical split of 'node' on the given feature.
    // NOTE(review): NodeSplitter::createSplitDefinition is not virtual, so
    // this hides (does not override) the base function — calls through a
    // NodeSplitter* will not dispatch here; verify intended usage.
    shared_ptr<SplitDefinition> createSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
protected:

};

#endif // __NodeSplitterCategorical_h__
#ifndef __NodeSplitterContinuous_h__
#define __NodeSplitterContinuous_h__

#include "MachineLearning/DecisionTree/NodeSplitter.h"

// NodeSplitter specialisation for continuous features: its
// createSplitDefinition chooses a single threshold split value.
class NodeSplitterContinuous : public NodeSplitter
{
public:
    NodeSplitterContinuous(MLData* data, int minObservations, double scale);
    ~NodeSplitterContinuous();

    // Builds the best threshold split of 'node' on the given feature.
    // NOTE(review): NodeSplitter::createSplitDefinition is not virtual, so
    // this hides (does not override) the base function — calls through a
    // NodeSplitter* will not dispatch here; verify intended usage.
    shared_ptr<SplitDefinition> createSplitDefinition(shared_ptr<DecisionTreeNode> node, int featureIndex);
protected:
};

#endif // __NodeSplitterContinuous_h__
#ifndef __SplitDefinition_h__
#define __SplitDefinition_h__

#include <boost/shared_ptr.hpp>
#include <set>
using std::set;
using boost::shared_ptr;

class DecisionTreeExperiment;
class DecisionTreeNode;

// Immutable description of one chosen split of a tree node: which feature,
// how experiments divide into LHS / RHS / MISSING, the weighted sums and
// counts of each side, and the resulting improvement score. Two constructors
// cover the two feature types (categorical: category sets; continuous: a
// threshold value).
class SplitDefinition
{
public:
    // Categorical split: LHS/RHS are defined by membership of the feature
    // value in lhsCategories / rhsCategories.
    SplitDefinition(shared_ptr<DecisionTreeNode> nodeToSplit,
                    int featureIndex,
                    set<double>& lhsCategories,
                    set<double>& rhsCategories,
                    double lhsSumZ,
                    double lhsSumW,
                    int lhsCount,
                    double rhsSumZ,
                    double rhsSumW,
                    int rhsCount,
                    double missingSumZ,
                    double missingSumW,
                    int missingCount,
                    double improvement);

    // Continuous split: LHS/RHS are defined by comparison of the feature
    // value against splitValue.
    SplitDefinition(shared_ptr<DecisionTreeNode> nodeToSplit,
                    int featureIndex,
                    double splitValue,
                    double lhsSumZ,
                    double lhsSumW,
                    int lhsCount,
                    double rhsSumZ,
                    double rhsSumW,
                    int rhsCount,
                    double missingSumZ,
                    double missingSumW,
                    int missingCount,
                    double improvement);

    ~SplitDefinition();

    int getFeatureIndex();
    double getImprovement();
    shared_ptr<DecisionTreeNode> getNodeToSplit();
    double getLhsSumZ();
    double getLhsSumW();
    int getLhsExperimentCount();
    double getRhsSumZ();
    double getRhsSumW();
    int getRhsExperimentCount();
    double getMissingSumZ();
    double getMissingSumW();
    int getMissingExperimentCount();
    set<double>& getLhsCategories();   // meaningful only when isCategorical()
    set<double>& getRhsCategories();   // meaningful only when isCategorical()
    double getSplitValue();            // meaningful only when !isCategorical()
    bool isCategorical();              // which constructor built this definition
protected:
    shared_ptr<DecisionTreeNode> m_nodeToSplit; // the node this split applies to
    int m_splitFeatureIndex;    // feature the split tests
    set<double> m_lhsCategories; // categorical: values routed left
    set<double> m_rhsCategories; // categorical: values routed right
    double m_splitValue;         // continuous: threshold
    double m_lhsSumZ;
    double m_rhsSumZ;
    double m_missingSumZ;
    double m_lhsSumW;
    double m_rhsSumW;
    double m_missingSumW;
    int m_lhsCount;
    int m_rhsCount;
    int m_missingCount;
    double m_improvement;        // gain from applying this split
    bool m_featureIsCategorical; // records which constructor was used
};

#endif // __SplitDefinition_h__