ml4r 0.1.2 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
  9. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
  10. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
  11. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
  13. data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
  14. data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
  15. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
  16. data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
  17. data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
  18. data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
  19. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
  20. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
  21. data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
  22. data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
  23. data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
  24. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
  25. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
  26. data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
  27. data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
  28. data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
  29. data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
  30. data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
  31. data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
  32. data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
  33. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
  34. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
  35. data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
  36. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
  37. data/ext/ml4r/extconf.rb +16 -3
  38. data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
  39. data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
  40. data/ext/ml4r/utils/StochasticUtils.h +33 -0
  41. data/ext/ml4r/utils/Utils.h +147 -0
  42. data/ext/ml4r/utils/VlcMessage.h +44 -0
  43. data/lib/ml4r/linear_regression.rb +7 -0
  44. metadata +45 -13
  45. data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
  46. data/ext/ml4r/OLSLinearRegression.h +0 -23
  47. data/ext/ml4r/Utils.h +0 -53
  48. data/ext/ml4r/example.h +0 -18
  49. data/ext/ml4r/swig/example.h +0 -13
  50. data/ext/ml4r/swig/example_wrap.c +0 -2093
  51. data/ext/ml4r/utils/RubyUtils.h +0 -174
@@ -0,0 +1,29 @@
1
+ #ifndef BernoulliCalculator_h__
2
+ #define BernoulliCalculator_h__
3
+
4
+ #include "MachineLearning/GBM/GBMCalculator.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class BernoulliCalculator : public GBMCalculator
12
+ {
13
+ public:
14
+ BernoulliCalculator();
15
+ ~BernoulliCalculator();
16
+
17
+ double calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
18
+ void populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions);
19
+ void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
20
+ double computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
21
+ void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
22
+ double calculatePrediction(double f);
23
+ double calculateF(double prediction);
24
+ protected:
25
+
26
+ private:
27
+ };
28
+
29
+ #endif // BernoulliCalculator_h__
@@ -0,0 +1,50 @@
1
+ #ifndef __GBM_h__
2
+ #define __GBM_h__
3
+
4
+ #include "MachineLearning/GBM/GBMParameters.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class MLData;
12
+ class DecisionTreeExperiment;
13
+ class MLExperiment;
14
+ class MLDataFields;
15
+ class DecisionTreeNode;
16
+ class SplitDefinition;
17
+ class GBMOutput;
18
+ class GBMCalculator;
19
+
20
+ class GBM
21
+ {
22
+ public:
23
+ GBM();
24
+ ~GBM();
25
+ void estimate();
26
+ void estimateMore(int numTrees);
27
+
28
+ void setData(MLData* mlData);
29
+ vector<double> getPredictions(MLData* newData);
30
+ vector<double> getPredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
31
+
32
+ vector<double> getMeanTrainingPredictions();
33
+ vector<double> getCrossValidationPredictions();
34
+
35
+ GBMParameters parameters;
36
+ protected:
37
+ MLData* m_data;
38
+ vector<shared_ptr<GBMOutput> > m_outputObjects;
39
+ shared_ptr<GBMCalculator> m_gbmCalculator;
40
+
41
+ void config();
42
+ void input();
43
+ void goNuts();
44
+ void output();
45
+ vector<shared_ptr<DecisionTreeExperiment> > makeDecisionTreeExperiments(vector<shared_ptr<MLExperiment> >& experiments);
46
+
47
+
48
+ };
49
+
50
+ #endif // GBM_h__
@@ -0,0 +1,31 @@
1
+ #ifndef GBMCalculator_h__
2
+ #define GBMCalculator_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ using std::vector;
7
+ using boost::shared_ptr;
8
+
9
+ class DecisionTreeExperiment;
10
+
11
+ class GBMCalculator
12
+ {
13
+ public:
14
+ GBMCalculator() {};
15
+ ~GBMCalculator() {};
16
+
17
+ virtual double calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
18
+ virtual void populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions) = 0;
19
+ virtual void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
20
+ virtual double computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
21
+ virtual void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
22
+ virtual double calculatePrediction(double f) = 0;
23
+ virtual double calculateF(double prediction) = 0;
24
+ protected:
25
+
26
+ private:
27
+ };
28
+
29
+ #endif // GBMCalculator_h__
30
+
31
+
File without changes
@@ -0,0 +1,79 @@
1
+ #ifndef GBMEstimator_h__
2
+ #define GBMEstimator_h__
3
+
4
+ #include "MachineLearning/DecisionTree/FeatureInteraction.h"
5
+ #include "MachineLearning/MLEstimator.h"
6
+
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <vector>
9
+ #include <map>
10
+ #include <set>
11
+ #include <utility>
12
+ using std::pair;
13
+ using std::set;
14
+ using std::map;
15
+ using std::vector;
16
+ using boost::shared_ptr;
17
+
18
+
19
+ class GBMParameters;
20
+ class GBMOutput;
21
+ class DecisionTreeExperiment;
22
+ class DecisionTreeNode;
23
+ class SplitDefinition;
24
+ class MLData;
25
+ class GBMCalculator;
26
+ class MLExperiment;
27
+
28
+ class GBMEstimator : public MLEstimator
29
+ {
30
+ public:
31
+ GBMEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<GBMParameters> parameters);
32
+ ~GBMEstimator();
33
+
34
+ shared_ptr<MLOutput> estimate();
35
+ void estimateMore(int numTrees);
36
+ vector<FeatureInteraction> findInteractions(int howMany);
37
+ protected:
38
+ void initializeEstimator();
39
+ void performIteration();
40
+ void constructFeatureIndices();
41
+ void initialiseGBMExperimentData();
42
+ void populateInitialF();
43
+ void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
44
+ void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
45
+
46
+ void sortTrainingExperiments();
47
+
48
+ void constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
49
+ void constructGenerousDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments, int rfToLevel);
50
+
51
+
52
+
53
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > > partitionSortedExperiments(shared_ptr<SplitDefinition> splitDefinition, Partition partition);
54
+ map<int, vector<shared_ptr<DecisionTreeExperiment> > > bagSortedExperiments(vector<shared_ptr<DecisionTreeExperiment> >& baggedExperiments);
55
+
56
+ vector<int> getRandomFeatureList();
57
+
58
+ void calculateFIncrementPerDecisionTreeNode();
59
+ void applyFIncrementToInBagExperiments();
60
+ pair<vector<shared_ptr<DecisionTreeExperiment> >, vector<shared_ptr<DecisionTreeExperiment> > > bagExperiments();
61
+ void applyFIncrementToExperiments(vector<shared_ptr<DecisionTreeExperiment> >& outOfBagExperiments);
62
+ void reportDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
63
+ void deleteRedundantData();
64
+
65
+
66
+ shared_ptr<GBMOutput> m_output;
67
+ vector<shared_ptr<DecisionTreeExperiment> > m_decisionTreeExperiments;
68
+ shared_ptr<GBMParameters> m_parameters;
69
+ vector<int> m_featureIndices;
70
+ shared_ptr<DecisionTreeNode> m_decisionTreeHead;
71
+ set<shared_ptr<DecisionTreeNode> > m_terminalNodes;
72
+ map<shared_ptr<DecisionTreeNode>, double> m_FIncrements;
73
+ shared_ptr<GBMCalculator> m_gbmCalculator;
74
+ bool m_missingValueDefined;
75
+ double m_missingValue;
76
+ private:
77
+ };
78
+
79
+ #endif // GBMEstimator_h__
@@ -0,0 +1,53 @@
1
+ #ifndef GBMOutput_h__
2
+ #define GBMOutput_h__
3
+
4
+ #include "MachineLearning/GBM/GBMParameters.h"
5
+ #include "MachineLearning/MLOutput.h"
6
+
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <vector>
9
+ #include <map>
10
+ using std::map;
11
+ using std::vector;
12
+ using boost::shared_ptr;
13
+
14
+ class DecisionTreeNode;
15
+ class DecisionTreeExperiment;
16
+ class MLData;
17
+ class GBMCalculator;
18
+
19
+ class GBMOutput : public MLOutput
20
+ {
21
+ public:
22
+ GBMOutput(MLData* trainingData, vector<int> trainingExperimentIndicies,shared_ptr<GBMParameters> parameters);
23
+ ~GBMOutput();
24
+
25
+ // recording inputs of GBM estimation
26
+ shared_ptr<GBMParameters> getParameters();
27
+
28
+ // recording outputs of GBM estimation
29
+ void setMeanY(double y);
30
+ void addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node);
31
+ void addFIncrements(map<shared_ptr<DecisionTreeNode>, double> m_FIncrements);
32
+
33
+ // applying model to new data
34
+ double predictForExperiment(shared_ptr<MLExperiment> experiment);
35
+ void setPredictionForDecisionTreeExperiment(shared_ptr<DecisionTreeExperiment> experiment);
36
+
37
+ // stats
38
+ int getNumTrees();
39
+
40
+ void capTrees(int numTrees);
41
+
42
+ protected:
43
+
44
+ vector<shared_ptr<DecisionTreeNode> > m_headNodes;
45
+ vector<map<shared_ptr<DecisionTreeNode>, double> > m_fIncrements;
46
+ double m_meanY;
47
+ bool m_useMeanY;
48
+ shared_ptr<GBMParameters> m_parameters;
49
+ shared_ptr<GBMCalculator> m_gbmCalculator;
50
+ private:
51
+ };
52
+
53
+ #endif // GBMOutput_h__
@@ -0,0 +1,50 @@
// Guard macro avoids leading double underscores, which are reserved for the
// implementation, and matches the trailing #endif comment.
#ifndef GBMParameters_h__
#define GBMParameters_h__

#include <limits>
#include <string>
#include <vector>
using std::vector;
using std::string;

/// Distribution choices stored in GBMParameters::distribution.
enum GBMDistribution {
    BERNOULLI,
    GAUSSIAN
};

/// Plain settings holder for a GBM run. Every setting is a public data
/// member; the default constructor supplies the defaults noted next to each
/// member. The two vectors start out empty.
class GBMParameters
{
public:
    // Initializers are listed in member declaration order, which is the
    // order in which members are actually initialized.
    GBMParameters()
        : tryMVariables(-1),
          growKDecisionTreeNodes(5),
          bagFraction(1.0),
          shrinkageFactor(0.01),
          numIterations(100),
          minObservations(10),
          distribution(BERNOULLI),
          rfToLevel(0),
          greedy(true),
          scale(std::numeric_limits<double>::infinity()),
          verbose(false)
    {}
    ~GBMParameters() {}

    vector<string> featuresToRun;       // X's for this run

    int tryMVariables;                  // default -1
    int growKDecisionTreeNodes;         // default 5

    double bagFraction;                 // default 1.0
    double shrinkageFactor;             // default 0.01
    int numIterations;                  // default 100

    int minObservations;                // default 10
    vector<int> trainingExperimentIds;

    GBMDistribution distribution;       // default BERNOULLI

    int rfToLevel;                      // default 0
    bool greedy;                        // default true

    double scale;                       // default +infinity

    bool verbose;                       // default false
};

#endif // GBMParameters_h__
@@ -0,0 +1,35 @@
1
+ #ifndef __GBM_h__
2
+ #define __GBM_h__
3
+
4
+ #include "MachineLearning/MLRunner.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class MLData;
12
+ class MLExperiment;
13
+ class FeatureInteraction;
14
+ class GBMParameters;
15
+
16
+ class GBMRunner : public MLRunner
17
+ {
18
+ public:
19
+ GBMRunner();
20
+ ~GBMRunner();
21
+
22
+ void estimateMore(int numTrees);
23
+ void capTrees(int numTrees);
24
+
25
+ vector<FeatureInteraction> getFeatureInteractions(int howMany);
26
+
27
+ shared_ptr<MLEstimator> createEstimator(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments);
28
+
29
+ shared_ptr<GBMParameters> parameters;
30
+
31
+ protected:
32
+ void config();
33
+ };
34
+
35
+ #endif // GBM_h__
@@ -0,0 +1,29 @@
1
+ #ifndef GaussianCalculator_h__
2
+ #define GaussianCalculator_h__
3
+
4
+ #include "MachineLearning/GBM/GBMCalculator.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class GaussianCalculator : public GBMCalculator
12
+ {
13
+ public:
14
+ GaussianCalculator();
15
+ ~GaussianCalculator();
16
+
17
+ double calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
18
+ void populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions);
19
+ void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
20
+ double computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
21
+ void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
22
+ double calculatePrediction(double f);
23
+ double calculateF(double prediction);
24
+ protected:
25
+
26
+ private:
27
+ };
28
+
29
+ #endif // GaussianCalculator_h__
@@ -0,0 +1,27 @@
1
+ // #ifndef ZenithGBM_h__
2
+ // #define ZenithGBM_h__
3
+
4
+ // #include "stdafx.h"
5
+
6
+ // // ruby interface methods
7
+ // void zenith_gbm_Free(void* v);
8
+ // OtInterface::VALUE zenith_gbm_New(int argc, VALUE* argv, VALUE klass);
9
+ // OtInterface::VALUE zenith_gbm_Initialize(VALUE self);
10
+ // OtInterface::VALUE zenith_gbm_estimate(VALUE self);
11
+ // OtInterface::VALUE zenith_gbm_estimateMore(VALUE self, VALUE numTrees);
12
+ // OtInterface::VALUE zenith_gbm_setFeaturesToRun(VALUE self, VALUE featuresValue);
13
+ // OtInterface::VALUE zenith_gbm_setData(VALUE self, VALUE data);
14
+ // OtInterface::VALUE zenith_gbm_setDistribution(VALUE self, VALUE distribution);
15
+ // OtInterface::VALUE zenith_gbm_setTryMVariables(VALUE self, VALUE mVariablesValue);
16
+ // OtInterface::VALUE zenith_gbm_setKTerminalNodes(VALUE self, VALUE kNodesValue);
17
+ // OtInterface::VALUE zenith_gbm_setNumIterations(VALUE self, VALUE numIterationsValue);
18
+ // OtInterface::VALUE zenith_gbm_setShrinkageFactor(VALUE self, VALUE shrinkageFactorValue);
19
+ // OtInterface::VALUE zenith_gbm_setBagFraction(VALUE self, VALUE bagFractionValue);
20
+ // OtInterface::VALUE zenith_gbm_setTrainingExperimentIds(VALUE self, VALUE experimentIdsValue);
21
+ // OtInterface::VALUE zenith_gbm_minObservations(VALUE self, VALUE minObservations);
22
+ // OtInterface::VALUE zenith_gbm_verbose(VALUE self, VALUE verbose);
23
+ // OtInterface::VALUE zenith_gbm_predictions(VALUE self, VALUE newMlData);
24
+ // OtInterface::VALUE zenith_gbm_training_predictions(VALUE self);
25
+ // OtInterface::VALUE zenith_gbm_crossvalidation_predictions(VALUE self);
26
+
27
+ // #endif // ZenithGBM_h__
@@ -0,0 +1,77 @@
1
+ #ifndef __MLData_h__
2
+ #define __MLData_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ #include <set>
7
+ #include <map>
8
+ #include <utility>
9
+ #include <string>
10
+ using std::string;
11
+ using std::pair;
12
+ using std::set;
13
+ using std::map;
14
+ using std::vector;
15
+ using boost::shared_ptr;
16
+
17
+ class MLExperiment;
18
+
19
+ class MLData
20
+ {
21
+ public:
22
+ MLData();
23
+ ~MLData();
24
+
25
+ void setExperiments(vector<shared_ptr<MLExperiment> > experiments);
26
+ vector<shared_ptr<MLExperiment> >& getExperiments();
27
+ shared_ptr<MLExperiment> getExperiment(int experimentIndex);
28
+
29
+ void createFolds(int numFolds, int randomSeed);
30
+ void setFolds(vector<int> folds);
31
+
32
+ void setTrainingExperimentIds(vector<int>& experimentIds);
33
+ void setTrainingExperiments(vector<shared_ptr<MLExperiment> > experiments);
34
+
35
+ vector<int>& getFoldNumbers();
36
+ vector<int> getFolds();
37
+
38
+ vector<shared_ptr<MLExperiment> >& getTrainingExperiments(int fold);
39
+ vector<shared_ptr<MLExperiment> >& getCrossValidationExperiments(int fold);
40
+
41
+ shared_ptr<MLExperiment> getExperimentWithId(int experimentId);
42
+ // vector<shared_ptr<MLExperiment> >& getExperimentsSortedOnFeature(int featureIndex);
43
+
44
+ vector<string>& getFeatures();
45
+ void setFeatures(vector<string> features);
46
+ int getFeatureIndex(string& feature);
47
+
48
+ void constructCategories(vector<string> categoricalFeatures);
49
+ set<int>& getCategoricalFeatureIndices();
50
+
51
+ void setInitialPredictions(vector<double> initialPredictions);
52
+ void setInitialPredictionsDefined(bool defined);
53
+ bool initialPredictionsDefined();
54
+
55
+ int getNumFolds();
56
+
57
+ void setMissingValue(double missingValue);
58
+ bool missingValueDefined();
59
+ double getMissingValue();
60
+ protected:
61
+
62
+ vector<shared_ptr<MLExperiment> > m_experiments;
63
+ map<int, vector<shared_ptr<MLExperiment> > > m_trainingExperiments;
64
+ map<int, vector<shared_ptr<MLExperiment> > > m_cvExperiments;
65
+
66
+ map<int, shared_ptr<MLExperiment> > m_experimentsById;
67
+ vector<string> m_featureNames;
68
+ map<string, int> m_featureIndices;
69
+ set<int> m_categoricalFeatureIndices;
70
+ bool m_initialPredictionsDefined;
71
+ vector<int> m_foldNumbers;
72
+ double m_missingValue;
73
+ bool m_missingValueDefined;
74
+ // map<int, vector<shared_ptr<MLExperiment> > > m_experimentsSortedByFeature;
75
+ };
76
+
77
+ #endif // MLData_h__