ml4r 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
  9. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
  10. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
  11. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
  13. data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
  14. data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
  15. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
  16. data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
  17. data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
  18. data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
  19. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
  20. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
  21. data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
  22. data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
  23. data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
  24. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
  25. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
  26. data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
  27. data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
  28. data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
  29. data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
  30. data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
  31. data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
  32. data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
  33. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
  34. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
  35. data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
  36. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
  37. data/ext/ml4r/extconf.rb +16 -3
  38. data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
  39. data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
  40. data/ext/ml4r/utils/StochasticUtils.h +33 -0
  41. data/ext/ml4r/utils/Utils.h +147 -0
  42. data/ext/ml4r/utils/VlcMessage.h +44 -0
  43. data/lib/ml4r/linear_regression.rb +7 -0
  44. metadata +45 -13
  45. data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
  46. data/ext/ml4r/OLSLinearRegression.h +0 -23
  47. data/ext/ml4r/Utils.h +0 -53
  48. data/ext/ml4r/example.h +0 -18
  49. data/ext/ml4r/swig/example.h +0 -13
  50. data/ext/ml4r/swig/example_wrap.c +0 -2093
  51. data/ext/ml4r/utils/RubyUtils.h +0 -174
@@ -0,0 +1,29 @@
1
+ #ifndef BernoulliCalculator_h__
2
+ #define BernoulliCalculator_h__
3
+
4
+ #include "MachineLearning/GBM/GBMCalculator.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class BernoulliCalculator : public GBMCalculator
12
+ {
13
+ public:
14
+ BernoulliCalculator();
15
+ ~BernoulliCalculator();
16
+
17
+ double calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
18
+ void populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions);
19
+ void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
20
+ double computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
21
+ void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
22
+ double calculatePrediction(double f);
23
+ double calculateF(double prediction);
24
+ protected:
25
+
26
+ private:
27
+ };
28
+
29
+ #endif // BernoulliCalculator_h__
@@ -0,0 +1,50 @@
1
+ #ifndef __GBM_h__
2
+ #define __GBM_h__
3
+
4
+ #include "MachineLearning/GBM/GBMParameters.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class MLData;
12
+ class DecisionTreeExperiment;
13
+ class MLExperiment;
14
+ class MLDataFields;
15
+ class DecisionTreeNode;
16
+ class SplitDefinition;
17
+ class GBMOutput;
18
+ class GBMCalculator;
19
+
20
+ class GBM
21
+ {
22
+ public:
23
+ GBM();
24
+ ~GBM();
25
+ void estimate();
26
+ void estimateMore(int numTrees);
27
+
28
+ void setData(MLData* mlData);
29
+ vector<double> getPredictions(MLData* newData);
30
+ vector<double> getPredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
31
+
32
+ vector<double> getMeanTrainingPredictions();
33
+ vector<double> getCrossValidationPredictions();
34
+
35
+ GBMParameters parameters;
36
+ protected:
37
+ MLData* m_data;
38
+ vector<shared_ptr<GBMOutput> > m_outputObjects;
39
+ shared_ptr<GBMCalculator> m_gbmCalculator;
40
+
41
+ void config();
42
+ void input();
43
+ void goNuts();
44
+ void output();
45
+ vector<shared_ptr<DecisionTreeExperiment> > makeDecisionTreeExperiments(vector<shared_ptr<MLExperiment> >& experiments);
46
+
47
+
48
+ };
49
+
50
+ #endif // GBM_h__
@@ -0,0 +1,31 @@
1
+ #ifndef GBMCalculator_h__
2
+ #define GBMCalculator_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ using std::vector;
7
+ using boost::shared_ptr;
8
+
9
+ class DecisionTreeExperiment;
10
+
11
+ class GBMCalculator
12
+ {
13
+ public:
14
+ GBMCalculator() {};
15
+ ~GBMCalculator() {};
16
+
17
+ virtual double calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
18
+ virtual void populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions) = 0;
19
+ virtual void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
20
+ virtual double computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
21
+ virtual void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments) = 0;
22
+ virtual double calculatePrediction(double f) = 0;
23
+ virtual double calculateF(double prediction) = 0;
24
+ protected:
25
+
26
+ private:
27
+ };
28
+
29
+ #endif // GBMCalculator_h__
30
+
31
+
File without changes
@@ -0,0 +1,79 @@
1
+ #ifndef GBMEstimator_h__
2
+ #define GBMEstimator_h__
3
+
4
+ #include "MachineLearning/DecisionTree/FeatureInteraction.h"
5
+ #include "MachineLearning/MLEstimator.h"
6
+
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <vector>
9
+ #include <map>
10
+ #include <set>
11
+ #include <utility>
12
+ using std::pair;
13
+ using std::set;
14
+ using std::map;
15
+ using std::vector;
16
+ using boost::shared_ptr;
17
+
18
+
19
+ class GBMParameters;
20
+ class GBMOutput;
21
+ class DecisionTreeExperiment;
22
+ class DecisionTreeNode;
23
+ class SplitDefinition;
24
+ class MLData;
25
+ class GBMCalculator;
26
+ class MLExperiment;
27
+
28
+ class GBMEstimator : public MLEstimator
29
+ {
30
+ public:
31
+ GBMEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<GBMParameters> parameters);
32
+ ~GBMEstimator();
33
+
34
+ shared_ptr<MLOutput> estimate();
35
+ void estimateMore(int numTrees);
36
+ vector<FeatureInteraction> findInteractions(int howMany);
37
+ protected:
38
+ void initializeEstimator();
39
+ void performIteration();
40
+ void constructFeatureIndices();
41
+ void initialiseGBMExperimentData();
42
+ void populateInitialF();
43
+ void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
44
+ void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
45
+
46
+ void sortTrainingExperiments();
47
+
48
+ void constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
49
+ void constructGenerousDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments, int rfToLevel);
50
+
51
+
52
+
53
+ // map<int, vector<shared_ptr<DecisionTreeExperiment> > > partitionSortedExperiments(shared_ptr<SplitDefinition> splitDefinition, Partition partition);
54
+ map<int, vector<shared_ptr<DecisionTreeExperiment> > > bagSortedExperiments(vector<shared_ptr<DecisionTreeExperiment> >& baggedExperiments);
55
+
56
+ vector<int> getRandomFeatureList();
57
+
58
+ void calculateFIncrementPerDecisionTreeNode();
59
+ void applyFIncrementToInBagExperiments();
60
+ pair<vector<shared_ptr<DecisionTreeExperiment> >, vector<shared_ptr<DecisionTreeExperiment> > > bagExperiments();
61
+ void applyFIncrementToExperiments(vector<shared_ptr<DecisionTreeExperiment> >& outOfBagExperiments);
62
+ void reportDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
63
+ void deleteRedundantData();
64
+
65
+
66
+ shared_ptr<GBMOutput> m_output;
67
+ vector<shared_ptr<DecisionTreeExperiment> > m_decisionTreeExperiments;
68
+ shared_ptr<GBMParameters> m_parameters;
69
+ vector<int> m_featureIndices;
70
+ shared_ptr<DecisionTreeNode> m_decisionTreeHead;
71
+ set<shared_ptr<DecisionTreeNode> > m_terminalNodes;
72
+ map<shared_ptr<DecisionTreeNode>, double> m_FIncrements;
73
+ shared_ptr<GBMCalculator> m_gbmCalculator;
74
+ bool m_missingValueDefined;
75
+ double m_missingValue;
76
+ private:
77
+ };
78
+
79
+ #endif // GBMEstimator_h__
@@ -0,0 +1,53 @@
1
+ #ifndef GBMOutput_h__
2
+ #define GBMOutput_h__
3
+
4
+ #include "MachineLearning/GBM/GBMParameters.h"
5
+ #include "MachineLearning/MLOutput.h"
6
+
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <vector>
9
+ #include <map>
10
+ using std::map;
11
+ using std::vector;
12
+ using boost::shared_ptr;
13
+
14
+ class DecisionTreeNode;
15
+ class DecisionTreeExperiment;
16
+ class MLData;
17
+ class GBMCalculator;
18
+
19
+ class GBMOutput : public MLOutput
20
+ {
21
+ public:
22
+ GBMOutput(MLData* trainingData, vector<int> trainingExperimentIndicies,shared_ptr<GBMParameters> parameters);
23
+ ~GBMOutput();
24
+
25
+ // recording inputs of GBM estimation
26
+ shared_ptr<GBMParameters> getParameters();
27
+
28
+ // recording outputs of GBM estimation
29
+ void setMeanY(double y);
30
+ void addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node);
31
+ void addFIncrements(map<shared_ptr<DecisionTreeNode>, double> m_FIncrements);
32
+
33
+ // applying model to new data
34
+ double predictForExperiment(shared_ptr<MLExperiment> experiment);
35
+ void setPredictionForDecisionTreeExperiment(shared_ptr<DecisionTreeExperiment> experiment);
36
+
37
+ // stats
38
+ int getNumTrees();
39
+
40
+ void capTrees(int numTrees);
41
+
42
+ protected:
43
+
44
+ vector<shared_ptr<DecisionTreeNode> > m_headNodes;
45
+ vector<map<shared_ptr<DecisionTreeNode>, double> > m_fIncrements;
46
+ double m_meanY;
47
+ bool m_useMeanY;
48
+ shared_ptr<GBMParameters> m_parameters;
49
+ shared_ptr<GBMCalculator> m_gbmCalculator;
50
+ private:
51
+ };
52
+
53
+ #endif // GBMOutput_h__
@@ -0,0 +1,50 @@
1
+ #ifndef __GBMParameters_h__
2
+ #define __GBMParameters_h__
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ #include <limits>
7
+ using std::vector;
8
+ using std::string;
9
+
10
/** Distribution options selectable through GBMParameters::distribution. */
enum GBMDistribution {
    BERNOULLI,
    GAUSSIAN
};

/**
 * Plain settings holder for a GBM run. Every field is public and is meant to
 * be assigned directly by the caller.
 *
 * Defaults set by the constructor:
 *   tryMVariables = -1, growKDecisionTreeNodes = 5, bagFraction = 1.0,
 *   shrinkageFactor = 0.01, numIterations = 100, minObservations = 10,
 *   distribution = BERNOULLI, rfToLevel = 0, greedy = true,
 *   scale = +infinity, verbose = false.
 * featuresToRun and trainingExperimentIds start empty.
 */
class GBMParameters
{
public:
    // The initializer list is written in member-declaration order. C++ always
    // initializes members in declaration order regardless of how the list is
    // written, so keeping the two in sync avoids -Wreorder warnings and
    // prevents the list from misleading readers.
    GBMParameters()
        : tryMVariables(-1),
          growKDecisionTreeNodes(5),
          bagFraction(1.0),
          shrinkageFactor(0.01),
          numIterations(100),
          minObservations(10),
          distribution(BERNOULLI),
          rfToLevel(0),
          greedy(true),
          scale(std::numeric_limits<double>::infinity()),
          verbose(false)
    {}
    ~GBMParameters() {}

    // parameters will be public
    std::vector<std::string> featuresToRun; // X's for this run

    int tryMVariables;          // NOTE(review): meaning of the -1 default is not visible here
    int growKDecisionTreeNodes;

    double bagFraction;
    double shrinkageFactor;
    int    numIterations;

    int              minObservations;
    std::vector<int> trainingExperimentIds;

    GBMDistribution distribution;

    int  rfToLevel;
    bool greedy;

    double scale;

    bool verbose;
};
49
+
50
+ #endif // GBMParameters_h__
@@ -0,0 +1,35 @@
1
+ #ifndef __GBM_h__
2
+ #define __GBM_h__
3
+
4
+ #include "MachineLearning/MLRunner.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class MLData;
12
+ class MLExperiment;
13
+ class FeatureInteraction;
14
+ class GBMParameters;
15
+
16
+ class GBMRunner : public MLRunner
17
+ {
18
+ public:
19
+ GBMRunner();
20
+ ~GBMRunner();
21
+
22
+ void estimateMore(int numTrees);
23
+ void capTrees(int numTrees);
24
+
25
+ vector<FeatureInteraction> getFeatureInteractions(int howMany);
26
+
27
+ shared_ptr<MLEstimator> createEstimator(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments);
28
+
29
+ shared_ptr<GBMParameters> parameters;
30
+
31
+ protected:
32
+ void config();
33
+ };
34
+
35
+ #endif // GBM_h__
@@ -0,0 +1,29 @@
1
+ #ifndef GaussianCalculator_h__
2
+ #define GaussianCalculator_h__
3
+
4
+ #include "MachineLearning/GBM/GBMCalculator.h"
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+ #include <vector>
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class GaussianCalculator : public GBMCalculator
12
+ {
13
+ public:
14
+ GaussianCalculator();
15
+ ~GaussianCalculator();
16
+
17
+ double calculateDeviance(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
18
+ void populateInitialF(vector<shared_ptr<DecisionTreeExperiment> >& experiments, bool useInitialPredictions);
19
+ void updateZ(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
20
+ double computeFIncrement(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
21
+ void updatePredictions(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
22
+ double calculatePrediction(double f);
23
+ double calculateF(double prediction);
24
+ protected:
25
+
26
+ private:
27
+ };
28
+
29
+ #endif // GaussianCalculator_h__
@@ -0,0 +1,27 @@
1
+ // #ifndef ZenithGBM_h__
2
+ // #define ZenithGBM_h__
3
+
4
+ // #include "stdafx.h"
5
+
6
+ // // ruby interface methods
7
+ // void zenith_gbm_Free(void* v);
8
+ // OtInterface::VALUE zenith_gbm_New(int argc, VALUE* argv, VALUE klass);
9
+ // OtInterface::VALUE zenith_gbm_Initialize(VALUE self);
10
+ // OtInterface::VALUE zenith_gbm_estimate(VALUE self);
11
+ // OtInterface::VALUE zenith_gbm_estimateMore(VALUE self, VALUE numTrees);
12
+ // OtInterface::VALUE zenith_gbm_setFeaturesToRun(VALUE self, VALUE featuresValue);
13
+ // OtInterface::VALUE zenith_gbm_setData(VALUE self, VALUE data);
14
+ // OtInterface::VALUE zenith_gbm_setDistribution(VALUE self, VALUE distribution);
15
+ // OtInterface::VALUE zenith_gbm_setTryMVariables(VALUE self, VALUE mVariablesValue);
16
+ // OtInterface::VALUE zenith_gbm_setKTerminalNodes(VALUE self, VALUE kNodesValue);
17
+ // OtInterface::VALUE zenith_gbm_setNumIterations(VALUE self, VALUE numIterationsValue);
18
+ // OtInterface::VALUE zenith_gbm_setShrinkageFactor(VALUE self, VALUE shrinkageFactorValue);
19
+ // OtInterface::VALUE zenith_gbm_setBagFraction(VALUE self, VALUE bagFractionValue);
20
+ // OtInterface::VALUE zenith_gbm_setTrainingExperimentIds(VALUE self, VALUE experimentIdsValue);
21
+ // OtInterface::VALUE zenith_gbm_minObservations(VALUE self, VALUE minObservations);
22
+ // OtInterface::VALUE zenith_gbm_verbose(VALUE self, VALUE verbose);
23
+ // OtInterface::VALUE zenith_gbm_predictions(VALUE self, VALUE newMlData);
24
+ // OtInterface::VALUE zenith_gbm_training_predictions(VALUE self);
25
+ // OtInterface::VALUE zenith_gbm_crossvalidation_predictions(VALUE self);
26
+
27
+ // #endif // ZenithGBM_h__
@@ -0,0 +1,77 @@
1
+ #ifndef __MLData_h__
2
+ #define __MLData_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ #include <set>
7
+ #include <map>
8
+ #include <utility>
9
+ #include <string>
10
+ using std::string;
11
+ using std::pair;
12
+ using std::set;
13
+ using std::map;
14
+ using std::vector;
15
+ using boost::shared_ptr;
16
+
17
+ class MLExperiment;
18
+
19
+ class MLData
20
+ {
21
+ public:
22
+ MLData();
23
+ ~MLData();
24
+
25
+ void setExperiments(vector<shared_ptr<MLExperiment> > experiments);
26
+ vector<shared_ptr<MLExperiment> >& getExperiments();
27
+ shared_ptr<MLExperiment> getExperiment(int experimentIndex);
28
+
29
+ void createFolds(int numFolds, int randomSeed);
30
+ void setFolds(vector<int> folds);
31
+
32
+ void setTrainingExperimentIds(vector<int>& experimentIds);
33
+ void setTrainingExperiments(vector<shared_ptr<MLExperiment> > experiments);
34
+
35
+ vector<int>& getFoldNumbers();
36
+ vector<int> getFolds();
37
+
38
+ vector<shared_ptr<MLExperiment> >& getTrainingExperiments(int fold);
39
+ vector<shared_ptr<MLExperiment> >& getCrossValidationExperiments(int fold);
40
+
41
+ shared_ptr<MLExperiment> getExperimentWithId(int experimentId);
42
+ // vector<shared_ptr<MLExperiment> >& getExperimentsSortedOnFeature(int featureIndex);
43
+
44
+ vector<string>& getFeatures();
45
+ void setFeatures(vector<string> features);
46
+ int getFeatureIndex(string& feature);
47
+
48
+ void constructCategories(vector<string> categoricalFeatures);
49
+ set<int>& getCategoricalFeatureIndices();
50
+
51
+ void setInitialPredictions(vector<double> initialPredictions);
52
+ void setInitialPredictionsDefined(bool defined);
53
+ bool initialPredictionsDefined();
54
+
55
+ int getNumFolds();
56
+
57
+ void setMissingValue(double missingValue);
58
+ bool missingValueDefined();
59
+ double getMissingValue();
60
+ protected:
61
+
62
+ vector<shared_ptr<MLExperiment> > m_experiments;
63
+ map<int, vector<shared_ptr<MLExperiment> > > m_trainingExperiments;
64
+ map<int, vector<shared_ptr<MLExperiment> > > m_cvExperiments;
65
+
66
+ map<int, shared_ptr<MLExperiment> > m_experimentsById;
67
+ vector<string> m_featureNames;
68
+ map<string, int> m_featureIndices;
69
+ set<int> m_categoricalFeatureIndices;
70
+ bool m_initialPredictionsDefined;
71
+ vector<int> m_foldNumbers;
72
+ double m_missingValue;
73
+ bool m_missingValueDefined;
74
+ // map<int, vector<shared_ptr<MLExperiment> > > m_experimentsSortedByFeature;
75
+ };
76
+
77
+ #endif // MLData_h__