ml4r 0.1.2 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
  9. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
  10. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
  11. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
  13. data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
  14. data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
  15. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
  16. data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
  17. data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
  18. data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
  19. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
  20. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
  21. data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
  22. data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
  23. data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
  24. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
  25. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
  26. data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
  27. data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
  28. data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
  29. data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
  30. data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
  31. data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
  32. data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
  33. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
  34. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
  35. data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
  36. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
  37. data/ext/ml4r/extconf.rb +16 -3
  38. data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
  39. data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
  40. data/ext/ml4r/utils/StochasticUtils.h +33 -0
  41. data/ext/ml4r/utils/Utils.h +147 -0
  42. data/ext/ml4r/utils/VlcMessage.h +44 -0
  43. data/lib/ml4r/linear_regression.rb +7 -0
  44. metadata +45 -13
  45. data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
  46. data/ext/ml4r/OLSLinearRegression.h +0 -23
  47. data/ext/ml4r/Utils.h +0 -53
  48. data/ext/ml4r/example.h +0 -18
  49. data/ext/ml4r/swig/example.h +0 -13
  50. data/ext/ml4r/swig/example_wrap.c +0 -2093
  51. data/ext/ml4r/utils/RubyUtils.h +0 -174
@@ -0,0 +1,25 @@
1
+ #ifndef __MLDataFields_h__
2
+ #define __MLDataFields_h__
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ using std::vector;
7
+ using std::string;
8
+
9
// Bundle of field names describing one ML data set.
//
// A plain holder: every member is public and starts out empty (the
// constructor initialises nothing explicitly, so the strings and the vector
// are default-constructed).
// NOTE(review): presumably these are column names consumed by MLDataReader
// when it builds its query -- confirm against the reader implementation.
class MLDataFields
{
public:
    MLDataFields() {}
    ~MLDataFields() {}

    // Name of the field carrying the experiment id.
    string experimentIdField;
    // Name of the field carrying the per-experiment weight.
    string weightsField;
    // Name of the field carrying the actual (observed) Y value.
    string actualYField;
    // Name of the field carrying the initial predictions.
    string initialPredictionsField;

    // Names of the feature fields, in the order they were added.
    vector<string> featuresFields;
};
24
+
25
+ #endif // MLDataFields_h__
@@ -0,0 +1,37 @@
1
+ #ifndef __MLDataReader_h__
2
+ #define __MLDataReader_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ #include <string>
7
+ using std::string;
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class MLData;
12
+ class MLDataFields;
13
+
14
+ class MLDataReader
15
+ {
16
+ public:
17
+ MLDataReader();
18
+ ~MLDataReader();
19
+
20
+ void execute(MLData* mlData);
21
+
22
+ string databaseName;
23
+ string tableName;
24
+
25
+ shared_ptr<MLDataFields> fieldsSpec;
26
+ vector<string> categoricalFeatures;
27
+ double missingValue;
28
+ bool missingValueDefined;
29
+ void reportOnData(MLData* data, shared_ptr<MLDataFields> fieldsSpec);
30
+
31
+ protected:
32
+
33
+ string getSelectSql();
34
+
35
+ };
36
+
37
+ #endif // MLDataReader_h__
@@ -0,0 +1,13 @@
1
+ // #ifndef ZenithMLData_h__
2
+ // #define ZenithMLData_h__
3
+
4
+ // #include "stdafx.h"
5
+
6
+
7
+ // // ruby interface methods
8
+ // void zenith_mldata_Free(void* v);
9
+ // OtInterface::VALUE zenith_mldata_New(int argc, VALUE* argv, VALUE klass);
10
+ // OtInterface::VALUE zenith_mldata_Initialize(VALUE self);
11
+ // OtInterface::VALUE zenith_mldata_createFolds(VALUE self, VALUE numFolds, VALUE randomSeed);
12
+ // OtInterface::VALUE zenith_mldata_getResponse(VALUE self);
13
+ // #endif // ZenithMLData_h__
@@ -0,0 +1,20 @@
1
+ // #ifndef ZenithMLDataReader_h__
2
+ // #define ZenithMLDataReader_h__
3
+
4
+ // #include "stdafx.h"
5
+
6
+ // // ruby interface methods
7
+ // void zenith_mldatareader_Free(void* v);
8
+ // OtInterface::VALUE zenith_mldatareader_New(int argc, VALUE* argv, VALUE klass);
9
+ // OtInterface::VALUE zenith_mldatareader_Initialize(VALUE self);
10
+ // OtInterface::VALUE zenith_mldatareader_execute(VALUE self);
11
+ // OtInterface::VALUE zenith_mldatareader_setFeaturesToLoad(VALUE self, VALUE featuresValue);
12
+ // OtInterface::VALUE zenith_mldatareader_setCategoricalFeatures(VALUE self, VALUE categoricalFeaturesValue);
13
+ // OtInterface::VALUE zenith_mldatareader_setDatabaseName(VALUE self, VALUE databaseNameValue);
14
+ // OtInterface::VALUE zenith_mldatareader_setTableName(VALUE self, VALUE tableNameValue);
15
+ // OtInterface::VALUE zenith_mldatareader_setActualYField(VALUE self, VALUE yFieldValue);
16
+ // OtInterface::VALUE zenith_mldatareader_setExperimentIdField(VALUE self, VALUE experimentIdFieldValue);
17
+ // OtInterface::VALUE zenith_mldatareader_setWeightsField(VALUE self, VALUE weightsFieldValue);
18
+ // OtInterface::VALUE zenith_mldatareader_setInitialPredictionsField(VALUE self, VALUE initialEstimatesFieldValue);
19
+ // OtInterface::VALUE zenith_mldatareader_setMissingValue(VALUE self, VALUE missingValue);
20
+ // #endif // ZenithMLDataReader_h__
@@ -0,0 +1,30 @@
1
+ #ifndef MLEstimator_h__
2
+ #define MLEstimator_h__
3
+
4
+ class MLOutput;
5
+ class MLData;
6
+ class MLExperiment;
7
+
8
+ #include <vector>
9
+ #include <boost/shared_ptr.hpp>
10
+ using boost::shared_ptr;
11
+ using std::vector;
12
+
13
+ // pure virtual class
14
+ class MLEstimator
15
+ {
16
+ public:
17
+ MLEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments)
18
+ : m_data(data), m_trainingExperiments(experiments)
19
+ {};
20
+ ~MLEstimator() {};
21
+
22
+ virtual shared_ptr<MLOutput> estimate() = 0;
23
+ protected:
24
+ MLData* m_data;
25
+ vector<shared_ptr<MLExperiment> > m_trainingExperiments;
26
+
27
+ private:
28
+ };
29
+
30
+ #endif // MLEstimator_h__
@@ -0,0 +1,25 @@
1
+ #ifndef MLEstimatorFactory_h__
2
+ #define MLEstimatorFactory_h__
3
+
4
+ class MLExperiment;
5
+ class MLData;
6
+ class MLParameters;
7
+ class MLEstimator;
8
+
9
+ #include <boost/shared_ptr.hpp>
10
+ #include <vector>
11
+ using std::vector;
12
+ using boost::shared_ptr;
13
+
14
+ class MLEstimatorFactory
15
+ {
16
+ public:
17
+ MLEstimatorFactory() {};
18
+ ~MLEstimatorFactory() {};
19
+
20
+ virtual shared_ptr<MLEstimator> create(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments) = 0;
21
+ protected:
22
+
23
+ private:
24
+ };
25
+ #endif // MLEstimatorFactory_h__
@@ -0,0 +1,41 @@
1
+ #ifndef __MLExperiment_h__
2
+ #define __MLExperiment_h__
3
+
4
+ #include <vector>
5
+ #include <map>
6
+ #include <boost/shared_ptr.hpp>
7
+ using boost::shared_ptr;
8
+ using std::map;
9
+ using std::vector;
10
+
11
+ class MLExperiment
12
+ {
13
+ public:
14
+ MLExperiment();
15
+ MLExperiment(int experimentId, int experimentIndex, double y, double initialPrediction,
16
+ double weight, vector<double> features);
17
+
18
+ MLExperiment(shared_ptr<MLExperiment> experiment);
19
+
20
+ ~MLExperiment();
21
+
22
+ double getY();
23
+ int getExperimentId();
24
+ int getExperimentIndex();
25
+ double getPrediction();
26
+ double getWeight();
27
+ vector<double>& getFeatureValues();
28
+ double getFeatureValue(int featureIndex);
29
+ void setPrediction(double prediction);
30
+
31
+ protected:
32
+ double m_yValue;
33
+ int m_experimentId;
34
+ int m_experimentIndex;
35
+ double m_prediction;
36
+ double m_weight;
37
+ vector<double> m_features;
38
+
39
+ };
40
+
41
+ #endif // MLExperiment_h__
@@ -0,0 +1,45 @@
1
+ #ifndef MLOutput_h__
2
+ #define MLOutput_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ #include <stdexcept>
7
+ using std::runtime_error;
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ #include "utils/Utils.h"
12
+
13
+ class MLData;
14
+ class MLExperiment;
15
+ class GBMParameters;
16
+
17
+ // pure virtual class
18
+
19
+ class MLOutput
20
+ {
21
+ public:
22
+ MLOutput(MLData* data, vector<int> trainingExperimentIndicies)
23
+ : m_trainingData(data), m_trainingExperimentIndicies(trainingExperimentIndicies)
24
+ {};
25
+ ~MLOutput() {};
26
+
27
+ virtual double predictForExperiment(shared_ptr<MLExperiment> experiment) = 0;
28
+ virtual double calculateAveragePredictions(vector<double> predictions)
29
+ {
30
+ if (predictions.size() == 0)
31
+ throw std::runtime_error("[MLOutput::calculateAveragePredictions] Cannot calculate an average on an empty vector!");
32
+
33
+ return Utils::vectorSum<double>(predictions) / predictions.size();
34
+ };
35
+
36
+ vector<int> getTrainingExperimentIndicies() { return m_trainingExperimentIndicies; };
37
+
38
+ MLData* getTrainingData() { return m_trainingData; }
39
+ protected:
40
+ MLData* m_trainingData;
41
+ vector<int> m_trainingExperimentIndicies;
42
+
43
+ private:
44
+ };
45
+ #endif // MLOutput_h__
@@ -0,0 +1,16 @@
1
+ #ifndef MLParameters_h__
2
+ #define MLParameters_h__
3
+
4
// Parameters common to the machine-learning runners.
//
// Currently a single switch, `verbose`, which starts out false.
class MLParameters
{
public:
    MLParameters()
        : verbose(false)
    {}

    ~MLParameters() {}

    // Verbosity switch; false unless the caller sets it.
    bool verbose;
};
15
+
16
+ #endif // MLParameters_h__
@@ -0,0 +1,47 @@
1
+ #ifndef MLRunner_h__
2
+ #define MLRunner_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ using std::vector;
7
+ using boost::shared_ptr;
8
+
9
+ class MLData;
10
+ class MLExperiment;
11
+ class MLParameters;
12
+ class MLOutput;
13
+ class MLEstimator;
14
+
15
+ class MLRunner
16
+ {
17
+ public:
18
+ MLRunner();
19
+ ~MLRunner();
20
+
21
+ void execute();
22
+
23
+ void setData(MLData* data);
24
+ MLData* getData();
25
+
26
+ vector<double> getPredictions(MLData* newData);
27
+ vector<double> getMeanTrainingPredictions();
28
+ vector<double> getCrossValidationPredictions();
29
+
30
+ virtual shared_ptr<MLEstimator> createEstimator(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments) = 0;
31
+
32
+ protected:
33
+ virtual void checks();
34
+ virtual void config();
35
+ virtual void input();
36
+ virtual void estimate();
37
+ virtual void output();
38
+
39
+ vector<double> getPredictions(vector<shared_ptr<MLExperiment> > experiments);
40
+
41
+ MLData* m_data;
42
+ vector<shared_ptr<MLOutput> > m_outputObjects;
43
+ vector<shared_ptr<MLEstimator> > m_estimators;
44
+
45
+ private:
46
+ };
47
+ #endif // MLRunner_h__
@@ -0,0 +1,75 @@
1
+ #ifndef __MLUtils_h__
2
+ #define __MLUtils_h__
3
+
4
+ #include "MachineLearning/MLExperiment.h"
5
+
6
+ #include <vector>
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <utility>
9
+ using std::pair;
10
+ using boost::shared_ptr;
11
+ using std::vector;
12
+
13
+ namespace MLUtils
14
+ {
15
+ double getMeanY(vector<shared_ptr<MLExperiment> > experiments);
16
+
17
+ template <class T>
18
+ vector<T> bagObjectsWithReplacement(vector<T> experiments, int bagSize);
19
+
20
+ template <class T>
21
+ pair<vector<T>,vector<T> > bagObjectsWithoutReplacement(vector<T> experiments, int bagSize);
22
+ };
23
+
24
+
25
+ template <class T>
26
+ vector<T> MLUtils::bagObjectsWithReplacement(vector<T> objects, int bagSize)
27
+ {
28
+ vector<T> inBagObjects;
29
+
30
+ int numTrainingExperiments = (int) objects.size();
31
+ int numBagged = 0;
32
+
33
+ inBagObjects.reserve(bagSize);
34
+
35
+ for (int i = 0; i < bagSize; ++i)
36
+ {
37
+ double unit_rand = rand() * 1.0 / (RAND_MAX + 1.0);
38
+ long index = unit_rand * objects.size();
39
+ inBagObjects.push_back(objects.at(index));
40
+ }
41
+ return inBagObjects;
42
+ }
43
+
44
+ template <class T>
45
+ pair<vector<T>,vector<T> >
46
+ MLUtils::bagObjectsWithoutReplacement(vector<T> objects, int bagSize)
47
+ {
48
+ vector<T> inBagObjects;
49
+ vector<T> outOfBagObjects;
50
+
51
+ // int numTotalExperiments = allExperiments.size();
52
+ long numObjects = objects.size();
53
+ int numBagged = 0;
54
+
55
+ inBagObjects.reserve(bagSize);
56
+ outOfBagObjects.reserve(numObjects - bagSize);
57
+
58
+ for (int i = 0; i < numObjects; ++i)
59
+ {
60
+ double unit_rand = rand() * 1.0 / (RAND_MAX + 1.0);
61
+ T object = objects.at(i);
62
+
63
+ if (unit_rand * (numObjects - i) < bagSize - numBagged)
64
+ {
65
+ inBagObjects.push_back(object);
66
+ ++numBagged;
67
+ }
68
+ else
69
+ outOfBagObjects.push_back(object);
70
+ }
71
+
72
+ return make_pair(inBagObjects, outOfBagObjects);
73
+ }
74
+
75
+ #endif // MLUtils_h__
@@ -0,0 +1,47 @@
1
+ #ifndef RandomForestEstimator_h__
2
+ #define RandomForestEstimator_h__
3
+
4
+ #include "MachineLearning/RandomForest/RandomForestParameters.h"
5
+ #include "MachineLearning/MLEstimator.h"
6
+
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <vector>
9
+ using std::vector;
10
+ using boost::shared_ptr;
11
+
12
+ class RandomForestOutput;
13
+ class DecisionTreeExperiment;
14
+ class MLData;
15
+ class DecisionTreeNode;
16
+
17
+ class RandomForestEstimator : public MLEstimator
18
+ {
19
+ public:
20
+ RandomForestEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<RandomForestParameters> parameters);
21
+ ~RandomForestEstimator();
22
+
23
+ shared_ptr<MLOutput> estimate();
24
+ shared_ptr<MLOutput> estimateMore(int numTrees);
25
+
26
+ protected:
27
+ void updateZ();
28
+ void performIteration();
29
+ void initializeEstimator();
30
+ void constructFeatureIndices();
31
+ void constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
32
+
33
+ MLData* m_data;
34
+ vector<shared_ptr<DecisionTreeExperiment> > m_decisionTreeExperiments;
35
+ shared_ptr<RandomForestParameters> m_parameters;
36
+
37
+ shared_ptr<RandomForestOutput> m_output;
38
+ shared_ptr<DecisionTreeNode> m_decisionTreeHead;
39
+
40
+ bool m_missingValueDefined;
41
+ double m_missingValue;
42
+
43
+ vector<int> m_featureIndices;
44
+ private:
45
+ };
46
+
47
+ #endif // RandomForestEstimator_h__
@@ -0,0 +1,33 @@
1
+ #ifndef RandomForestOutput_h__
2
+ #define RandomForestOutput_h__
3
+
4
+ #include "MachineLearning/MLOutput.h"
5
+
6
+ class MLData;
7
+ class RandomForestParameters;
8
+ class MLExperiment;
9
+ class DecisionTreeNode;
10
+
11
+ class RandomForestOutput : public MLOutput
12
+ {
13
+ public:
14
+ RandomForestOutput( MLData* trainingData,
15
+ vector<int> trainingExperimentIndicies,
16
+ shared_ptr<RandomForestParameters> parameters);
17
+ ~RandomForestOutput();
18
+
19
+ MLData* getTrainingData();
20
+ shared_ptr<RandomForestParameters> getParameters();
21
+
22
+ double predictForExperiment(shared_ptr<MLExperiment> experiment);
23
+ void addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node);
24
+ // stats
25
+ int getNumTrees();
26
+
27
+ protected:
28
+ shared_ptr<RandomForestParameters> m_parameters;
29
+ vector<shared_ptr<DecisionTreeNode> > m_headNodes;
30
+ private:
31
+ };
32
+
33
+ #endif // RandomForestOutput_h__