ml4r 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/ext/ml4r/{LinearRegression.h → LinearRegression/LinearRegression.h} +25 -19
  2. data/ext/ml4r/LinearRegression/OLSLinearRegression.h +29 -0
  3. data/ext/ml4r/MachineLearning/DecisionTree/CategoryInfo.h +32 -0
  4. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeExperiment.h +30 -0
  5. data/ext/ml4r/MachineLearning/DecisionTree/DecisionTreeNode.h +86 -0
  6. data/ext/ml4r/MachineLearning/DecisionTree/FeatureInteraction.h +31 -0
  7. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitter.h +45 -0
  8. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterCategorical.h +17 -0
  9. data/ext/ml4r/MachineLearning/DecisionTree/NodeSplitterContinuous.h +16 -0
  10. data/ext/ml4r/MachineLearning/DecisionTree/SplitDefinition.h +81 -0
  11. data/ext/ml4r/MachineLearning/GBM/BernoulliCalculator.h +29 -0
  12. data/ext/ml4r/MachineLearning/GBM/GBM.h +50 -0
  13. data/ext/ml4r/MachineLearning/GBM/GBMCalculator.h +31 -0
  14. data/ext/ml4r/MachineLearning/GBM/GBMData.h +0 -0
  15. data/ext/ml4r/MachineLearning/GBM/GBMEstimator.h +79 -0
  16. data/ext/ml4r/MachineLearning/GBM/GBMOutput.h +53 -0
  17. data/ext/ml4r/MachineLearning/GBM/GBMParameters.h +50 -0
  18. data/ext/ml4r/MachineLearning/GBM/GBMRunner.h +35 -0
  19. data/ext/ml4r/MachineLearning/GBM/GaussianCalculator.h +29 -0
  20. data/ext/ml4r/MachineLearning/GBM/ZenithGBM.h +27 -0
  21. data/ext/ml4r/MachineLearning/MLData/MLData.h +77 -0
  22. data/ext/ml4r/MachineLearning/MLData/MLDataFields.h +25 -0
  23. data/ext/ml4r/MachineLearning/MLData/MLDataReader.h +37 -0
  24. data/ext/ml4r/MachineLearning/MLData/ZenithMLData.h +13 -0
  25. data/ext/ml4r/MachineLearning/MLData/ZenithMLDataReader.h +20 -0
  26. data/ext/ml4r/MachineLearning/MLEstimator.h +30 -0
  27. data/ext/ml4r/MachineLearning/MLEstimatorFactory.h +25 -0
  28. data/ext/ml4r/MachineLearning/MLExperiment.h +41 -0
  29. data/ext/ml4r/MachineLearning/MLOutput.h +45 -0
  30. data/ext/ml4r/MachineLearning/MLParameters.h +16 -0
  31. data/ext/ml4r/MachineLearning/MLRunner.h +47 -0
  32. data/ext/ml4r/MachineLearning/MLUtils.h +75 -0
  33. data/ext/ml4r/MachineLearning/RandomForest/RandomForestEstimator.h +47 -0
  34. data/ext/ml4r/MachineLearning/RandomForest/RandomForestOutput.h +33 -0
  35. data/ext/ml4r/MachineLearning/RandomForest/RandomForestParameters.h +32 -0
  36. data/ext/ml4r/MachineLearning/RandomForest/RandomForestRunner.h +34 -0
  37. data/ext/ml4r/extconf.rb +16 -3
  38. data/ext/ml4r/{MathUtils.h → utils/MathUtils.h} +0 -0
  39. data/ext/ml4r/{MatrixInversion.h → utils/MatrixInversion.h} +0 -0
  40. data/ext/ml4r/utils/StochasticUtils.h +33 -0
  41. data/ext/ml4r/utils/Utils.h +147 -0
  42. data/ext/ml4r/utils/VlcMessage.h +44 -0
  43. data/lib/ml4r/linear_regression.rb +7 -0
  44. metadata +45 -13
  45. data/ext/ml4r/LinearRegression/ZenithRegression.h +0 -17
  46. data/ext/ml4r/OLSLinearRegression.h +0 -23
  47. data/ext/ml4r/Utils.h +0 -53
  48. data/ext/ml4r/example.h +0 -18
  49. data/ext/ml4r/swig/example.h +0 -13
  50. data/ext/ml4r/swig/example_wrap.c +0 -2093
  51. data/ext/ml4r/utils/RubyUtils.h +0 -174
@@ -0,0 +1,25 @@
1
+ #ifndef __MLDataFields_h__
2
+ #define __MLDataFields_h__
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ using std::vector;
7
+ using std::string;
8
+
9
// Bundle of strings naming fields, plus a list of feature field names.
// NOTE(review): the member names suggest these are column names of the data
// source consumed by MLDataReader -- confirm against the reader implementation.
class MLDataFields
{
public:
    MLDataFields() {}
    ~MLDataFields() {}

    // Single-valued field names; all start out as empty strings.
    std::string experimentIdField;
    std::string weightsField;
    std::string actualYField;
    std::string initialPredictionsField;

    // One entry per feature field; starts out empty.
    std::vector<std::string> featuresFields;
};
24
+
25
+ #endif // MLDataFields_h__
@@ -0,0 +1,37 @@
1
+ #ifndef __MLDataReader_h__
2
+ #define __MLDataReader_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ #include <string>
7
+ using std::string;
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ class MLData;
12
+ class MLDataFields;
13
+
14
+ class MLDataReader
15
+ {
16
+ public:
17
+ MLDataReader();
18
+ ~MLDataReader();
19
+
20
+ void execute(MLData* mlData);
21
+
22
+ string databaseName;
23
+ string tableName;
24
+
25
+ shared_ptr<MLDataFields> fieldsSpec;
26
+ vector<string> categoricalFeatures;
27
+ double missingValue;
28
+ bool missingValueDefined;
29
+ void reportOnData(MLData* data, shared_ptr<MLDataFields> fieldsSpec);
30
+
31
+ protected:
32
+
33
+ string getSelectSql();
34
+
35
+ };
36
+
37
+ #endif // MLDataReader_h__
@@ -0,0 +1,13 @@
1
+ // #ifndef ZenithMLData_h__
2
+ // #define ZenithMLData_h__
3
+
4
+ // #include "stdafx.h"
5
+
6
+
7
+ // // ruby interface methods
8
+ // void zenith_mldata_Free(void* v);
9
+ // OtInterface::VALUE zenith_mldata_New(int argc, VALUE* argv, VALUE klass);
10
+ // OtInterface::VALUE zenith_mldata_Initialize(VALUE self);
11
+ // OtInterface::VALUE zenith_mldata_createFolds(VALUE self, VALUE numFolds, VALUE randomSeed);
12
+ // OtInterface::VALUE zenith_mldata_getResponse(VALUE self);
13
+ // #endif // ZenithMLData_h__
@@ -0,0 +1,20 @@
1
+ // #ifndef ZenithMLDataReader_h__
2
+ // #define ZenithMLDataReader_h__
3
+
4
+ // #include "stdafx.h"
5
+
6
+ // // ruby interface methods
7
+ // void zenith_mldatareader_Free(void* v);
8
+ // OtInterface::VALUE zenith_mldatareader_New(int argc, VALUE* argv, VALUE klass);
9
+ // OtInterface::VALUE zenith_mldatareader_Initialize(VALUE self);
10
+ // OtInterface::VALUE zenith_mldatareader_execute(VALUE self);
11
+ // OtInterface::VALUE zenith_mldatareader_setFeaturesToLoad(VALUE self, VALUE featuresValue);
12
+ // OtInterface::VALUE zenith_mldatareader_setCategoricalFeatures(VALUE self, VALUE categoricalFeaturesValue);
13
+ // OtInterface::VALUE zenith_mldatareader_setDatabaseName(VALUE self, VALUE databaseNameValue);
14
+ // OtInterface::VALUE zenith_mldatareader_setTableName(VALUE self, VALUE tableNameValue);
15
+ // OtInterface::VALUE zenith_mldatareader_setActualYField(VALUE self, VALUE yFieldValue);
16
+ // OtInterface::VALUE zenith_mldatareader_setExperimentIdField(VALUE self, VALUE experimentIdFieldValue);
17
+ // OtInterface::VALUE zenith_mldatareader_setWeightsField(VALUE self, VALUE weightsFieldValue);
18
+ // OtInterface::VALUE zenith_mldatareader_setInitialPredictionsField(VALUE self, VALUE initialEstimatesFieldValue);
19
+ // OtInterface::VALUE zenith_mldatareader_setMissingValue(VALUE self, VALUE missingValue);
20
+ // #endif // ZenithMLDataReader_h__
@@ -0,0 +1,30 @@
1
+ #ifndef MLEstimator_h__
2
+ #define MLEstimator_h__
3
+
4
+ class MLOutput;
5
+ class MLData;
6
+ class MLExperiment;
7
+
8
+ #include <vector>
9
+ #include <boost/shared_ptr.hpp>
10
+ using boost::shared_ptr;
11
+ using std::vector;
12
+
13
+ // pure virtual class
14
+ class MLEstimator
15
+ {
16
+ public:
17
+ MLEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments)
18
+ : m_data(data), m_trainingExperiments(experiments)
19
+ {};
20
+ ~MLEstimator() {};
21
+
22
+ virtual shared_ptr<MLOutput> estimate() = 0;
23
+ protected:
24
+ MLData* m_data;
25
+ vector<shared_ptr<MLExperiment> > m_trainingExperiments;
26
+
27
+ private:
28
+ };
29
+
30
+ #endif // MLEstimator_h__
@@ -0,0 +1,25 @@
1
+ #ifndef MLEstimatorFactory_h__
2
+ #define MLEstimatorFactory_h__
3
+
4
+ class MLExperiment;
5
+ class MLData;
6
+ class MLParameters;
7
+ class MLEstimator;
8
+
9
+ #include <boost/shared_ptr.hpp>
10
+ #include <vector>
11
+ using std::vector;
12
+ using boost::shared_ptr;
13
+
14
+ class MLEstimatorFactory
15
+ {
16
+ public:
17
+ MLEstimatorFactory() {};
18
+ ~MLEstimatorFactory() {};
19
+
20
+ virtual shared_ptr<MLEstimator> create(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments) = 0;
21
+ protected:
22
+
23
+ private:
24
+ };
25
+ #endif // MLEstimatorFactory_h__
@@ -0,0 +1,41 @@
1
+ #ifndef __MLExperiment_h__
2
+ #define __MLExperiment_h__
3
+
4
+ #include <vector>
5
+ #include <map>
6
+ #include <boost/shared_ptr.hpp>
7
+ using boost::shared_ptr;
8
+ using std::map;
9
+ using std::vector;
10
+
11
+ class MLExperiment
12
+ {
13
+ public:
14
+ MLExperiment();
15
+ MLExperiment(int experimentId, int experimentIndex, double y, double initialPrediction,
16
+ double weight, vector<double> features);
17
+
18
+ MLExperiment(shared_ptr<MLExperiment> experiment);
19
+
20
+ ~MLExperiment();
21
+
22
+ double getY();
23
+ int getExperimentId();
24
+ int getExperimentIndex();
25
+ double getPrediction();
26
+ double getWeight();
27
+ vector<double>& getFeatureValues();
28
+ double getFeatureValue(int featureIndex);
29
+ void setPrediction(double prediction);
30
+
31
+ protected:
32
+ double m_yValue;
33
+ int m_experimentId;
34
+ int m_experimentIndex;
35
+ double m_prediction;
36
+ double m_weight;
37
+ vector<double> m_features;
38
+
39
+ };
40
+
41
+ #endif // MLExperiment_h__
@@ -0,0 +1,45 @@
1
+ #ifndef MLOutput_h__
2
+ #define MLOutput_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ #include <stdexcept>
7
+ using std::runtime_error;
8
+ using std::vector;
9
+ using boost::shared_ptr;
10
+
11
+ #include "utils/Utils.h"
12
+
13
+ class MLData;
14
+ class MLExperiment;
15
+ class GBMParameters;
16
+
17
+ // pure virtual class
18
+
19
+ class MLOutput
20
+ {
21
+ public:
22
+ MLOutput(MLData* data, vector<int> trainingExperimentIndicies)
23
+ : m_trainingData(data), m_trainingExperimentIndicies(trainingExperimentIndicies)
24
+ {};
25
+ ~MLOutput() {};
26
+
27
+ virtual double predictForExperiment(shared_ptr<MLExperiment> experiment) = 0;
28
+ virtual double calculateAveragePredictions(vector<double> predictions)
29
+ {
30
+ if (predictions.size() == 0)
31
+ throw std::runtime_error("[MLOutput::calculateAveragePredictions] Cannot calculate an average on an empty vector!");
32
+
33
+ return Utils::vectorSum<double>(predictions) / predictions.size();
34
+ };
35
+
36
+ vector<int> getTrainingExperimentIndicies() { return m_trainingExperimentIndicies; };
37
+
38
+ MLData* getTrainingData() { return m_trainingData; }
39
+ protected:
40
+ MLData* m_trainingData;
41
+ vector<int> m_trainingExperimentIndicies;
42
+
43
+ private:
44
+ };
45
+ #endif // MLOutput_h__
@@ -0,0 +1,16 @@
1
+ #ifndef MLParameters_h__
2
+ #define MLParameters_h__
3
+
4
// Parameter holder with a single flag, verbose, which defaults to false.
class MLParameters
{
public:
    MLParameters()
        : verbose(false)
    {}

    ~MLParameters() {}

    bool verbose;
};
15
+
16
+ #endif // MLParameters_h__
@@ -0,0 +1,47 @@
1
+ #ifndef MLRunner_h__
2
+ #define MLRunner_h__
3
+
4
+ #include <boost/shared_ptr.hpp>
5
+ #include <vector>
6
+ using std::vector;
7
+ using boost::shared_ptr;
8
+
9
+ class MLData;
10
+ class MLExperiment;
11
+ class MLParameters;
12
+ class MLOutput;
13
+ class MLEstimator;
14
+
15
+ class MLRunner
16
+ {
17
+ public:
18
+ MLRunner();
19
+ ~MLRunner();
20
+
21
+ void execute();
22
+
23
+ void setData(MLData* data);
24
+ MLData* getData();
25
+
26
+ vector<double> getPredictions(MLData* newData);
27
+ vector<double> getMeanTrainingPredictions();
28
+ vector<double> getCrossValidationPredictions();
29
+
30
+ virtual shared_ptr<MLEstimator> createEstimator(MLData* data, vector<shared_ptr<MLExperiment> > trainingExperiments) = 0;
31
+
32
+ protected:
33
+ virtual void checks();
34
+ virtual void config();
35
+ virtual void input();
36
+ virtual void estimate();
37
+ virtual void output();
38
+
39
+ vector<double> getPredictions(vector<shared_ptr<MLExperiment> > experiments);
40
+
41
+ MLData* m_data;
42
+ vector<shared_ptr<MLOutput> > m_outputObjects;
43
+ vector<shared_ptr<MLEstimator> > m_estimators;
44
+
45
+ private:
46
+ };
47
+ #endif // MLRunner_h__
@@ -0,0 +1,75 @@
1
+ #ifndef __MLUtils_h__
2
+ #define __MLUtils_h__
3
+
4
+ #include "MachineLearning/MLExperiment.h"
5
+
6
+ #include <vector>
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <utility>
9
+ using std::pair;
10
+ using boost::shared_ptr;
11
+ using std::vector;
12
+
13
+ namespace MLUtils
14
+ {
15
+ double getMeanY(vector<shared_ptr<MLExperiment> > experiments);
16
+
17
+ template <class T>
18
+ vector<T> bagObjectsWithReplacement(vector<T> experiments, int bagSize);
19
+
20
+ template <class T>
21
+ pair<vector<T>,vector<T> > bagObjectsWithoutReplacement(vector<T> experiments, int bagSize);
22
+ };
23
+
24
+
25
// Draws bagSize elements from objects, with replacement. Each draw scales
// rand() into [0, 1) and picks the element at index floor(u * objects.size()).
//
//   objects  source elements (passed by value)
//   bagSize  number of draws; zero or a negative value yields an empty result
//            (a negative value used to reach reserve() as a huge unsigned
//            number and throw std::length_error)
//   returns  the drawn elements, in draw order
//   throws   std::out_of_range (from vector::at) when objects is empty and
//            bagSize is positive
//
// NOTE: rand() and RAND_MAX come from <cstdlib>, which this header does not
// include directly.
namespace MLUtils
{
    template <class T>
    std::vector<T> bagObjectsWithReplacement(std::vector<T> objects, int bagSize)
    {
        typedef typename std::vector<T>::size_type size_type;

        std::vector<T> inBagObjects;
        if (bagSize <= 0)
            return inBagObjects;

        inBagObjects.reserve(static_cast<size_type>(bagSize));

        for (int draw = 0; draw < bagSize; ++draw)
        {
            // Dividing by RAND_MAX + 1.0 keeps unitRand strictly below 1, so
            // the index stays below objects.size().
            const double unitRand = rand() * 1.0 / (RAND_MAX + 1.0);
            const size_type index = static_cast<size_type>(unitRand * objects.size());
            inBagObjects.push_back(objects.at(index));
        }
        return inBagObjects;
    }
}
43
+
44
// Splits objects into an in-bag and an out-of-bag part without replacement.
// Objects are visited in order; each one goes in-bag when
// u * (objects left to visit) < (in-bag slots still to fill), with u taken
// from rand() scaled into [0, 1). Both parts keep the input order.
//
//   objects  source elements (passed by value)
//   bagSize  requested in-bag count. A value above objects.size() puts every
//            object in-bag; zero or a negative value puts none in-bag. (Both
//            out-of-range cases used to pass a negative number to reserve()
//            and throw std::length_error.)
//   returns  pair(in-bag, out-of-bag); together they hold every input object
//            exactly once
//
// NOTE: rand() and RAND_MAX come from <cstdlib>, which this header does not
// include directly.
namespace MLUtils
{
    template <class T>
    std::pair<std::vector<T>, std::vector<T> >
    bagObjectsWithoutReplacement(std::vector<T> objects, int bagSize)
    {
        typedef typename std::vector<T>::size_type size_type;

        std::vector<T> inBagObjects;
        std::vector<T> outOfBagObjects;

        const long numObjects = static_cast<long>(objects.size());

        // Clamp the capacity hints only; the selection test below still uses
        // the caller's bagSize, so results for valid input are unchanged.
        long inBagHint = bagSize;
        if (inBagHint < 0)
            inBagHint = 0;
        if (inBagHint > numObjects)
            inBagHint = numObjects;

        inBagObjects.reserve(static_cast<size_type>(inBagHint));
        outOfBagObjects.reserve(static_cast<size_type>(numObjects - inBagHint));

        long numBagged = 0;
        for (long i = 0; i < numObjects; ++i)
        {
            // One rand() call per object, as before.
            const double unitRand = rand() * 1.0 / (RAND_MAX + 1.0);
            const T& object = objects.at(static_cast<size_type>(i));

            if (unitRand * (numObjects - i) < bagSize - numBagged)
            {
                inBagObjects.push_back(object);
                ++numBagged;
            }
            else
            {
                outOfBagObjects.push_back(object);
            }
        }

        return std::make_pair(inBagObjects, outOfBagObjects);
    }
}
74
+
75
+ #endif // MLUtils_h__
@@ -0,0 +1,47 @@
1
+ #ifndef RandomForestEstimator_h__
2
+ #define RandomForestEstimator_h__
3
+
4
+ #include "MachineLearning/RandomForest/RandomForestParameters.h"
5
+ #include "MachineLearning/MLEstimator.h"
6
+
7
+ #include <boost/shared_ptr.hpp>
8
+ #include <vector>
9
+ using std::vector;
10
+ using boost::shared_ptr;
11
+
12
+ class RandomForestOutput;
13
+ class DecisionTreeExperiment;
14
+ class MLData;
15
+ class DecisionTreeNode;
16
+
17
+ class RandomForestEstimator : public MLEstimator
18
+ {
19
+ public:
20
+ RandomForestEstimator(MLData* data, vector<shared_ptr<MLExperiment> > experiments, shared_ptr<RandomForestParameters> parameters);
21
+ ~RandomForestEstimator();
22
+
23
+ shared_ptr<MLOutput> estimate();
24
+ shared_ptr<MLOutput> estimateMore(int numTrees);
25
+
26
+ protected:
27
+ void updateZ();
28
+ void performIteration();
29
+ void initializeEstimator();
30
+ void constructFeatureIndices();
31
+ void constructDecisionTree(vector<shared_ptr<DecisionTreeExperiment> >& experiments);
32
+
33
+ MLData* m_data;
34
+ vector<shared_ptr<DecisionTreeExperiment> > m_decisionTreeExperiments;
35
+ shared_ptr<RandomForestParameters> m_parameters;
36
+
37
+ shared_ptr<RandomForestOutput> m_output;
38
+ shared_ptr<DecisionTreeNode> m_decisionTreeHead;
39
+
40
+ bool m_missingValueDefined;
41
+ double m_missingValue;
42
+
43
+ vector<int> m_featureIndices;
44
+ private:
45
+ };
46
+
47
+ #endif // RandomForestEstimator_h__
@@ -0,0 +1,33 @@
1
+ #ifndef RandomForestOutput_h__
2
+ #define RandomForestOutput_h__
3
+
4
+ #include "MachineLearning/MLOutput.h"
5
+
6
+ class MLData;
7
+ class RandomForestParameters;
8
+ class MLExperiment;
9
+ class DecisionTreeNode;
10
+
11
+ class RandomForestOutput : public MLOutput
12
+ {
13
+ public:
14
+ RandomForestOutput( MLData* trainingData,
15
+ vector<int> trainingExperimentIndicies,
16
+ shared_ptr<RandomForestParameters> parameters);
17
+ ~RandomForestOutput();
18
+
19
+ MLData* getTrainingData();
20
+ shared_ptr<RandomForestParameters> getParameters();
21
+
22
+ double predictForExperiment(shared_ptr<MLExperiment> experiment);
23
+ void addHeadDecisionTreeNode(shared_ptr<DecisionTreeNode> node);
24
+ // stats
25
+ int getNumTrees();
26
+
27
+ protected:
28
+ shared_ptr<RandomForestParameters> m_parameters;
29
+ vector<shared_ptr<DecisionTreeNode> > m_headNodes;
30
+ private:
31
+ };
32
+
33
+ #endif // RandomForestOutput_h__