thera 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#include "multinomial_bayes_classifier.h"
#include "data_set/data_set.h"
#include <math.h>

// Score an example against a category.
// TODO: scoring is not implemented yet. Return a neutral 0.0 explicitly so the
// function does not flow off the end of a non-void function (undefined behaviour).
double Classifier::MultinomialBayesClassifier::score(DataSet::Category *category, DataSet::Example *example) {
    return 0.0;
}

// Precompute the per-category probability tables consumed by score().
// Category indexes appear to be 1-based throughout the project (see the other
// classifiers), hence the "+ 1" sizing and loops starting at 1.
void Classifier::MultinomialBayesClassifier::prepare() {
    numeric_feature_probabilities.resize(data_set->categories_size() + 1);
    nominal_feature_probabilities.resize(data_set->categories_size() + 1);
    DataSet::NominalFeature *nominal_feature = NULL;
    int feature_count = data_set->features.size();
    double category_sum = 0.0;

    // make sure the per-feature frequency counts are up to date
    data_set->count();

    // determine the category probabilities for each feature
    for(int i = 1; i <= data_set->categories_size(); i++) {
        // resize (not reserve) so the operator[] writes below stay in bounds;
        // reserve only changes capacity and leaves size() == 0
        numeric_feature_probabilities[i].resize(feature_count);
        nominal_feature_probabilities[i].resize(feature_count);

        // sum the counts of each numeric feature for this category
        category_sum = 0.0;
        for(unsigned int j = 0; j < numeric_features.size(); j++)
            category_sum += numeric_features[j]->category_sum(i);
        // add-one smoothing: one pseudo-count per numeric feature
        category_sum += numeric_features.size();

        // weight each numeric feature only by the number of other numeric
        // features; nominal features are handled separately
        for(unsigned int j = 0; j < numeric_features.size(); j++)
            numeric_feature_probabilities[i][j] = (1.0 + numeric_features[j]->category_sum(i)) / (category_sum);

        // each value of a nominal feature is treated as if it were another
        // feature in itself; index by category (i) AND feature (j) — the
        // original indexed by j alone, sizing the wrong category's table
        for(unsigned int j = 0; j < nominal_features.size(); j++) {
            nominal_feature = nominal_features[j];
            nominal_feature_probabilities[i][j].resize(nominal_feature->values.size());
        }
    }
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#ifndef __multinomial_bayes_classifier_h__
#define __multinomial_bayes_classifier_h__
#include "classifier/classifier.h"
#include <vector>

namespace Classifier {
  // Multinomial Bayes classifier: models each category by per-feature
  // probabilities derived from feature counts (see prepare() in the .cpp).
  // NOTE(review): score() is currently a stub in the implementation file.
  class MultinomialBayesClassifier : public Classifier {
    // [category][numeric feature index] -> probability
    vector<vector<double> > numeric_feature_probabilities;
    // [category][nominal feature index][value index] -> probability
    vector<vector<vector<double> > > nominal_feature_probabilities;

  public:
    MultinomialBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
    // likelihood-style score of `example` under `category`
    double score(DataSet::Category *category, DataSet::Example *example);
    // build the probability tables above from the bound data set
    void prepare();
  };
}

#endif
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
#include "naive_bayes_classifier.h"
#include "data_set/data_set.h"
#include "data_set/dense/dense_data_set.h"
#include "storage/binary.h"
#include <math.h>

// Create a classifier of the same concrete type bound to another data set.
// Used by DataSet::cross_fold_validation, which trains a fresh classifier
// per fold.
Classifier::NaiveBayesClassifier *Classifier::NaiveBayesClassifier::clone(DataSet::DataSet *new_data_set) {
    return new NaiveBayesClassifier(new_data_set);
}


// Precompute the Gaussian terms used by score() for every (category, numeric
// feature) pair. Category indexes are 1-based, hence the "+ 1" sizing and the
// 1-based loops.
void Classifier::NaiveBayesClassifier::prepare() {
    feature_caches.resize(data_set->categories_size() + 1);
    int features_size = data_set->features_size();
    DataSet::NumericFeature *feature = NULL;

    // make sure per-feature counts (means/variances/frequencies) exist
    data_set->count();
    category_probabilities = data_set->category_feature()->probabilities;

    for(int i = 1; i <= data_set->categories_size(); i++) {
        feature_caches[i].resize(features_size);

        for(int j = 0; j < features_size; j++) {
            // only numeric features get a Gaussian cache; nominal features are
            // scored straight from their per-category value probabilities
            if(!numeric_features[j])
                continue;
            feature = (DataSet::NumericFeature *) data_set->features[j];
            // denom: the 2*variance term of the Gaussian exponent
            feature_caches[i][j].denom = 2 * feature->category_variance(i);
            // lhs: the 1/sqrt(2*pi*variance) normalisation factor
            feature_caches[i][j].lhs = 1 / sqrt(2 * M_PI * feature->category_variance(i));
        }
    }
}


// Naive Bayes score: product of per-feature likelihoods, multiplied by the
// category prior at the end. The category column itself is skipped. Sparse
// and dense examples are iterated differently (sparse stores index/value
// pairs; dense is positional).
double Classifier::NaiveBayesClassifier::score(int category, DataSet::Example *example) {
    DataSet::SparseExample *sparse_example = NULL;
    DataSet::SparseExample::Value *sparse_value = NULL;
    DataSet::DenseExample *dense_example = NULL;
    double dense_value = 0.0, probability = 0.0;

    if(typeid(*example) == typeid(DataSet::SparseExample)) {
        sparse_example = (DataSet::SparseExample *) example;
        for(int i = 0; i < example->size; i++) {
            sparse_value = &(sparse_example->values[i]);
            if(numeric_features[sparse_value->index])
                score_numeric_feature(sparse_value->index, sparse_value->value, category, &probability);
            else if(sparse_value->index != data_set->category_index)
                score_nominal_feature(sparse_value->index, sparse_value->value, category, &probability);
        }

    } else {
        dense_example = (DataSet::DenseExample *) example;
        for(int i = 0; i < example->size; i++) {
            dense_value = dense_example->get_value(i);
            if(numeric_features[i])
                score_numeric_feature(i, dense_value, category, &probability);
            else if(i != data_set->category_index)
                score_nominal_feature(i, dense_value, category, &probability);
        }
    }

    return probability * category_probabilities[category];
}

// Serialise the prepared model (priors + Gaussian caches) to `file`.
// Slot 0 of feature_caches is unused (1-based categories), so the loop
// writes entries 1..category_count.
void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
    int category_count = feature_caches.size();
    file->write_int(category_count);
    file->write_vector<double>(&category_probabilities);

    for(int i = 1; i <= category_count; i++)
        file->write_vector<NumericFeatureCache>(&feature_caches[i]);
}

// Restore a model previously written by write_binary; mirrors its layout.
// NOTE(review): read_vector appears to return a heap-allocated vector that is
// copied and never freed here — confirm its ownership contract.
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
    int category_count = file->read_int();
    feature_caches.resize(category_count + 1);
    category_probabilities = *(file->read_vector<double>());

    for(int i = 1; i <= category_count; i++)
        feature_caches[i] = *(file->read_vector<NumericFeatureCache>());
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
#ifndef __naive_bayes_classifier_h__
#define __naive_bayes_classifier_h__
#include "classifier/classifier.h"
#include <vector>

namespace Classifier {
  // Gaussian naive Bayes: numeric features are modelled with a per-category
  // normal distribution, nominal features with per-category value
  // probabilities. prepare() precomputes the constant Gaussian terms.
  class NaiveBayesClassifier : public Classifier {
    // cached constants of the per-(category, feature) Gaussian
    typedef struct {
      double denom;   // 2 * variance (exponent denominator)
      double lhs;     // 1 / sqrt(2*pi*variance) (normalisation factor)
    } NumericFeatureCache;
    vector<vector<NumericFeatureCache> > feature_caches; // features_caches[category_index][numeric_feature]
    vector<double> category_probabilities;               // category priors, 1-based

    // (1 / sqrt(2PI * var)) * e^(-((value - mean) ^ 2) / (2 * var))
    // Multiplies the Gaussian likelihood of `value` into *probability.
    // Features with zero variance are skipped entirely.
    void score_numeric_feature(int index, double value, int category, double *probability) {
      DataSet::NumericFeature *numeric_feature = (DataSet::NumericFeature *) data_set->features[index];
      if(feature_caches[category][index].denom != 0.0) {
        double numerator = -1 * pow(value - numeric_feature->category_mean(category), 2);
        // first contributing feature: seed the running product with 1
        if(*probability == 0.0)
          *probability = 1.0;
        *probability = *probability * (feature_caches[category][index].lhs * exp(numerator / feature_caches[category][index].denom));
      }
    }

    // Multiplies the nominal value's per-category probability into *probability.
    void score_nominal_feature(int index, double value, int category, double *probability) {
      DataSet::NominalFeature *nominal_feature = (DataSet::NominalFeature *) data_set->features[index];
      *probability = *probability * nominal_feature->category_value_probability(category, (int)value);
    }

  public:
    // multi-character literal used as a file-format tag for serialised models
    static const uint32_t file_mark = 'naiv';
    NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
    NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);

    double score(int category, DataSet::Example *example);
    void prepare();
    void write_binary(Storage::Binary *file);
    void read_binary(Storage::Binary *file);
    uint32_t mark() { return file_mark; }

    // debug dump of the cached Gaussian terms
    void print() {
      cout << "NB:" << endl;
      for(unsigned int i = 0; i < feature_caches.size(); i++) {
        for(unsigned int j = 0; j < feature_caches[i].size(); j++)
          cout << "C" << i << "F" << j << ":" << feature_caches[i][j].denom << ", l:" << feature_caches[i][j].lhs << endl;
      }
    }
  };
}

#endif
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
#include <iostream>
#include "data_set.h"
#include "classifier/classifier.h"
#include "metrics/confusion_matrix.h"
#include "dense/dense_data_set.h"
#include "dense/dense_example.h"
#include "sparse/sparse_data_set.h"
#include "sparse/sparse_example.h"

// Create, register (by name and by position) and return a new nominal feature.
// Ownership stays with this data set.
DataSet::NominalFeature *DataSet::DataSet::new_nominal_feature(string name) {
    NominalFeature *feature = new NominalFeature(name, features.size());
    feature_names[name] = feature;
    features.push_back(feature);
    return feature;
}


// Create, register (by name and by position) and return a new numeric feature.
DataSet::NumericFeature *DataSet::DataSet::new_numeric_feature(string name) {
    NumericFeature *feature = new NumericFeature(name, features.size());
    feature_names[name] = feature;
    features.push_back(feature);
    return feature;
}


// Select which feature column holds the category label. All per-feature
// statistics depend on the category, so every feature is reset and the
// cached count/index state is invalidated.
void DataSet::DataSet::set_category_index(int index) {
    category_index = index;
    for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
        (*feature)->reset();
    counted = false;
    indexed = false;
}


// Compute per-feature statistics (frequencies, means, variances, ...) over
// all examples. Idempotent: does nothing if counts are already current.
void DataSet::DataSet::count() {
    if(counted)
        return;

    // initialise each feature for counting
    for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
        (*feature)->prepare_for_counting(this);

    // implementation optimised count
    perform_count();

    // calculate and finalise counts
    for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
        (*feature)->finalise_counting(this);

    counted = true;
}


// Build per-feature inverted indexes (e.g. examples grouped by nominal
// value). Idempotent, like count().
void DataSet::DataSet::index() {
    if(indexed)
        return;

    // initialise each feature for indexing
    for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
        (*feature)->prepare_for_indexing(this);

    // index
    for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
        for(int i = 0; i < features.size(); i++)
            features[i]->index_example((*example)->get_value(i), *example);
    }

    // finalise indexing
    for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
        (*feature)->finalise_indexing(this);

    indexed = true;
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
// TODO: this isn't really stratification; categories should be proportionally represented
|
|
77
|
+
vector<vector<DataSet::Example *> > *DataSet::DataSet::stratify(int number_of_folds) {
|
|
78
|
+
int examples_per_fold = examples.size() / number_of_folds;
|
|
79
|
+
Example *example;
|
|
80
|
+
vector<vector<Example *> > *folds = new vector<vector<Example *> >(number_of_folds, vector<Example *>(examples_per_fold, NULL));
|
|
81
|
+
|
|
82
|
+
for(int fold = 0; fold < number_of_folds; fold++) {
|
|
83
|
+
for(int i = 0; i < examples_per_fold; i++) {
|
|
84
|
+
(*folds)[fold][i] = examples[fold + (i * number_of_folds)];
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
return folds;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
// Run k-fold cross validation: for each fold, train a cloned classifier on
// the remaining folds and score the held-out fold. Per-fold results are
// merged into a single ConfusionMatrix, which the caller owns.
// NOTE: `test_set` actually holds the TRAINING examples for the current fold
// (everything except the held-out fold); the name is historical.
ConfusionMatrix *DataSet::DataSet::cross_fold_validation(Classifier::Classifier *classifier, int number_of_folds) {
    vector<vector<Example *> > *folds = stratify(number_of_folds);
    ConfusionMatrix *overall_matrix = new ConfusionMatrix(this);
    Classifier::Classifier *test_classifier = NULL;
    ConfusionMatrix *result = NULL;
    DataSet *test_set = NULL;
    Example *example = NULL;
    int predicted = 0;

    int examples_per_fold = examples.size() / number_of_folds;
    int examples_per_test = examples_per_fold * (number_of_folds - 1);

    for(int fold = 0; fold < number_of_folds; fold++) {
        cout << "Running fold " << fold << endl;
        // build the training set from every fold except the held-out one,
        // preserving the original fold order
        test_set = clone_without_examples();
        test_set->examples.reserve(examples_per_test);
        for(int i = 0; i < number_of_folds; i++) {
            if(i != fold)
                test_set->examples.insert(test_set->examples.begin() + (examples_per_fold * (i > fold ? i - 1 : i)), (*folds)[i].begin(), (*folds)[i].end());
        }

        test_classifier = classifier->clone(test_set);
        result = new ConfusionMatrix(test_set);
        test_classifier->prepare();

        // classify the held-out fold and record predicted vs actual category
        for(int i = 0; i < examples_per_fold; i++) {
            example = (*folds)[fold][i];
            predicted = test_classifier->classify(example);
            result->add(predicted, (int)example->get_value(category_index));
        }

        overall_matrix->merge(result);
        delete test_classifier;
        delete test_set;
        delete result;
    }

    // stratify() heap-allocates the fold table; the original leaked it
    delete folds;
    return overall_matrix;
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#ifndef __data_set_h__
#define __data_set_h__
#include <tr1/unordered_map>
#include <vector>
#include <string>
#include "example.h"
#include "feature.h"
#include "features/numeric_feature.h"
#include "features/nominal_feature.h"
using namespace std;

namespace Classifier { class Classifier; }
class ConfusionMatrix;

namespace DataSet {
  // Abstract collection of examples described by a set of features, one of
  // which (category_index) holds the class label. Concrete subclasses
  // (DenseDataSet, SparseDataSet) supply the storage-specific hooks below.
  class DataSet {
    // storage-optimised implementations of count()/index()
    virtual void perform_count() {}
    virtual void perform_index() {}
  public:
    DataSet() : category_index(-1), counted(false), indexed(false) {}
    // clone constructor: copies the feature schema (deep-cloned) but no
    // examples; counts/indexes start invalidated
    DataSet(DataSet *other) : name(other->name), category_index(other->category_index), counted(false), indexed(false) {
      Feature *feature = NULL;
      for(unsigned int i = 0; i < other->features.size(); i++) {
        feature = other->features[i]->clone();
        feature_names[feature->name] = feature;
        features.push_back(feature);
      }
    }
    // virtual destructor: instances are deleted through DataSet* (e.g. in
    // cross_fold_validation), which is undefined behaviour without one.
    // NOTE(review): features/examples are intentionally NOT freed here —
    // clones share Example pointers with their parent set, so freeing them
    // would double-delete; confirm intended ownership before changing.
    virtual ~DataSet() {}
    virtual DataSet *clone_without_examples() { return NULL; }

    tr1::unordered_map<string, Feature *> feature_names;  // name -> feature lookup
    vector<Feature *> features;                           // positional feature schema
    vector<Example *> examples;
    string name;
    int category_index;   // which feature column holds the class label (-1 = unset)
    bool counted;         // count() cache flag
    bool indexed;         // index() cache flag

    NumericFeature *new_numeric_feature(string name);
    NominalFeature *new_nominal_feature(string name);
    virtual Example *new_example() { return NULL; }

    void count();
    void index();

    vector<vector<Example *> > *stratify(int number_of_folds);
    ConfusionMatrix *cross_fold_validation(Classifier::Classifier *classifier, int number_of_folds);

    string get_name() { return name; }
    void set_name(string new_name) { name = new_name; }
    int get_category_index() { return category_index; }
    int features_size() { return features.size(); }
    int examples_size() { return examples.size(); }
    int categories_size() { return ((NominalFeature *)features[category_index])->indexes.size(); }
    NominalFeature *category_feature() { return (NominalFeature *)features[category_index]; }
    void set_category_index(int index);

    Feature *get_feature_by_name(string name) { return feature_names[name]; }
    Feature *get_feature_by_index(int index) { return features[index]; }
    Example *get_example_by_index(int index) { return examples[index]; }

    // lookup by name, creating the feature if it does not exist yet
    NumericFeature *get_or_create_numeric_feature_by_name(string name) {
      NumericFeature *feature = (NumericFeature *)feature_names[name];
      if(feature == NULL)
        feature = new_numeric_feature(name);
      return feature;
    }

    NominalFeature *get_or_create_nominal_feature_by_name(string name) {
      NominalFeature *feature = (NominalFeature *)feature_names[name];
      if(feature == NULL)
        feature = new_nominal_feature(name);
      return feature;
    }
  };
}

#endif
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
#ifndef __dense_data_set_h__
#define __dense_data_set_h__
#include "dense_example.h"

namespace DataSet {
  // DataSet backed by DenseExample storage (one double per feature).
  class DenseDataSet : public DataSet {
    // feed every (value, category) pair of every example to its feature's
    // counter; the category column participates too, counting itself
    void perform_count() {
      int example_category_index = 0;
      double value = 0.0;

      for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
        example_category_index = (int)((*example)->get_value(category_index));
        for(unsigned int i = 0; i < features.size(); i++) {
          value = (*example)->get_value(i);
          features[i]->count_example(value, example_category_index);
        }
      }
    }

    // dense storage needs no extra index pass (DataSet::index() already
    // walks every example)
    void perform_index() {
    }

  public:
    DenseDataSet() : DataSet() {}
    DenseDataSet(DataSet *other) : DataSet(other) {}

    // schema-only copy (no examples); caller owns the result
    DenseDataSet *clone_without_examples() {
      return new DenseDataSet(this);
    }

    // create a zeroed example sized to the current schema and register it
    DenseExample *new_example() {
      DenseExample *example = new DenseExample(features.size());
      examples.push_back(example);
      return example;
    }
  };
}

#endif
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#ifndef __dense_data_set_example_h__
#define __dense_data_set_example_h__
#include <stdexcept>
#include <stdlib.h>
#include "dense_data_set.h"
#include <iostream>

namespace DataSet {
  // Example stored as a flat, fixed-size array of doubles, one slot per
  // feature, zero-initialised by calloc.
  class DenseExample : public Example {
    // non-copyable: `values` is an owning raw buffer, so the default
    // copy/assign would double-free it (declared, never defined — classic
    // pre-C++11 idiom matching this codebase's tr1 era)
    DenseExample(const DenseExample &);
    DenseExample &operator=(const DenseExample &);

  public:
    double *values;

    DenseExample(int size) : Example(size) {
      values = (double *) calloc(size, sizeof(double));
    }

    ~DenseExample() {
      free(values);
    }

    // unchecked access: index must be within [0, size)
    double get_value(int index) {
      return values[index];
    }

    void set_value(int index, double new_value) {
      values[index] = new_value;
    }

    // distance metrics are not implemented for dense examples yet
    double euclidean_distance(Example *other_example) {
      return 0.0;
    }

    double cosine_distance(Example *other_example) {
      return 0.0;
    }

    // debug dump: comma-separated values
    void print() {
      for(int i = 0; i < size; i++)
        cout << values[i] << ",";
      cout << endl;
    }
  };
}

#endif
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#include "data_set.h"
#include "example.h"

// Read this example's category label from the data set's designated
// category column, truncated to an integer index.
int DataSet::Example::category_index(DataSet *data_set) {
    return (int)get_value(data_set->category_index);
}

// Store `index` as this example's category label, in the data set's
// designated category column.
void DataSet::Example::set_category_index(DataSet *data_set, int index) {
    set_value(data_set->category_index, index);
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#ifndef __example_h__
#define __example_h__

namespace DataSet {
  class DataSet;
  // Base class for a single data point: a fixed-size row of feature values.
  // Concrete storage lives in DenseExample / SparseExample; the virtuals
  // here are no-op defaults rather than pure so partial implementations link.
  class Example {
  public:
    int size;   // number of feature slots

    Example(int size) : size(size) {}
    virtual ~Example() {}

    // read/write the category label via the data set's category column
    int category_index(DataSet *data_set);
    void set_category_index(DataSet *data_set, int index);
    virtual double get_value(int index) { return 0.0; }
    virtual void set_value(int index, double new_value) {}
    virtual double euclidean_distance(Example *other_example) { return 0.0; }
    virtual double cosine_distance(Example *other_example) { return 0.0; }
    virtual void print() {}
  };
}

#endif
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
#ifndef __feature_h__
|
|
2
|
+
#define __feature_h__
|
|
3
|
+
#include <string>
|
|
4
|
+
using namespace std;
|
|
5
|
+
|
|
6
|
+
namespace DataSet {
|
|
7
|
+
class DataSet;
|
|
8
|
+
|
|
9
|
+
class Feature {
|
|
10
|
+
public:
|
|
11
|
+
string name;
|
|
12
|
+
int index;
|
|
13
|
+
|
|
14
|
+
string get_name() { return name; }
|
|
15
|
+
int get_index() { return index; }
|
|
16
|
+
void set_name(string new_name) { name = new_name; }
|
|
17
|
+
void set_index(int new_index) { index = new_index; }
|
|
18
|
+
|
|
19
|
+
Feature(string name, int index) : name(name), index(index) {}
|
|
20
|
+
virtual Feature *clone() { return NULL; }
|
|
21
|
+
virtual void reset() {}
|
|
22
|
+
virtual void print() {}
|
|
23
|
+
|
|
24
|
+
// counting
|
|
25
|
+
virtual void prepare_for_counting(DataSet *data_set) {}
|
|
26
|
+
virtual void count_example(double value, int category_index) {}
|
|
27
|
+
virtual void finalise_counting(DataSet *data_set) {}
|
|
28
|
+
|
|
29
|
+
// indexing
|
|
30
|
+
virtual void prepare_for_indexing(DataSet *data_set) {}
|
|
31
|
+
virtual void index_example(double value, Example *example) {}
|
|
32
|
+
virtual void finalise_indexing(DataSet *data_set) {}
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
#endif
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#include "nominal_feature.h"
#include "data_set/data_set.h"
#include "data_set/example.h"


// Allocate the frequency/probability tables sized to the current number of
// categories and distinct values. Categories are 1-based, hence the "+ 1"
// sizing and the loop starting at 1.
void DataSet::NominalFeature::prepare_for_counting(DataSet *data_set) {
    int categories_count = data_set->categories_size();
    int values_count = names.size();

    category_frequencies.resize(categories_count + 1);
    category_probabilities.resize(categories_count + 1);
    probabilities.resize(values_count);
    frequencies.resize(values_count);

    for(int i = 1; i <= categories_count; i++) {
        category_frequencies[i].resize(values_count);
        category_probabilities[i].resize(values_count);
    }
}

// Tally one example: `value` is a value index encoded as a double.
void DataSet::NominalFeature::count_example(double value, int category_index) {
    frequencies[(int)value] += 1;
    category_frequencies[category_index][(int)value] += 1;
}

// Convert the raw tallies to probabilities with add-one (Laplace) smoothing
// on the numerators.
// NOTE(review): the denominators are not adjusted for the added pseudo-counts,
// so each distribution sums to slightly more than 1 — confirm intended.
void DataSet::NominalFeature::finalise_counting(DataSet *data_set) {
    int categories_count = data_set->categories_size();
    int examples_count = data_set->examples.size();
    int values_count = names.size();

    // overall value probabilities
    for(int i = 0; i < values_count; i++)
        probabilities[i] = ((double)frequencies[i] + 1) / examples_count;

    // value probabilities per category; the category feature itself only
    // needs the overall probabilities above
    if(index != data_set->category_index) {
        for(int i = 1; i <= categories_count; i++) {
            for(int j = 0; j < values_count; j++)
                category_probabilities[i][j] = ((double)category_frequencies[i][j] + 1) / data_set->category_feature()->value_frequency(i);
        }
    }
}

// One bucket of example pointers per distinct value.
void DataSet::NominalFeature::prepare_for_indexing(DataSet *data_set) {
    examples_with_value.resize(names.size());
}

void DataSet::NominalFeature::index_example(double value, Example *example) {
    examples_with_value[(int)value].push_back(example);
}

void DataSet::NominalFeature::finalise_indexing(DataSet *data_set) {}

// Debug dump of the value dictionary; starts at 1, suggesting value index 0
// is unused/reserved — TODO confirm against the header's index allocation.
void DataSet::NominalFeature::print() {
    for(int i = 1; i < names.size(); i++)
        cout << i << ": " << names[i] << endl;
}