thera 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
#include "multinomial_bayes_classifier.h"
|
2
|
+
#include "data_set/data_set.h"
|
3
|
+
#include <math.h>
|
4
|
+
|
5
|
+
// Score `example` against `category` for the multinomial model.
// The original body was empty — flowing off the end of a value-returning
// function is undefined behaviour — so return a neutral score until the
// real implementation lands.
// TODO: implement multinomial scoring from the probability tables that
// prepare() builds.
double Classifier::MultinomialBayesClassifier::score(DataSet::Category *category, DataSet::Example *example) {
  (void) category;
  (void) example;
  return 0.0;
}
|
8
|
+
|
9
|
+
void Classifier::MultinomialBayesClassifier::prepare() {
|
10
|
+
numeric_feature_probabilities.resize(data_set->categories_size() + 1);
|
11
|
+
nominal_feature_probabilities.resize(data_set->categories_size() + 1);
|
12
|
+
DataSet::NumericFeature *numeric_feature = NULL;
|
13
|
+
DataSet::NominalFeature *nominal_feature = NULL;
|
14
|
+
int feature_count = data_set->features.size();
|
15
|
+
double category_sum = 0.0;
|
16
|
+
|
17
|
+
data_set->count();
|
18
|
+
|
19
|
+
// determine the category probabilities for each feature
|
20
|
+
for(int i = 1; i <= data_set->categories_size(); i++) {
|
21
|
+
numeric_feature_probabilities[i].reserve(feature_count);
|
22
|
+
nominal_feature_probabilities[i].reserve(feature_count);
|
23
|
+
|
24
|
+
// sum the counts of each numeric feature for this category
|
25
|
+
category_sum = 0.0
|
26
|
+
for(int j = 0; j < numeric_features.size(); j++)
|
27
|
+
category_sum += numeric_features[j]->category_sum(i);
|
28
|
+
category_sum += numeric_features.size();
|
29
|
+
|
30
|
+
// weight each numeric feature only by the number of other numeric features; nominal features are handled separately
|
31
|
+
for(int j = 0; j < numeric_features.size(); j++)
|
32
|
+
numeric_feature_probabilities[i][j] = (1.0 + numeric_features[j]->category_sum(i)) / (category_sum);
|
33
|
+
|
34
|
+
// each value of a nominal feature is treated as if it were another feature in itself
|
35
|
+
for(int j = 0; j < nominal_features.size(); j++) {
|
36
|
+
nominal_feature = nominal_features[j];
|
37
|
+
nominal_feature_probabilities[j].resize(nominal_feature->values.size());
|
38
|
+
}
|
39
|
+
}
|
40
|
+
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#ifndef __multinomial_bayes_classifier_h__
#define __multinomial_bayes_classifier_h__
#include "classifier/classifier.h"
#include <vector>

namespace Classifier {
  // Multinomial naive Bayes classifier. Probability tables are built by
  // prepare() and indexed [category][feature] (categories are 1-based;
  // slot 0 unused).
  class MultinomialBayesClassifier : public Classifier {
    // Previously declared as unqualified `vector`, which relied on a
    // `using namespace std` leaking from another header; now explicit.
    std::vector<std::vector<double> > numeric_feature_probabilities;
    std::vector<std::vector<std::vector<double> > > nominal_feature_probabilities;

  public:
    MultinomialBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
    double score(DataSet::Category *category, DataSet::Example *example);
    void prepare();
  };
}

#endif
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#include "naive_bayes_classifier.h"
|
2
|
+
#include "data_set/data_set.h"
|
3
|
+
#include "data_set/dense/dense_data_set.h"
|
4
|
+
#include "storage/binary.h"
|
5
|
+
#include <math.h>
|
6
|
+
|
7
|
+
// Create a fresh NaiveBayesClassifier bound to `new_data_set`; used by
// cross-validation to train an independent classifier per fold.
Classifier::NaiveBayesClassifier *Classifier::NaiveBayesClassifier::clone(DataSet::DataSet *new_data_set) {
  NaiveBayesClassifier *copy = new NaiveBayesClassifier(new_data_set);
  return copy;
}
|
10
|
+
|
11
|
+
|
12
|
+
void Classifier::NaiveBayesClassifier::prepare() {
|
13
|
+
feature_caches.resize(data_set->categories_size() + 1);
|
14
|
+
int features_size = data_set->features_size();
|
15
|
+
DataSet::NumericFeature *feature = NULL;
|
16
|
+
|
17
|
+
data_set->count();
|
18
|
+
category_probabilities = data_set->category_feature()->probabilities;
|
19
|
+
|
20
|
+
for(int i = 1; i <= data_set->categories_size(); i++) {
|
21
|
+
feature_caches[i].resize(features_size);
|
22
|
+
|
23
|
+
for(int j = 0; j < features_size; j++) {
|
24
|
+
if(!numeric_features[j])
|
25
|
+
continue;
|
26
|
+
feature = (DataSet::NumericFeature *) data_set->features[j];
|
27
|
+
feature_caches[i][j].denom = 2 * feature->category_variance(i);
|
28
|
+
feature_caches[i][j].lhs = 1 / sqrt(2 * M_PI * feature->category_variance(i));
|
29
|
+
}
|
30
|
+
}
|
31
|
+
}
|
32
|
+
|
33
|
+
|
34
|
+
double Classifier::NaiveBayesClassifier::score(int category, DataSet::Example *example) {
|
35
|
+
DataSet::SparseExample *sparse_example = NULL;
|
36
|
+
DataSet::SparseExample::Value *sparse_value = NULL;
|
37
|
+
DataSet::DenseExample *dense_example = NULL;
|
38
|
+
double dense_value = 0.0, probability = 0.0;
|
39
|
+
|
40
|
+
if(typeid(*example) == typeid(DataSet::SparseExample)) {
|
41
|
+
sparse_example = (DataSet::SparseExample *) example;
|
42
|
+
for(int i = 0; i < example->size; i++) {
|
43
|
+
sparse_value = &(sparse_example->values[i]);
|
44
|
+
if(numeric_features[sparse_value->index])
|
45
|
+
score_numeric_feature(sparse_value->index, sparse_value->value, category, &probability);
|
46
|
+
else if(sparse_value->index != data_set->category_index)
|
47
|
+
score_nominal_feature(sparse_value->index, sparse_value->value, category, &probability);
|
48
|
+
}
|
49
|
+
|
50
|
+
} else {
|
51
|
+
dense_example = (DataSet::DenseExample *) example;
|
52
|
+
for(int i = 0; i < example->size; i++) {
|
53
|
+
dense_value = dense_example->get_value(i);
|
54
|
+
if(numeric_features[i])
|
55
|
+
score_numeric_feature(i, dense_value, category, &probability);
|
56
|
+
else if(i != data_set->category_index)
|
57
|
+
score_nominal_feature(i, dense_value, category, &probability);
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
return probability * category_probabilities[category];
|
62
|
+
}
|
63
|
+
|
64
|
+
// Serialise the model: category count, category priors, then one Gaussian
// cache vector per category (1-based; slot 0 unused).
// Fix: feature_caches has categories_size() + 1 slots because slot 0 is
// unused, so the real category count is size() - 1. The original wrote
// size() as the count, which made the loop below read feature_caches[size()]
// (one past the end) and disagreed with read_binary, which resizes to
// count + 1 and reads exactly `count` vectors.
void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
  int category_count = feature_caches.size() - 1;
  file->write_int(category_count);
  file->write_vector<double>(&category_probabilities);

  for(int i = 1; i <= category_count; i++)
    file->write_vector<NumericFeatureCache>(&feature_caches[i]);
}
|
72
|
+
|
73
|
+
// Restore state written by write_binary: the category count, the category
// priors, then one Gaussian cache vector per category (1-based; slot 0 of
// feature_caches is left default-constructed and unused).
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
  int category_count = file->read_int();
  feature_caches.resize(category_count + 1);
  // NOTE(review): read_vector appears to return a heap-allocated vector that
  // is dereferenced, copied, and then dropped here — if so, each call leaks.
  // Confirm against Storage::Binary::read_vector's ownership contract.
  category_probabilities = *(file->read_vector<double>());

  for(int i = 1; i <= category_count; i++)
    feature_caches[i] = *(file->read_vector<NumericFeatureCache>());
}
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#ifndef __naive_bayes_classifier_h__
#define __naive_bayes_classifier_h__
#include "classifier/classifier.h"
#include <vector>

namespace Classifier {
  // Gaussian naive Bayes classifier: numeric features are scored with a
  // normal density whose constants prepare() pre-computes; nominal features
  // are scored with the per-category value probabilities counted on the
  // data set. Supports binary (de)serialisation via write_binary/read_binary.
  class NaiveBayesClassifier : public Classifier {
    // Pre-computed constants of the Gaussian density for one
    // (category, numeric feature) pair; filled in by prepare().
    typedef struct {
      double denom; // 2 * variance
      double lhs;   // 1 / sqrt(2 * PI * variance)
    } NumericFeatureCache;
    vector<vector<NumericFeatureCache> > feature_caches; // features_caches[category_index][numeric_feature]
    vector<double> category_probabilities; // per-category priors, copied from the category feature

    // (1 / sqrt(2PI * var)) * e^(-((value - mean) ^ 2) / (2 * var))
    // Multiplies the Gaussian density of `value` into *probability. Features
    // whose cached variance is zero are skipped entirely. The first
    // contributing feature bootstraps *probability from 0.0 to 1.0 so the
    // running product starts from the multiplicative identity.
    void score_numeric_feature(int index, double value, int category, double *probability) {
      DataSet::NumericFeature *numeric_feature = (DataSet::NumericFeature *) data_set->features[index];
      if(feature_caches[category][index].denom != 0.0) {
        double numerator = -1 * pow(value - numeric_feature->category_mean(category), 2);
        if(*probability == 0.0)
          *probability = 1.0;
        *probability = *probability * (feature_caches[category][index].lhs * exp(numerator / feature_caches[category][index].denom));
      }
    }

    // Multiplies the per-category probability of the nominal value (cast to
    // its integer value index) into *probability.
    void score_nominal_feature(int index, double value, int category, double *probability) {
      DataSet::NominalFeature *nominal_feature = (DataSet::NominalFeature *) data_set->features[index];
      *probability = *probability * nominal_feature->category_value_probability(category, (int)value);
    }

  public:
    // Magic number identifying this classifier type in binary model files.
    static const uint32_t file_mark = 'naiv';
    NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
    NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);

    double score(int category, DataSet::Example *example);
    void prepare();
    void write_binary(Storage::Binary *file);
    void read_binary(Storage::Binary *file);
    uint32_t mark() { return file_mark; }

    // Debug dump of every cached (denom, lhs) pair, keyed by category and
    // feature index.
    void print() {
      cout << "NB:" << endl;
      for(unsigned int i = 0; i < feature_caches.size(); i++) {
        for(unsigned int j = 0; j < feature_caches[i].size(); j++)
          cout << "C" << i << "F" << j << ":" << feature_caches[i][j].denom << ", l:" << feature_caches[i][j].lhs << endl;
      }
    }
  };
}

#endif
|
@@ -0,0 +1,130 @@
|
|
1
|
+
#include <iostream>
|
2
|
+
#include "data_set.h"
|
3
|
+
#include "classifier/classifier.h"
|
4
|
+
#include "metrics/confusion_matrix.h"
|
5
|
+
#include "dense/dense_data_set.h"
|
6
|
+
#include "dense/dense_example.h"
|
7
|
+
#include "sparse/sparse_data_set.h"
|
8
|
+
#include "sparse/sparse_example.h"
|
9
|
+
|
10
|
+
// Create a nominal feature called `name`, register it in the name lookup
// and append it to the column list; its index is its position in `features`.
DataSet::NominalFeature *DataSet::DataSet::new_nominal_feature(string name) {
  NominalFeature *created = new NominalFeature(name, features.size());
  features.push_back(created);
  feature_names[name] = created;
  return created;
}
|
16
|
+
|
17
|
+
|
18
|
+
// Create a numeric feature called `name`, register it in the name lookup
// and append it to the column list; its index is its position in `features`.
DataSet::NumericFeature *DataSet::DataSet::new_numeric_feature(string name) {
  NumericFeature *created = new NumericFeature(name, features.size());
  features.push_back(created);
  feature_names[name] = created;
  return created;
}
|
24
|
+
|
25
|
+
|
26
|
+
// Change which column is the category. Every feature's cached statistics
// depend on the category, so reset them all and invalidate the counted /
// indexed flags so the next count()/index() recomputes from scratch.
void DataSet::DataSet::set_category_index(int index) {
  category_index = index;
  for(unsigned int i = 0; i < features.size(); i++)
    features[i]->reset();
  counted = false;
  indexed = false;
}
|
33
|
+
|
34
|
+
|
35
|
+
void DataSet::DataSet::count() {
|
36
|
+
if(counted)
|
37
|
+
return;
|
38
|
+
|
39
|
+
// initialise each feature for counting
|
40
|
+
for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
|
41
|
+
(*feature)->prepare_for_counting(this);
|
42
|
+
|
43
|
+
// implementation optimised count
|
44
|
+
perform_count();
|
45
|
+
|
46
|
+
// calculate and finalise counts
|
47
|
+
for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
|
48
|
+
(*feature)->finalise_counting(this);
|
49
|
+
|
50
|
+
counted = true;
|
51
|
+
}
|
52
|
+
|
53
|
+
|
54
|
+
void DataSet::DataSet::index() {
|
55
|
+
if(indexed)
|
56
|
+
return;
|
57
|
+
|
58
|
+
// initialise each feature for indexing
|
59
|
+
for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
|
60
|
+
(*feature)->prepare_for_indexing(this);
|
61
|
+
|
62
|
+
// index
|
63
|
+
for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
|
64
|
+
for(int i = 0; i < features.size(); i++)
|
65
|
+
features[i]->index_example((*example)->get_value(i), *example);
|
66
|
+
}
|
67
|
+
|
68
|
+
// finalise indexing
|
69
|
+
for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
|
70
|
+
(*feature)->finalise_indexing(this);
|
71
|
+
|
72
|
+
indexed = true;
|
73
|
+
}
|
74
|
+
|
75
|
+
|
76
|
+
// TODO: this isn't really stratification; categories should be proportionally represented
|
77
|
+
vector<vector<DataSet::Example *> > *DataSet::DataSet::stratify(int number_of_folds) {
|
78
|
+
int examples_per_fold = examples.size() / number_of_folds;
|
79
|
+
Example *example;
|
80
|
+
vector<vector<Example *> > *folds = new vector<vector<Example *> >(number_of_folds, vector<Example *>(examples_per_fold, NULL));
|
81
|
+
|
82
|
+
for(int fold = 0; fold < number_of_folds; fold++) {
|
83
|
+
for(int i = 0; i < examples_per_fold; i++) {
|
84
|
+
(*folds)[fold][i] = examples[fold + (i * number_of_folds)];
|
85
|
+
}
|
86
|
+
}
|
87
|
+
|
88
|
+
return folds;
|
89
|
+
}
|
90
|
+
|
91
|
+
|
92
|
+
// n-fold cross-validation: for each fold, train a cloned classifier on the
// other folds and classify the held-out fold, merging every per-fold
// confusion matrix into one overall matrix (owned by the caller).
// Fix: the fold table returned by stratify() is heap-allocated and was
// never freed — one leak per call.
ConfusionMatrix *DataSet::DataSet::cross_fold_validation(Classifier::Classifier *classifier, int number_of_folds) {
  vector<vector<Example *> > *folds = stratify(number_of_folds);
  ConfusionMatrix *overall_matrix = new ConfusionMatrix(this);
  Classifier::Classifier *test_classifier = NULL;
  ConfusionMatrix *result = NULL;
  DataSet *test_set = NULL;
  Example *example = NULL;
  int predicted = 0;

  int examples_per_fold = examples.size() / number_of_folds;
  int examples_per_test = examples_per_fold * (number_of_folds - 1);

  for(int fold = 0; fold < number_of_folds; fold++) {
    cout << "Running fold " << fold << endl;

    // training set = every fold except the held-out one, kept in fold order
    test_set = clone_without_examples();
    test_set->examples.reserve(examples_per_test);
    for(int i = 0; i < number_of_folds; i++) {
      if(i != fold)
        test_set->examples.insert(test_set->examples.begin() + (examples_per_fold * (i > fold ? i - 1 : i)), (*folds)[i].begin(), (*folds)[i].end());
    }

    test_classifier = classifier->clone(test_set);
    result = new ConfusionMatrix(test_set);
    test_classifier->prepare();

    // classify each held-out example and record predicted vs. actual
    for(int i = 0; i < examples_per_fold; i++) {
      example = (*folds)[fold][i];
      predicted = test_classifier->classify(example);
      result->add(predicted, (int)example->get_value(category_index));
    }

    overall_matrix->merge(result);
    delete test_classifier;
    delete test_set;
    delete result;
  }

  delete folds; // stratify() allocates the fold table; the original leaked it
  return overall_matrix;
}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#ifndef __data_set_h__
#define __data_set_h__
#include <tr1/unordered_map>
#include <vector>
#include <string>
#include "example.h"
#include "feature.h"
#include "features/numeric_feature.h"
#include "features/nominal_feature.h"
using namespace std;

namespace Classifier { class Classifier; }
class ConfusionMatrix;

namespace DataSet {
  // A named collection of features (columns) and examples (rows), one column
  // of which is the category. Subclasses supply the storage layout via
  // perform_count()/perform_index() and the factory methods.
  class DataSet {
    virtual void perform_count() {}
    virtual void perform_index() {}
  public:
    DataSet() : category_index(-1), counted(false), indexed(false) {}
    // Copy metadata and cloned features from `other`, but no examples.
    DataSet(DataSet *other) : name(other->name), category_index(other->category_index), counted(false), indexed(false) {
      Feature *feature = NULL;
      for(unsigned int i = 0; i < other->features.size(); i++) {
        feature = other->features[i]->clone();
        feature_names[feature->name] = feature;
        features.push_back(feature);
      }
    }
    // Data sets are deleted through DataSet* (cross_fold_validation deletes
    // its per-fold clones that way), so the destructor must be virtual —
    // without it, those deletes were undefined behaviour.
    // NOTE(review): features/examples are raw-owned and never freed here;
    // confirm whether the destructor should release them (examples may be
    // shared between clones, so it must not free those blindly).
    virtual ~DataSet() {}
    virtual DataSet *clone_without_examples() { return NULL; }

    tr1::unordered_map<string, Feature *> feature_names; // name -> feature lookup
    vector<Feature *> features;
    vector<Example *> examples;
    string name;
    int category_index; // position of the category column; -1 when unset
    bool counted;       // frequency statistics up to date?
    bool indexed;       // per-feature example indexes up to date?

    NumericFeature *new_numeric_feature(string name);
    NominalFeature *new_nominal_feature(string name);
    virtual Example *new_example() { return NULL; }

    void count();
    void index();

    vector<vector<Example *> > *stratify(int number_of_folds);
    ConfusionMatrix *cross_fold_validation(Classifier::Classifier *classifier, int number_of_folds);

    string get_name() { return name; }
    void set_name(string new_name) { name = new_name; }
    int get_category_index() { return category_index; }
    int features_size() { return features.size(); }
    int examples_size() { return examples.size(); }
    int categories_size() { return ((NominalFeature *)features[category_index])->indexes.size(); }
    NominalFeature *category_feature() { return (NominalFeature *)features[category_index]; }
    void set_category_index(int index);

    // NOTE: operator[] inserts a NULL entry when `name` is missing — the
    // get_or_create helpers below rely on that.
    Feature *get_feature_by_name(string name) { return feature_names[name]; }
    Feature *get_feature_by_index(int index) { return features[index]; }
    Example *get_example_by_index(int index) { return examples[index]; }

    NumericFeature *get_or_create_numeric_feature_by_name(string name) {
      NumericFeature *feature = (NumericFeature *)feature_names[name];
      if(feature == NULL)
        feature = new_numeric_feature(name);
      return feature;
    }

    NominalFeature *get_or_create_nominal_feature_by_name(string name) {
      NominalFeature *feature = (NominalFeature *)feature_names[name];
      if(feature == NULL)
        feature = new_nominal_feature(name);
      return feature;
    }
  };
}

#endif
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#ifndef __dense_data_set_h__
#define __dense_data_set_h__
#include "dense_example.h"

namespace DataSet {
  // DataSet backed by fixed-width DenseExample rows (one double per feature).
  class DenseDataSet : public DataSet {
    // One pass over every example, feeding each (value, category) pair to
    // its feature's counters; called from DataSet::count().
    void perform_count() {
      for(vector<Example *>::iterator it = examples.begin(); it < examples.end(); it++) {
        int example_category = (int)((*it)->get_value(category_index));
        for(unsigned int i = 0; i < features.size(); i++) {
          double value = (*it)->get_value(i);
          features[i]->count_example(value, example_category);
        }
      }
    }

    // Dense sets need no specialised index pass.
    void perform_index() {
    }

  public:
    DenseDataSet() : DataSet() {}
    DenseDataSet(DataSet *other) : DataSet(other) {}

    // New data set with cloned features but no examples.
    DenseDataSet *clone_without_examples() {
      return new DenseDataSet(this);
    }

    // Allocate a zero-filled example sized to the current feature count and
    // register it with this set.
    DenseExample *new_example() {
      DenseExample *created = new DenseExample(features.size());
      examples.push_back(created);
      return created;
    }
  };
}

#endif
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#ifndef __dense_data_set_example_h__
#define __dense_data_set_example_h__
// <stdexcept> was included before the guard, so it was re-scanned on every
// textual inclusion; moved inside the guard with the other includes.
#include <stdexcept>
#include "dense_data_set.h"
#include <iostream>

namespace DataSet {
  // Example stored as a flat, zero-initialised array of doubles, one slot
  // per feature.
  class DenseExample : public Example {
  public:
    double *values; // calloc'd in the ctor (zero-filled), freed in the dtor

    DenseExample(int size) : Example(size) {
      values = (double *) calloc(size, sizeof(double));
    }

    ~DenseExample() {
      free(values);
    }
    // NOTE(review): the implicit copy constructor/assignment would share
    // `values` and double-free it; confirm no caller copies a DenseExample
    // by value (visible code only uses heap pointers).

    double get_value(int index) {
      return values[index];
    }

    void set_value(int index, double new_value) {
      values[index] = new_value;
    }

    // Distance measures are not implemented for dense examples yet.
    double euclidean_distance(Example *other_example) {
      return 0.0;
    }

    double cosine_distance(Example *other_example) {
      return 0.0;
    }

    // Debug dump: every value, comma-separated, then a newline.
    void print() {
      for(int i = 0; i < size; i++)
        cout << values[i] << ",";
      cout << endl;
    }
  };
}

#endif
|
@@ -0,0 +1,10 @@
|
|
1
|
+
#include "data_set.h"
|
2
|
+
#include "example.h"
|
3
|
+
|
4
|
+
// Read this example's category as an integer, using the data set to locate
// the category column.
int DataSet::Example::category_index(DataSet *data_set) {
  double raw = get_value(data_set->category_index);
  return (int) raw;
}
|
7
|
+
|
8
|
+
// Write `index` into this example's category column, located via the data set.
void DataSet::Example::set_category_index(DataSet *data_set, int index) {
  int column = data_set->category_index;
  set_value(column, index);
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#ifndef __example_h__
#define __example_h__

namespace DataSet {
  class DataSet;
  // Abstract base for one row of a data set. Concrete layouts (dense,
  // sparse) override the value accessors and distance measures; the base
  // implementations are no-op placeholders.
  class Example {
  public:
    int size; // number of stored values (the feature count for dense examples)

    Example(int size) : size(size) {}
    virtual ~Example() {}

    // Read/write the category column, located via the data set
    // (implemented in example.cpp).
    int category_index(DataSet *data_set);
    void set_category_index(DataSet *data_set, int index);
    virtual double get_value(int index) { return 0.0; }
    virtual void set_value(int index, double new_value) {}
    // Distance measures between examples; overridden by concrete layouts.
    virtual double euclidean_distance(Example *other_example) { return 0.0; }
    virtual double cosine_distance(Example *other_example) { return 0.0; }
    virtual void print() {}
  };
}

#endif
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#ifndef __feature_h__
#define __feature_h__
#include <string>
using namespace std;

namespace DataSet {
  class DataSet;
  class Example; // was missing: index_example() below takes an Example*

  // Abstract base for a data-set column. Concrete features (numeric,
  // nominal) override the counting and indexing hooks, which DataSet::count()
  // and DataSet::index() drive.
  class Feature {
  public:
    string name; // column name, used as the lookup key in DataSet
    int index;   // column position within DataSet::features

    string get_name() { return name; }
    int get_index() { return index; }
    void set_name(string new_name) { name = new_name; }
    void set_index(int new_index) { index = new_index; }

    Feature(string name, int index) : name(name), index(index) {}
    // Derived features are used polymorphically through Feature*; a virtual
    // destructor makes delete-through-base safe (the original had virtual
    // methods but no virtual destructor).
    virtual ~Feature() {}
    virtual Feature *clone() { return NULL; }
    virtual void reset() {}
    virtual void print() {}

    // counting hooks, driven by DataSet::count()
    virtual void prepare_for_counting(DataSet *data_set) {}
    virtual void count_example(double value, int category_index) {}
    virtual void finalise_counting(DataSet *data_set) {}

    // indexing hooks, driven by DataSet::index()
    virtual void prepare_for_indexing(DataSet *data_set) {}
    virtual void index_example(double value, Example *example) {}
    virtual void finalise_indexing(DataSet *data_set) {}
  };
}

#endif
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#include "nominal_feature.h"
|
2
|
+
#include "data_set/data_set.h"
|
3
|
+
#include "data_set/example.h"
|
4
|
+
|
5
|
+
|
6
|
+
// Size the frequency/probability tables for a counting pass: the overall
// per-value tallies, plus one row per category (1-based; row 0 unused) by
// one column per distinct value.
void DataSet::NominalFeature::prepare_for_counting(DataSet *data_set) {
  int category_count = data_set->categories_size();
  int value_count = names.size();

  frequencies.resize(value_count);
  probabilities.resize(value_count);
  category_frequencies.resize(category_count + 1);
  category_probabilities.resize(category_count + 1);

  for(int category = 1; category <= category_count; category++) {
    category_frequencies[category].resize(value_count);
    category_probabilities[category].resize(value_count);
  }
}
|
20
|
+
|
21
|
+
// Record one observation: `value` is the (integer) value index, tallied both
// overall and for the example's category.
void DataSet::NominalFeature::count_example(double value, int category_index) {
  int value_index = (int) value;
  frequencies[value_index]++;
  category_frequencies[category_index][value_index]++;
}
|
25
|
+
|
26
|
+
// Turn raw frequencies into probabilities with add-one (Laplace) smoothing.
void DataSet::NominalFeature::finalise_counting(DataSet *data_set) {
  int categories_count = data_set->categories_size();
  int examples_count = data_set->examples.size();
  int values_count = names.size();

  // overall value probabilities
  // NOTE(review): the numerator is smoothed (+1) but the denominator is not
  // adjusted by values_count, so these probabilities can sum to more than 1
  // — confirm this asymmetric smoothing is intentional.
  for(int i = 0; i < values_count; i++)
    probabilities[i] = ((double)frequencies[i] + 1) / examples_count;

  // value probabilities per category; the category column itself gets no
  // per-category table
  if(index != data_set->category_index) {
    for(int i = 1; i <= categories_count; i++) {
      for(int j = 0; j < values_count; j++)
        category_probabilities[i][j] = ((double)category_frequencies[i][j] + 1) / data_set->category_feature()->value_frequency(i);
    }
  }
}
|
43
|
+
|
44
|
+
// Allocate one example bucket per distinct value ahead of an indexing pass.
void DataSet::NominalFeature::prepare_for_indexing(DataSet *data_set) {
  int value_count = names.size();
  examples_with_value.resize(value_count);
}
|
47
|
+
|
48
|
+
// Bucket `example` under its (integer) value index for this feature.
void DataSet::NominalFeature::index_example(double value, Example *example) {
  int value_index = (int) value;
  examples_with_value[value_index].push_back(example);
}
|
51
|
+
|
52
|
+
// Nothing to finalise: the value buckets are complete once every example
// has been indexed.
void DataSet::NominalFeature::finalise_indexing(DataSet *data_set) {}
|
53
|
+
|
54
|
+
void DataSet::NominalFeature::print() {
|
55
|
+
for(int i = 1; i < names.size(); i++)
|
56
|
+
cout << i << ": " << names[i] << endl;
|
57
|
+
}
|