thera 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
#ifndef __nominal_feature_h__
|
2
|
+
#define __nominal_feature_h__
|
3
|
+
#include "data_set/example.h"
|
4
|
+
#include "data_set/feature.h"
|
5
|
+
#include <vector>
|
6
|
+
#include <map>
|
7
|
+
#include <iostream>
|
8
|
+
|
9
|
+
namespace DataSet {
|
10
|
+
class DataSet;
|
11
|
+
|
12
|
+
class NominalFeature : public Feature {
|
13
|
+
public:
|
14
|
+
NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
|
15
|
+
NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
|
16
|
+
|
17
|
+
NominalFeature *clone() {
|
18
|
+
return new NominalFeature(this);
|
19
|
+
}
|
20
|
+
|
21
|
+
void reset() {
|
22
|
+
frequencies.clear();
|
23
|
+
probabilities.clear();
|
24
|
+
category_frequencies.clear();
|
25
|
+
category_probabilities.clear();
|
26
|
+
examples_with_value.clear();
|
27
|
+
}
|
28
|
+
|
29
|
+
void print();
|
30
|
+
|
31
|
+
// nominal values are referenced by index in examples
|
32
|
+
// (e.g "CategoryA" -> 2; would be stored as 2 in an example)
|
33
|
+
map<string, int> indexes;
|
34
|
+
vector<string> names;
|
35
|
+
void add_value(string name) {
|
36
|
+
int index = indexes.size() + 1;
|
37
|
+
indexes[name] = index;
|
38
|
+
names.push_back(name);
|
39
|
+
}
|
40
|
+
|
41
|
+
int value_index(string name) {
|
42
|
+
int index = indexes[name];
|
43
|
+
if(index == 0) {
|
44
|
+
index = indexes.size();
|
45
|
+
indexes[name] = index;
|
46
|
+
names.push_back(name);
|
47
|
+
}
|
48
|
+
return index;
|
49
|
+
}
|
50
|
+
|
51
|
+
// counts
|
52
|
+
void prepare_for_counting(DataSet *data_set);
|
53
|
+
void count_example(double value, int category_index);
|
54
|
+
void finalise_counting(DataSet *data_set);
|
55
|
+
|
56
|
+
// counts for this feature over the entire data set
|
57
|
+
vector<int> frequencies;
|
58
|
+
vector<double> probabilities;
|
59
|
+
int value_frequency(int index) { return frequencies[index]; }
|
60
|
+
double value_probability(int index) { return probabilities[index]; }
|
61
|
+
|
62
|
+
// counts for this feature per category
|
63
|
+
vector<vector<int> > category_frequencies;
|
64
|
+
vector<vector<double> > category_probabilities;
|
65
|
+
int category_value_frequency(int category, int index) { return category_frequencies[category][index]; }
|
66
|
+
double category_value_probability(int category, int index) { return category_probabilities[category][index]; }
|
67
|
+
|
68
|
+
// indexes
|
69
|
+
void prepare_for_indexing(DataSet *data_set);
|
70
|
+
void index_example(double value, Example *example);
|
71
|
+
void finalise_indexing(DataSet *data_set);
|
72
|
+
vector<vector<Example *> > examples_with_value;
|
73
|
+
};
|
74
|
+
}
|
75
|
+
|
76
|
+
#endif
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#include "data_set/data_set.h"
|
2
|
+
#include "data_set/example.h"
|
3
|
+
#include "numeric_feature.h"
|
4
|
+
#include "stdlib.h"
|
5
|
+
|
6
|
+
// Allocates the zero-initialised per-category counters: one slot per
// category plus slot 0 (finalise_counting addresses categories from 1).
void DataSet::NumericFeature::prepare_for_counting(DataSet *data_set) {
  // Release a buffer left over from an earlier count so that counting twice
  // without reset() does not leak it. free(NULL) is a no-op.
  free(category_counts);
  category_counts = (Counts *) calloc(data_set->categories_size() + 1, sizeof(Counts));
}
|
9
|
+
|
10
|
+
void DataSet::NumericFeature::count_example(double value, int category_index) {
|
11
|
+
// non zero count
|
12
|
+
if(value != 0.0) {
|
13
|
+
counts.non_zero_count++;
|
14
|
+
category_counts[category_index].non_zero_count++;
|
15
|
+
}
|
16
|
+
|
17
|
+
// minima
|
18
|
+
if(value < counts.min)
|
19
|
+
counts.min = value;
|
20
|
+
if(value < category_counts[category_index].min)
|
21
|
+
category_counts[category_index].min = value;
|
22
|
+
|
23
|
+
// maxima
|
24
|
+
if(value > counts.max)
|
25
|
+
counts.max = value;
|
26
|
+
if(value > category_counts[category_index].max)
|
27
|
+
category_counts[category_index].max = value;
|
28
|
+
|
29
|
+
// sum
|
30
|
+
counts.sum += value;
|
31
|
+
category_counts[category_index].sum += value;
|
32
|
+
|
33
|
+
// squared sum
|
34
|
+
counts.sq_sum += (value * value);
|
35
|
+
category_counts[category_index].sq_sum += (value * value);
|
36
|
+
}
|
37
|
+
|
38
|
+
// Derives mean and variance from the accumulated sums, for the whole data
// set and for every category (categories are numbered from 1).
//
// A data set with no examples, or a category with no examples, gets a mean
// and variance of 0 instead of the NaN a division by zero would produce.
void DataSet::NumericFeature::finalise_counting(DataSet *data_set) {
  int categories_count = data_set->categories_size();
  int examples_count = data_set->examples.size();

  // whole data set
  if(examples_count > 0) {
    counts.mean = counts.sum / examples_count;
    counts.variance = (counts.sq_sum / examples_count) - (counts.mean * counts.mean);
    // E[x^2] - E[x]^2 can come out marginally negative through rounding
    if(counts.variance < 0.0)
      counts.variance = 0.0;
  } else {
    counts.mean = 0.0;
    counts.variance = 0.0;
  }

  // per category
  for(int i = 1; i <= categories_count; i++) {
    Counts &category = category_counts[i];
    const double frequency = data_set->category_feature()->value_frequency(i);

    if(frequency > 0.0) {
      category.mean = category.sum / frequency;
      category.variance = (category.sq_sum / frequency) - (category.mean * category.mean);
      if(category.variance < 0.0)
        category.variance = 0.0;
    } else {
      category.mean = 0.0;
      category.variance = 0.0;
    }
  }
}
|
52
|
+
|
53
|
+
// No-op: this implementation does nothing before indexing starts.
void DataSet::NumericFeature::prepare_for_indexing(DataSet *data_set) {}
|
54
|
+
|
55
|
+
void DataSet::NumericFeature::index_example(double value, Example *example) {
|
56
|
+
if(value != 0.0)
|
57
|
+
non_zero_examples.push_back(example);
|
58
|
+
}
|
59
|
+
|
60
|
+
// No-op: this implementation does nothing once indexing has finished.
void DataSet::NumericFeature::finalise_indexing(DataSet *data_set) {}
|
61
|
+
|
62
|
+
// Writes the feature header and its overall counts to stdout, followed by one
// "C<i>:" line of counts per category slot (slot 0 included).
void DataSet::NumericFeature::print(DataSet::DataSet *data_set) {
  cout << "F" << index << ", " << name << endl;
  print_counts(&counts);

  // category_counts stays NULL until prepare_for_counting() has run (and is
  // NULL again after reset()); there is nothing per-category to print then.
  if(category_counts == NULL)
    return;

  const int slots = data_set->categories_size() + 1;
  for(int i = 0; i < slots; i++) {
    cout << "C" << i << ":";
    print_counts(&(category_counts[i]));
  }
}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#ifndef __numeric_feature_h__
|
2
|
+
#define __numeric_feature_h__
|
3
|
+
#include "data_set/example.h"
|
4
|
+
#include "data_set/feature.h"
|
5
|
+
#include <iostream>
|
6
|
+
|
7
|
+
namespace DataSet {
|
8
|
+
class DataSet;
|
9
|
+
|
10
|
+
class NumericFeature : public Feature {
|
11
|
+
public:
|
12
|
+
NumericFeature(string name, int index) : Feature(name, index), category_counts(NULL), non_zero_examples() {
|
13
|
+
reset();
|
14
|
+
}
|
15
|
+
|
16
|
+
NumericFeature *clone() {
|
17
|
+
return new NumericFeature(name, index);
|
18
|
+
}
|
19
|
+
|
20
|
+
void reset() {
|
21
|
+
memset(&counts, 0, sizeof(Counts));
|
22
|
+
if(category_counts != NULL)
|
23
|
+
free(category_counts);
|
24
|
+
category_counts = NULL;
|
25
|
+
non_zero_examples.clear();
|
26
|
+
}
|
27
|
+
|
28
|
+
void prepare_for_counting(DataSet *data_set);
|
29
|
+
void count_example(double value, int category_index);
|
30
|
+
void finalise_counting(DataSet *data_set);
|
31
|
+
|
32
|
+
void prepare_for_indexing(DataSet *data_set);
|
33
|
+
void index_example(double value, Example *example);
|
34
|
+
void finalise_indexing(DataSet *data_set);
|
35
|
+
|
36
|
+
// counts
|
37
|
+
typedef struct {
|
38
|
+
int non_zero_count;
|
39
|
+
double sum;
|
40
|
+
double sq_sum;
|
41
|
+
double min;
|
42
|
+
double max;
|
43
|
+
double mean;
|
44
|
+
double variance;
|
45
|
+
} Counts;
|
46
|
+
|
47
|
+
// counts for this feature over the entire data set
|
48
|
+
Counts counts;
|
49
|
+
int non_zero_count() { return counts.non_zero_count; }
|
50
|
+
double sum() { return counts.sum; }
|
51
|
+
double sq_sum() { return counts.sq_sum; }
|
52
|
+
double min() { return counts.min; }
|
53
|
+
double max() { return counts.max; }
|
54
|
+
double mean() { return counts.mean; }
|
55
|
+
double variance() { return counts.variance; }
|
56
|
+
|
57
|
+
// counts for this feature per category
|
58
|
+
Counts *category_counts;
|
59
|
+
int category_non_zero_count(int index) { return category_counts[index].non_zero_count; }
|
60
|
+
double category_sum(int index) { return category_counts[index].sum; }
|
61
|
+
double category_sq_sum(int index) { return category_counts[index].sq_sum; }
|
62
|
+
double category_min(int index) { return category_counts[index].min; }
|
63
|
+
double category_max(int index) { return category_counts[index].max; }
|
64
|
+
double category_mean(int index) { return category_counts[index].mean; }
|
65
|
+
double category_variance(int index) { return category_counts[index].variance; }
|
66
|
+
|
67
|
+
void print(DataSet *data_set);
|
68
|
+
|
69
|
+
void print_counts(Counts *c) {
|
70
|
+
cout << c->non_zero_count << ";" << c->sum << ";" << c->sq_sum << ";" << c->min << ";" << c->max << ";" << c->mean << ";" << c->variance << endl;
|
71
|
+
}
|
72
|
+
|
73
|
+
// indexes
|
74
|
+
vector<Example *> non_zero_examples;
|
75
|
+
};
|
76
|
+
}
|
77
|
+
|
78
|
+
#endif
|
@@ -0,0 +1,40 @@
|
|
1
|
+
#ifndef __sparse_data_set_h__
|
2
|
+
#define __sparse_data_set_h__
|
3
|
+
#include "data_set/data_set.h"
|
4
|
+
#include "sparse_example.h"
|
5
|
+
|
6
|
+
namespace DataSet {
|
7
|
+
class SparseDataSet : public DataSet {
|
8
|
+
void perform_count() {
|
9
|
+
int example_category_index = 0;
|
10
|
+
SparseExample::Value *value;
|
11
|
+
|
12
|
+
for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
|
13
|
+
example_category_index = (int)((*example)->get_value(category_index));
|
14
|
+
for(int i = 0; i < (*example)->size; i++) {
|
15
|
+
value = &(((SparseExample *)(*example))->values[i]);
|
16
|
+
features[value->index]->count_example(value->value, example_category_index);
|
17
|
+
}
|
18
|
+
}
|
19
|
+
}
|
20
|
+
|
21
|
+
void perform_index() {
|
22
|
+
}
|
23
|
+
|
24
|
+
public:
|
25
|
+
SparseDataSet() : DataSet() {}
|
26
|
+
SparseDataSet(DataSet *other) : DataSet(other) {}
|
27
|
+
|
28
|
+
SparseDataSet *clone_without_examples() {
|
29
|
+
return new SparseDataSet(this);
|
30
|
+
}
|
31
|
+
|
32
|
+
SparseExample *new_example(int buffer_size = 0) {
|
33
|
+
SparseExample *example = new SparseExample(buffer_size);
|
34
|
+
examples.push_back(example);
|
35
|
+
return example;
|
36
|
+
}
|
37
|
+
};
|
38
|
+
}
|
39
|
+
|
40
|
+
#endif
|
@@ -0,0 +1,82 @@
|
|
1
|
+
#include "sparse_data_set.h"
|
2
|
+
#include "sparse_example.h"
|
3
|
+
#include <stdlib.h>
|
4
|
+
|
5
|
+
double DataSet::SparseExample::get_value(int feature_index) {
|
6
|
+
if(feature_index == 0 && size != 0)
|
7
|
+
return values[0].value;
|
8
|
+
|
9
|
+
int low = 0;
|
10
|
+
int high = size - 1;
|
11
|
+
int mid = high / 2;
|
12
|
+
|
13
|
+
// branch prediction makes this triple clause if statement faster
|
14
|
+
// than a double clause "single comparison" search. precondition
|
15
|
+
// loops also seem to be faster than post condition loops in GCC,
|
16
|
+
// really don't know why... this implementation ends up being
|
17
|
+
// around 30% faster than well known single comparison versions.
|
18
|
+
while(low <= high) {
|
19
|
+
if(values[mid].index < feature_index) {
|
20
|
+
low = mid + 1;
|
21
|
+
} else if(values[mid].index > feature_index) {
|
22
|
+
high = mid - 1;
|
23
|
+
} else {
|
24
|
+
return values[mid].value;
|
25
|
+
}
|
26
|
+
mid = (high + low) / 2;
|
27
|
+
}
|
28
|
+
|
29
|
+
return 0.0;
|
30
|
+
}
|
31
|
+
|
32
|
+
double DataSet::SparseExample::get_value(string feature_name, SparseDataSet *data_set) {
|
33
|
+
return get_value(data_set->get_feature_by_name(feature_name)->index);
|
34
|
+
}
|
35
|
+
|
36
|
+
void DataSet::SparseExample::set_value(int feature_index, double new_value) {
|
37
|
+
int i = 0;
|
38
|
+
|
39
|
+
for(; i < size; i++) {
|
40
|
+
if(values[i].index == feature_index) {
|
41
|
+
values[i].value = new_value;
|
42
|
+
return;
|
43
|
+
} else if(values[i].index > feature_index) {
|
44
|
+
break;
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
if(buffer_size == size)
|
49
|
+
values = (Value *) realloc(values, sizeof(Value) * (++buffer_size));
|
50
|
+
|
51
|
+
if(i != size)
|
52
|
+
memcpy(&values[i + 1], &values[i], (size - i) * sizeof(Value));
|
53
|
+
|
54
|
+
values[i].index = feature_index;
|
55
|
+
values[i].value = new_value;
|
56
|
+
size++;
|
57
|
+
}
|
58
|
+
|
59
|
+
void DataSet::SparseExample::append_value(int feature_index, double new_value) {
|
60
|
+
if(buffer_size == size)
|
61
|
+
values = (Value *) realloc(values, sizeof(Value) * (++buffer_size));
|
62
|
+
values[size].index = feature_index;
|
63
|
+
values[size].value = new_value;
|
64
|
+
size++;
|
65
|
+
}
|
66
|
+
|
67
|
+
// Stub: no distance is computed; every call returns 0.0 and other_example is
// ignored.
double DataSet::SparseExample::euclidean_distance(Example *other_example) {
  (void) other_example;
  return 0.0;
}
|
70
|
+
|
71
|
+
// Stub: no distance is computed; every call returns 0.0 and other_example is
// ignored.
double DataSet::SparseExample::cosine_distance(Example *other_example) {
  (void) other_example;
  return 0.0;
}
|
74
|
+
|
75
|
+
void DataSet::SparseExample::print() {
|
76
|
+
for(int i = 0; i < size; i++) {
|
77
|
+
cout << values[i].index << ":" << values[i].value;
|
78
|
+
if(i < (size - 1))
|
79
|
+
cout << ",";
|
80
|
+
}
|
81
|
+
cout << endl;
|
82
|
+
}
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#ifndef __sparse_data_set_example_h__
|
2
|
+
#define __sparse_data_set_example_h__
|
3
|
+
#include "data_set/example.h"
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <string>
|
6
|
+
using namespace std;
|
7
|
+
|
8
|
+
namespace DataSet {
|
9
|
+
class SparseDataSet;
|
10
|
+
|
11
|
+
class SparseExample : public Example {
|
12
|
+
public:
|
13
|
+
typedef struct {
|
14
|
+
int index;
|
15
|
+
double value;
|
16
|
+
} Value;
|
17
|
+
|
18
|
+
Value *values;
|
19
|
+
int buffer_size;
|
20
|
+
|
21
|
+
SparseExample(int buffer_size = 0) : Example(0), buffer_size(buffer_size) {
|
22
|
+
if(buffer_size > 0)
|
23
|
+
values = (Value *) calloc(sizeof(Value), buffer_size);
|
24
|
+
else
|
25
|
+
values = NULL;
|
26
|
+
}
|
27
|
+
|
28
|
+
double get_value(int feature_index);
|
29
|
+
double get_value(string feature_name, SparseDataSet *data_set);
|
30
|
+
void set_value(int feature_index, double new_value);
|
31
|
+
void append_value(int feature_index, double new_value);
|
32
|
+
double euclidean_distance(Example *other_example);
|
33
|
+
double cosine_distance(Example *other_example);
|
34
|
+
void print();
|
35
|
+
};
|
36
|
+
}
|
37
|
+
|
38
|
+
#endif
|
@@ -0,0 +1,129 @@
|
|
1
|
+
#include "confusion_matrix.h"
|
2
|
+
#include <iostream>
|
3
|
+
const string ConfusionMatrix::average_row_name = "Average";
|
4
|
+
|
5
|
+
// TODO: CM should reference a classifier, not a data set
|
6
|
+
|
7
|
+
// Builds an all-zero square matrix with one row and one column per category
// of data_set. Initialisers are listed in member declaration order.
ConfusionMatrix::ConfusionMatrix(DataSet::DataSet *data_set) : data_set(data_set), correct(0), incorrect(0) {
  const int categories = data_set->categories_size();
  counts.assign(categories, valarray<int>(0, categories));
}
|
13
|
+
|
14
|
+
void ConfusionMatrix::add(int predicted, int actual) {
|
15
|
+
// category indexes are 1 based
|
16
|
+
counts[predicted - 1][actual - 1] += 1;
|
17
|
+
if(predicted == actual)
|
18
|
+
correct++;
|
19
|
+
else
|
20
|
+
incorrect++;
|
21
|
+
}
|
22
|
+
|
23
|
+
double ConfusionMatrix::accuracy() {
|
24
|
+
return ((double)correct) / (correct + incorrect);
|
25
|
+
}
|
26
|
+
|
27
|
+
double ConfusionMatrix::error() {
|
28
|
+
return ((double)incorrect) / (correct + incorrect);
|
29
|
+
}
|
30
|
+
|
31
|
+
// true positive
|
32
|
+
int ConfusionMatrix::tp(int category) {
|
33
|
+
return counts[category - 1][category - 1];
|
34
|
+
}
|
35
|
+
|
36
|
+
// false positive
|
37
|
+
int ConfusionMatrix::fp(int category) {
|
38
|
+
return counts[category - 1].sum() - tp(category);
|
39
|
+
}
|
40
|
+
|
41
|
+
// true negative
|
42
|
+
int ConfusionMatrix::tn(int category) {
|
43
|
+
int sum = 0, count = data_set->categories_size();
|
44
|
+
for(int i = 1; i <= count; i++)
|
45
|
+
for(int j = 1; j <= count; j++)
|
46
|
+
if(i != category && j != category)
|
47
|
+
sum += counts[i - 1][j - 1];
|
48
|
+
return sum;
|
49
|
+
}
|
50
|
+
|
51
|
+
// false negative
|
52
|
+
int ConfusionMatrix::fn(int category) {
|
53
|
+
int sum = 0, count = data_set->categories_size();
|
54
|
+
for(int i = 1; i <= count; i++)
|
55
|
+
if(i != category)
|
56
|
+
sum += counts[i - 1][category - 1];
|
57
|
+
return sum;
|
58
|
+
}
|
59
|
+
|
60
|
+
double ConfusionMatrix::precision(int category) {
|
61
|
+
int denom = tp(category) + fp(category);
|
62
|
+
if(denom == 0)
|
63
|
+
return 0.0;
|
64
|
+
return ((double)tp(category)) / denom;
|
65
|
+
}
|
66
|
+
|
67
|
+
double ConfusionMatrix::recall(int category) {
|
68
|
+
int denom = tp(category) + fn(category);
|
69
|
+
if(denom == 0)
|
70
|
+
return 0.0;
|
71
|
+
return ((double)tp(category)) / denom;
|
72
|
+
}
|
73
|
+
|
74
|
+
double ConfusionMatrix::fscore(int category) {
|
75
|
+
double p = precision(category);
|
76
|
+
double r = recall(category);
|
77
|
+
if((p + r) == 0.0)
|
78
|
+
return 0.0;
|
79
|
+
return (2 * p * r) / (p + r);
|
80
|
+
}
|
81
|
+
|
82
|
+
void ConfusionMatrix::print_summary() {
|
83
|
+
// overall counts and summary
|
84
|
+
cout.precision(4);
|
85
|
+
cout << "== Summary ==" << endl;
|
86
|
+
cout << setw(23) <<"Correctly classified:" << setw(12) << right << correct << setw(10) << right << accuracy() * 100 << "%" << endl;
|
87
|
+
cout << setw(23) << "Incorrectly classified:" << setw(12) << right << incorrect << setw(10) << right << error() * 100 << "%" << endl;
|
88
|
+
cout << setw(23) << "Total classifications:" << setw(12) << right << correct + incorrect << endl << endl;
|
89
|
+
|
90
|
+
// determine the width of the left (category name) column
|
91
|
+
int max_name_length = 0;
|
92
|
+
for(int category = 1; category <= data_set->categories_size(); category++)
|
93
|
+
if(data_set->category_feature()->names[category].length() > max_name_length)
|
94
|
+
max_name_length = data_set->category_feature()->names[category].length();
|
95
|
+
if(average_row_name.length() > max_name_length)
|
96
|
+
max_name_length = average_row_name.length();
|
97
|
+
max_name_length += 1;
|
98
|
+
|
99
|
+
// detailed category information
|
100
|
+
cout << "== Category Performance ==" << endl;
|
101
|
+
cout << setw(max_name_length) << "";
|
102
|
+
cout << setw(9) << right << "True +";
|
103
|
+
cout << setw(9) << right << "False +";
|
104
|
+
cout << setw(9) << right << "True -";
|
105
|
+
cout << setw(9) << right << "False -";
|
106
|
+
cout << setw(9) << right << "Precis.";
|
107
|
+
cout << setw(9) << right << "Recall";
|
108
|
+
cout << setw(9) << right << "F-score" << endl;
|
109
|
+
|
110
|
+
for(int category = 1; category <= data_set->categories_size(); category++) {
|
111
|
+
cout << setw(max_name_length) << data_set->category_feature()->names[category];
|
112
|
+
cout << setw(9) << tp(category);
|
113
|
+
cout << setw(9) << fp(category);
|
114
|
+
cout << setw(9) << tn(category);
|
115
|
+
cout << setw(9) << fn(category);
|
116
|
+
cout << setw(8) << precision(category) * 100 << "%";
|
117
|
+
cout << setw(8) << recall(category) * 100 << "%";
|
118
|
+
cout << setw(8) << fscore(category) * 100 << "%" << endl;
|
119
|
+
}
|
120
|
+
|
121
|
+
cout << setw(max_name_length) << average_row_name;
|
122
|
+
cout << setw(9) << avg_tp();
|
123
|
+
cout << setw(9) << avg_fp();
|
124
|
+
cout << setw(9) << avg_tn();
|
125
|
+
cout << setw(9) << avg_fn();
|
126
|
+
cout << setw(8) << avg_precision() * 100 << "%";
|
127
|
+
cout << setw(8) << avg_recall() * 100 << "%";
|
128
|
+
cout << setw(8) << avg_fscore() * 100 << "%" << endl;
|
129
|
+
}
|
@@ -0,0 +1,82 @@
|
|
1
|
+
#ifndef __confusion_matrix__
|
2
|
+
#define __confusion_matrix__
|
3
|
+
#include "data_set/data_set.h"
|
4
|
+
#include <vector>
|
5
|
+
#include <valarray>
|
6
|
+
#include <iostream>
|
7
|
+
#include <iomanip>
|
8
|
+
using namespace std;
|
9
|
+
|
10
|
+
namespace DataSet {
|
11
|
+
class Category;
|
12
|
+
}
|
13
|
+
|
14
|
+
class ConfusionMatrix {
|
15
|
+
public:
|
16
|
+
static const string average_row_name;
|
17
|
+
DataSet::DataSet *data_set;
|
18
|
+
vector<valarray<int> > counts;
|
19
|
+
int correct;
|
20
|
+
int incorrect;
|
21
|
+
|
22
|
+
ConfusionMatrix(DataSet::DataSet *data_set);
|
23
|
+
void merge(ConfusionMatrix *other) {
|
24
|
+
incorrect += other->incorrect;
|
25
|
+
correct += other->correct;
|
26
|
+
|
27
|
+
for(unsigned int i = 0; i < counts.size(); i++)
|
28
|
+
counts[i] += other->counts[i];
|
29
|
+
}
|
30
|
+
|
31
|
+
void add(int predicted, int actual);
|
32
|
+
double accuracy();
|
33
|
+
double error();
|
34
|
+
int tp(int category);
|
35
|
+
int fp(int category);
|
36
|
+
int tn(int category);
|
37
|
+
int fn(int category);
|
38
|
+
double precision(int category);
|
39
|
+
double recall(int category);
|
40
|
+
double fscore(int category);
|
41
|
+
void print_summary();
|
42
|
+
|
43
|
+
// averages
|
44
|
+
double avg_tp() {
|
45
|
+
return apply<int>(&ConfusionMatrix::tp);
|
46
|
+
}
|
47
|
+
|
48
|
+
double avg_fp() {
|
49
|
+
return apply<int>(&ConfusionMatrix::fp);
|
50
|
+
}
|
51
|
+
|
52
|
+
double avg_tn() {
|
53
|
+
return apply<int>(&ConfusionMatrix::tn);
|
54
|
+
}
|
55
|
+
|
56
|
+
double avg_fn() {
|
57
|
+
return apply<int>(&ConfusionMatrix::fn);
|
58
|
+
}
|
59
|
+
|
60
|
+
double avg_precision() {
|
61
|
+
return apply<double>(&ConfusionMatrix::precision);
|
62
|
+
}
|
63
|
+
|
64
|
+
double avg_recall() {
|
65
|
+
return apply<double>(&ConfusionMatrix::recall);
|
66
|
+
}
|
67
|
+
|
68
|
+
double avg_fscore() {
|
69
|
+
return apply<double>(&ConfusionMatrix::fscore);
|
70
|
+
}
|
71
|
+
|
72
|
+
protected:
|
73
|
+
template <class T, class Function>
|
74
|
+
double apply(Function func) {
|
75
|
+
T result = 0.0;
|
76
|
+
for(int category = 1; category <= data_set->categories_size(); category++)
|
77
|
+
result += (this->*func)(category);
|
78
|
+
return result / ((double)counts.size());
|
79
|
+
}
|
80
|
+
};
|
81
|
+
|
82
|
+
#endif
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include "model.h"
|
2
|
+
|
3
|
+
// Stub: not implemented; the example is ignored.
void Model::Model::train(DataSet::Example *example) {
  (void) example;
}
|
5
|
+
|
6
|
+
// Stub: not implemented; the text is ignored.
void Model::Model::train_text(string text) {
  (void) text;
}
|
8
|
+
|
9
|
+
// Hands the example to the configured classifier and returns the category it
// picks. The classifier pointer is used without a NULL check.
int Model::Model::classify(DataSet::Example *example) {
  const int category = classifier->classify(example);
  return category;
}
|
12
|
+
|
13
|
+
int Model::Model::classify_text(string text) {
|
14
|
+
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
|
15
|
+
int category = classifier->classify(example);
|
16
|
+
delete example;
|
17
|
+
return category;
|
18
|
+
}
|
19
|
+
|
20
|
+
// Hands the example to the configured classifier and returns the score list
// it produces. The classifier pointer is used without a NULL check.
vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
  vector<Classifier::Score> *ranks = classifier->rank(example);
  return ranks;
}
|
23
|
+
|
24
|
+
// Turns `text` into an example with the text pipeline and returns the score
// list the classifier produces for it. The temporary example is deleted
// before returning. The pipeline and classifier pointers are used without a
// NULL check.
vector<Classifier::Score> *Model::Model::rank_text(string text) {
  // The pipeline is handed a char *. c_str() only exposes read-only storage,
  // so give it a private, writable, NUL-terminated copy instead of casting
  // the const away (the pipeline presumably edits the text in place).
  vector<char> buffer(text.begin(), text.end());
  buffer.push_back('\0');

  DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, &buffer[0]);
  vector<Classifier::Score> *ranks = classifier->rank(example);
  delete example;
  return ranks;
}
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#ifndef __model_h__
|
2
|
+
#define __model_h__
|
3
|
+
#include "data_set/data_set.h"
|
4
|
+
#include "data_set/example.h"
|
5
|
+
#include "classifier/classifier.h"
|
6
|
+
#include "preprocessing/text/text_pipeline.h"
|
7
|
+
|
8
|
+
namespace Model {
|
9
|
+
class Model {
|
10
|
+
public:
|
11
|
+
DataSet::DataSet *data_set;
|
12
|
+
Classifier::Classifier *classifier;
|
13
|
+
Preprocessing::Text::TextPipeline *text_pipeline;
|
14
|
+
|
15
|
+
Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
|
16
|
+
|
17
|
+
void train(DataSet::Example *example);
|
18
|
+
void train_text(string text);
|
19
|
+
int classify(DataSet::Example *example);
|
20
|
+
int classify_text(string text);
|
21
|
+
vector<Classifier::Score> *rank(DataSet::Example *example);
|
22
|
+
vector<Classifier::Score> *rank_text(string example);
|
23
|
+
|
24
|
+
void set_data_set(DataSet::DataSet *ds) {
|
25
|
+
data_set = ds;
|
26
|
+
}
|
27
|
+
|
28
|
+
DataSet::DataSet *get_data_set() {
|
29
|
+
return data_set;
|
30
|
+
}
|
31
|
+
|
32
|
+
void set_classifier(Classifier::Classifier *c) {
|
33
|
+
classifier = c;
|
34
|
+
}
|
35
|
+
|
36
|
+
Classifier::Classifier *get_classifier() {
|
37
|
+
return classifier;
|
38
|
+
}
|
39
|
+
|
40
|
+
void set_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
|
41
|
+
text_pipeline = pipeline;
|
42
|
+
}
|
43
|
+
|
44
|
+
Preprocessing::Text::TextPipeline *get_text_pipeline() {
|
45
|
+
return text_pipeline;
|
46
|
+
}
|
47
|
+
};
|
48
|
+
}
|
49
|
+
|
50
|
+
#endif
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#ifndef __example_preprocessor_h__
|
2
|
+
#define __example_preprocessor_h__
|
3
|
+
#include "data_set/example.h"
|
4
|
+
|
5
|
+
namespace Preprocessing {
|
6
|
+
namespace Examples {
|
7
|
+
|
8
|
+
class ExamplePreprocessor {
|
9
|
+
public:
|
10
|
+
virtual void process(DataSet::Example *example) {}
|
11
|
+
void process_data_set(DataSet::DataSet *data_set) {
|
12
|
+
for(vector<DataSet::Example *>::iterator example = data_set->examples.begin(); example != data_set->examples.end(); example++)
|
13
|
+
process(*example);
|
14
|
+
}
|
15
|
+
};
|
16
|
+
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
20
|
+
#endif
|