thera 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
@@ -0,0 +1,76 @@
1
+ #ifndef __nominal_feature_h__
2
+ #define __nominal_feature_h__
3
+ #include "data_set/example.h"
4
+ #include "data_set/feature.h"
5
+ #include <vector>
6
+ #include <map>
7
+ #include <iostream>
8
+
9
+ namespace DataSet {
10
+ class DataSet;
11
+
12
+ class NominalFeature : public Feature {
13
+ public:
14
+ NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
15
+ NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
16
+
17
+ NominalFeature *clone() {
18
+ return new NominalFeature(this);
19
+ }
20
+
21
+ void reset() {
22
+ frequencies.clear();
23
+ probabilities.clear();
24
+ category_frequencies.clear();
25
+ category_probabilities.clear();
26
+ examples_with_value.clear();
27
+ }
28
+
29
+ void print();
30
+
31
+ // nominal values are referenced by index in examples
32
+ // (e.g "CategoryA" -> 2; would be stored as 2 in an example)
33
+ map<string, int> indexes;
34
+ vector<string> names;
35
+ void add_value(string name) {
36
+ int index = indexes.size() + 1;
37
+ indexes[name] = index;
38
+ names.push_back(name);
39
+ }
40
+
41
+ int value_index(string name) {
42
+ int index = indexes[name];
43
+ if(index == 0) {
44
+ index = indexes.size();
45
+ indexes[name] = index;
46
+ names.push_back(name);
47
+ }
48
+ return index;
49
+ }
50
+
51
+ // counts
52
+ void prepare_for_counting(DataSet *data_set);
53
+ void count_example(double value, int category_index);
54
+ void finalise_counting(DataSet *data_set);
55
+
56
+ // counts for this feature over the entire data set
57
+ vector<int> frequencies;
58
+ vector<double> probabilities;
59
+ int value_frequency(int index) { return frequencies[index]; }
60
+ double value_probability(int index) { return probabilities[index]; }
61
+
62
+ // counts for this feature per category
63
+ vector<vector<int> > category_frequencies;
64
+ vector<vector<double> > category_probabilities;
65
+ int category_value_frequency(int category, int index) { return category_frequencies[category][index]; }
66
+ double category_value_probability(int category, int index) { return category_probabilities[category][index]; }
67
+
68
+ // indexes
69
+ void prepare_for_indexing(DataSet *data_set);
70
+ void index_example(double value, Example *example);
71
+ void finalise_indexing(DataSet *data_set);
72
+ vector<vector<Example *> > examples_with_value;
73
+ };
74
+ }
75
+
76
+ #endif
@@ -0,0 +1,69 @@
1
+ #include "data_set/data_set.h"
2
+ #include "data_set/example.h"
3
+ #include "numeric_feature.h"
4
+ #include "stdlib.h"
5
+
6
+ void DataSet::NumericFeature::prepare_for_counting(DataSet *data_set) {
7
+ category_counts = (Counts *) calloc(sizeof(Counts), data_set->categories_size() + 1);
8
+ }
9
+
10
+ void DataSet::NumericFeature::count_example(double value, int category_index) {
11
+ // non zero count
12
+ if(value != 0.0) {
13
+ counts.non_zero_count++;
14
+ category_counts[category_index].non_zero_count++;
15
+ }
16
+
17
+ // minima
18
+ if(value < counts.min)
19
+ counts.min = value;
20
+ if(value < category_counts[category_index].min)
21
+ category_counts[category_index].min = value;
22
+
23
+ // maxima
24
+ if(value > counts.max)
25
+ counts.max = value;
26
+ if(value > category_counts[category_index].max)
27
+ category_counts[category_index].max = value;
28
+
29
+ // sum
30
+ counts.sum += value;
31
+ category_counts[category_index].sum += value;
32
+
33
+ // squared sum
34
+ counts.sq_sum += (value * value);
35
+ category_counts[category_index].sq_sum += (value * value);
36
+ }
37
+
38
+ void DataSet::NumericFeature::finalise_counting(DataSet *data_set) {
39
+ int categories_count = data_set->categories_size();
40
+ int examples_count = data_set->examples.size();
41
+
42
+ // mean
43
+ counts.mean = counts.sum / examples_count;
44
+ for(int i = 1; i <= categories_count; i++)
45
+ category_counts[i].mean = category_counts[i].sum / data_set->category_feature()->value_frequency(i);
46
+
47
+ // variance
48
+ counts.variance = (counts.sq_sum / examples_count) - (counts.mean * counts.mean);
49
+ for(int i = 1; i <= categories_count; i++)
50
+ category_counts[i].variance = (category_counts[i].sq_sum / data_set->category_feature()->value_frequency(i)) - (category_counts[i].mean * category_counts[i].mean);
51
+ }
52
+
53
+ void DataSet::NumericFeature::prepare_for_indexing(DataSet *data_set) {}
54
+
55
+ void DataSet::NumericFeature::index_example(double value, Example *example) {
56
+ if(value != 0.0)
57
+ non_zero_examples.push_back(example);
58
+ }
59
+
60
+ void DataSet::NumericFeature::finalise_indexing(DataSet *data_set) {}
61
+
62
+ void DataSet::NumericFeature::print(DataSet::DataSet *data_set) {
63
+ cout << "F" << index << ", " << name << endl;
64
+ print_counts(&counts);
65
+ for(int i = 0; i < (data_set->categories_size() + 1); i++) {
66
+ cout << "C" << i << ":";
67
+ print_counts(&(category_counts[i]));
68
+ }
69
+ }
@@ -0,0 +1,78 @@
1
+ #ifndef __numeric_feature_h__
2
+ #define __numeric_feature_h__
3
+ #include "data_set/example.h"
4
+ #include "data_set/feature.h"
5
+ #include <iostream>
6
+
7
+ namespace DataSet {
8
+ class DataSet;
9
+
10
+ class NumericFeature : public Feature {
11
+ public:
12
+ NumericFeature(string name, int index) : Feature(name, index), category_counts(NULL), non_zero_examples() {
13
+ reset();
14
+ }
15
+
16
+ NumericFeature *clone() {
17
+ return new NumericFeature(name, index);
18
+ }
19
+
20
+ void reset() {
21
+ memset(&counts, 0, sizeof(Counts));
22
+ if(category_counts != NULL)
23
+ free(category_counts);
24
+ category_counts = NULL;
25
+ non_zero_examples.clear();
26
+ }
27
+
28
+ void prepare_for_counting(DataSet *data_set);
29
+ void count_example(double value, int category_index);
30
+ void finalise_counting(DataSet *data_set);
31
+
32
+ void prepare_for_indexing(DataSet *data_set);
33
+ void index_example(double value, Example *example);
34
+ void finalise_indexing(DataSet *data_set);
35
+
36
+ // counts
37
+ typedef struct {
38
+ int non_zero_count;
39
+ double sum;
40
+ double sq_sum;
41
+ double min;
42
+ double max;
43
+ double mean;
44
+ double variance;
45
+ } Counts;
46
+
47
+ // counts for this feature over the entire data set
48
+ Counts counts;
49
+ int non_zero_count() { return counts.non_zero_count; }
50
+ double sum() { return counts.sum; }
51
+ double sq_sum() { return counts.sq_sum; }
52
+ double min() { return counts.min; }
53
+ double max() { return counts.max; }
54
+ double mean() { return counts.mean; }
55
+ double variance() { return counts.variance; }
56
+
57
+ // counts for this feature per category
58
+ Counts *category_counts;
59
+ int category_non_zero_count(int index) { return category_counts[index].non_zero_count; }
60
+ double category_sum(int index) { return category_counts[index].sum; }
61
+ double category_sq_sum(int index) { return category_counts[index].sq_sum; }
62
+ double category_min(int index) { return category_counts[index].min; }
63
+ double category_max(int index) { return category_counts[index].max; }
64
+ double category_mean(int index) { return category_counts[index].mean; }
65
+ double category_variance(int index) { return category_counts[index].variance; }
66
+
67
+ void print(DataSet *data_set);
68
+
69
+ void print_counts(Counts *c) {
70
+ cout << c->non_zero_count << ";" << c->sum << ";" << c->sq_sum << ";" << c->min << ";" << c->max << ";" << c->mean << ";" << c->variance << endl;
71
+ }
72
+
73
+ // indexes
74
+ vector<Example *> non_zero_examples;
75
+ };
76
+ }
77
+
78
+ #endif
@@ -0,0 +1,40 @@
1
+ #ifndef __sparse_data_set_h__
2
+ #define __sparse_data_set_h__
3
+ #include "data_set/data_set.h"
4
+ #include "sparse_example.h"
5
+
6
+ namespace DataSet {
7
+ class SparseDataSet : public DataSet {
8
+ void perform_count() {
9
+ int example_category_index = 0;
10
+ SparseExample::Value *value;
11
+
12
+ for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
13
+ example_category_index = (int)((*example)->get_value(category_index));
14
+ for(int i = 0; i < (*example)->size; i++) {
15
+ value = &(((SparseExample *)(*example))->values[i]);
16
+ features[value->index]->count_example(value->value, example_category_index);
17
+ }
18
+ }
19
+ }
20
+
21
+ void perform_index() {
22
+ }
23
+
24
+ public:
25
+ SparseDataSet() : DataSet() {}
26
+ SparseDataSet(DataSet *other) : DataSet(other) {}
27
+
28
+ SparseDataSet *clone_without_examples() {
29
+ return new SparseDataSet(this);
30
+ }
31
+
32
+ SparseExample *new_example(int buffer_size = 0) {
33
+ SparseExample *example = new SparseExample(buffer_size);
34
+ examples.push_back(example);
35
+ return example;
36
+ }
37
+ };
38
+ }
39
+
40
+ #endif
@@ -0,0 +1,82 @@
1
#include "sparse_data_set.h"
#include "sparse_example.h"
#include <stdlib.h>
#include <string.h>
4
+
5
+ double DataSet::SparseExample::get_value(int feature_index) {
6
+ if(feature_index == 0 && size != 0)
7
+ return values[0].value;
8
+
9
+ int low = 0;
10
+ int high = size - 1;
11
+ int mid = high / 2;
12
+
13
+ // branch prediction makes this triple clause if statement faster
14
+ // than a double clause "single comparison" search. precondition
15
+ // loops also seem to be faster than post condition loops in GCC,
16
+ // really don't know why... this implementation ends up being
17
+ // around 30% faster than well known single comparison versions.
18
+ while(low <= high) {
19
+ if(values[mid].index < feature_index) {
20
+ low = mid + 1;
21
+ } else if(values[mid].index > feature_index) {
22
+ high = mid - 1;
23
+ } else {
24
+ return values[mid].value;
25
+ }
26
+ mid = (high + low) / 2;
27
+ }
28
+
29
+ return 0.0;
30
+ }
31
+
32
+ double DataSet::SparseExample::get_value(string feature_name, SparseDataSet *data_set) {
33
+ return get_value(data_set->get_feature_by_name(feature_name)->index);
34
+ }
35
+
36
+ void DataSet::SparseExample::set_value(int feature_index, double new_value) {
37
+ int i = 0;
38
+
39
+ for(; i < size; i++) {
40
+ if(values[i].index == feature_index) {
41
+ values[i].value = new_value;
42
+ return;
43
+ } else if(values[i].index > feature_index) {
44
+ break;
45
+ }
46
+ }
47
+
48
+ if(buffer_size == size)
49
+ values = (Value *) realloc(values, sizeof(Value) * (++buffer_size));
50
+
51
+ if(i != size)
52
+ memcpy(&values[i + 1], &values[i], (size - i) * sizeof(Value));
53
+
54
+ values[i].index = feature_index;
55
+ values[i].value = new_value;
56
+ size++;
57
+ }
58
+
59
+ void DataSet::SparseExample::append_value(int feature_index, double new_value) {
60
+ if(buffer_size == size)
61
+ values = (Value *) realloc(values, sizeof(Value) * (++buffer_size));
62
+ values[size].index = feature_index;
63
+ values[size].value = new_value;
64
+ size++;
65
+ }
66
+
67
+ double DataSet::SparseExample::euclidean_distance(Example *other_example) {
68
+ return 0.0;
69
+ }
70
+
71
+ double DataSet::SparseExample::cosine_distance(Example *other_example) {
72
+ return 0.0;
73
+ }
74
+
75
+ void DataSet::SparseExample::print() {
76
+ for(int i = 0; i < size; i++) {
77
+ cout << values[i].index << ":" << values[i].value;
78
+ if(i < (size - 1))
79
+ cout << ",";
80
+ }
81
+ cout << endl;
82
+ }
@@ -0,0 +1,38 @@
1
+ #ifndef __sparse_data_set_example_h__
2
+ #define __sparse_data_set_example_h__
3
+ #include "data_set/example.h"
4
+ #include <stdlib.h>
5
+ #include <string>
6
+ using namespace std;
7
+
8
+ namespace DataSet {
9
+ class SparseDataSet;
10
+
11
+ class SparseExample : public Example {
12
+ public:
13
+ typedef struct {
14
+ int index;
15
+ double value;
16
+ } Value;
17
+
18
+ Value *values;
19
+ int buffer_size;
20
+
21
+ SparseExample(int buffer_size = 0) : Example(0), buffer_size(buffer_size) {
22
+ if(buffer_size > 0)
23
+ values = (Value *) calloc(sizeof(Value), buffer_size);
24
+ else
25
+ values = NULL;
26
+ }
27
+
28
+ double get_value(int feature_index);
29
+ double get_value(string feature_name, SparseDataSet *data_set);
30
+ void set_value(int feature_index, double new_value);
31
+ void append_value(int feature_index, double new_value);
32
+ double euclidean_distance(Example *other_example);
33
+ double cosine_distance(Example *other_example);
34
+ void print();
35
+ };
36
+ }
37
+
38
+ #endif
@@ -0,0 +1,129 @@
1
+ #include "confusion_matrix.h"
2
+ #include <iostream>
3
+ const string ConfusionMatrix::average_row_name = "Average";
4
+
5
+ // TODO: CM should reference a classifier, not a data set
6
+
7
+ ConfusionMatrix::ConfusionMatrix(DataSet::DataSet *data_set) : incorrect(0), correct(0), data_set(data_set) {
8
+ int count = data_set->categories_size();
9
+ counts.reserve(count);
10
+ for(int i = 0; i < count; i++)
11
+ counts.push_back(valarray<int>(0, count));
12
+ }
13
+
14
+ void ConfusionMatrix::add(int predicted, int actual) {
15
+ // category indexes are 1 based
16
+ counts[predicted - 1][actual - 1] += 1;
17
+ if(predicted == actual)
18
+ correct++;
19
+ else
20
+ incorrect++;
21
+ }
22
+
23
+ double ConfusionMatrix::accuracy() {
24
+ return ((double)correct) / (correct + incorrect);
25
+ }
26
+
27
+ double ConfusionMatrix::error() {
28
+ return ((double)incorrect) / (correct + incorrect);
29
+ }
30
+
31
+ // true positive
32
+ int ConfusionMatrix::tp(int category) {
33
+ return counts[category - 1][category - 1];
34
+ }
35
+
36
+ // false positive
37
+ int ConfusionMatrix::fp(int category) {
38
+ return counts[category - 1].sum() - tp(category);
39
+ }
40
+
41
+ // true negative
42
+ int ConfusionMatrix::tn(int category) {
43
+ int sum = 0, count = data_set->categories_size();
44
+ for(int i = 1; i <= count; i++)
45
+ for(int j = 1; j <= count; j++)
46
+ if(i != category && j != category)
47
+ sum += counts[i - 1][j - 1];
48
+ return sum;
49
+ }
50
+
51
+ // false negative
52
+ int ConfusionMatrix::fn(int category) {
53
+ int sum = 0, count = data_set->categories_size();
54
+ for(int i = 1; i <= count; i++)
55
+ if(i != category)
56
+ sum += counts[i - 1][category - 1];
57
+ return sum;
58
+ }
59
+
60
+ double ConfusionMatrix::precision(int category) {
61
+ int denom = tp(category) + fp(category);
62
+ if(denom == 0)
63
+ return 0.0;
64
+ return ((double)tp(category)) / denom;
65
+ }
66
+
67
+ double ConfusionMatrix::recall(int category) {
68
+ int denom = tp(category) + fn(category);
69
+ if(denom == 0)
70
+ return 0.0;
71
+ return ((double)tp(category)) / denom;
72
+ }
73
+
74
+ double ConfusionMatrix::fscore(int category) {
75
+ double p = precision(category);
76
+ double r = recall(category);
77
+ if((p + r) == 0.0)
78
+ return 0.0;
79
+ return (2 * p * r) / (p + r);
80
+ }
81
+
82
+ void ConfusionMatrix::print_summary() {
83
+ // overall counts and summary
84
+ cout.precision(4);
85
+ cout << "== Summary ==" << endl;
86
+ cout << setw(23) <<"Correctly classified:" << setw(12) << right << correct << setw(10) << right << accuracy() * 100 << "%" << endl;
87
+ cout << setw(23) << "Incorrectly classified:" << setw(12) << right << incorrect << setw(10) << right << error() * 100 << "%" << endl;
88
+ cout << setw(23) << "Total classifications:" << setw(12) << right << correct + incorrect << endl << endl;
89
+
90
+ // determine the width of the left (category name) column
91
+ int max_name_length = 0;
92
+ for(int category = 1; category <= data_set->categories_size(); category++)
93
+ if(data_set->category_feature()->names[category].length() > max_name_length)
94
+ max_name_length = data_set->category_feature()->names[category].length();
95
+ if(average_row_name.length() > max_name_length)
96
+ max_name_length = average_row_name.length();
97
+ max_name_length += 1;
98
+
99
+ // detailed category information
100
+ cout << "== Category Performance ==" << endl;
101
+ cout << setw(max_name_length) << "";
102
+ cout << setw(9) << right << "True +";
103
+ cout << setw(9) << right << "False +";
104
+ cout << setw(9) << right << "True -";
105
+ cout << setw(9) << right << "False -";
106
+ cout << setw(9) << right << "Precis.";
107
+ cout << setw(9) << right << "Recall";
108
+ cout << setw(9) << right << "F-score" << endl;
109
+
110
+ for(int category = 1; category <= data_set->categories_size(); category++) {
111
+ cout << setw(max_name_length) << data_set->category_feature()->names[category];
112
+ cout << setw(9) << tp(category);
113
+ cout << setw(9) << fp(category);
114
+ cout << setw(9) << tn(category);
115
+ cout << setw(9) << fn(category);
116
+ cout << setw(8) << precision(category) * 100 << "%";
117
+ cout << setw(8) << recall(category) * 100 << "%";
118
+ cout << setw(8) << fscore(category) * 100 << "%" << endl;
119
+ }
120
+
121
+ cout << setw(max_name_length) << average_row_name;
122
+ cout << setw(9) << avg_tp();
123
+ cout << setw(9) << avg_fp();
124
+ cout << setw(9) << avg_tn();
125
+ cout << setw(9) << avg_fn();
126
+ cout << setw(8) << avg_precision() * 100 << "%";
127
+ cout << setw(8) << avg_recall() * 100 << "%";
128
+ cout << setw(8) << avg_fscore() * 100 << "%" << endl;
129
+ }
@@ -0,0 +1,82 @@
1
+ #ifndef __confusion_matrix__
2
+ #define __confusion_matrix__
3
+ #include "data_set/data_set.h"
4
+ #include <vector>
5
+ #include <valarray>
6
+ #include <iostream>
7
+ #include <iomanip>
8
+ using namespace std;
9
+
10
+ namespace DataSet {
11
+ class Category;
12
+ }
13
+
14
+ class ConfusionMatrix {
15
+ public:
16
+ static const string average_row_name;
17
+ DataSet::DataSet *data_set;
18
+ vector<valarray<int> > counts;
19
+ int correct;
20
+ int incorrect;
21
+
22
+ ConfusionMatrix(DataSet::DataSet *data_set);
23
+ void merge(ConfusionMatrix *other) {
24
+ incorrect += other->incorrect;
25
+ correct += other->correct;
26
+
27
+ for(unsigned int i = 0; i < counts.size(); i++)
28
+ counts[i] += other->counts[i];
29
+ }
30
+
31
+ void add(int predicted, int actual);
32
+ double accuracy();
33
+ double error();
34
+ int tp(int category);
35
+ int fp(int category);
36
+ int tn(int category);
37
+ int fn(int category);
38
+ double precision(int category);
39
+ double recall(int category);
40
+ double fscore(int category);
41
+ void print_summary();
42
+
43
+ // averages
44
+ double avg_tp() {
45
+ return apply<int>(&ConfusionMatrix::tp);
46
+ }
47
+
48
+ double avg_fp() {
49
+ return apply<int>(&ConfusionMatrix::fp);
50
+ }
51
+
52
+ double avg_tn() {
53
+ return apply<int>(&ConfusionMatrix::tn);
54
+ }
55
+
56
+ double avg_fn() {
57
+ return apply<int>(&ConfusionMatrix::fn);
58
+ }
59
+
60
+ double avg_precision() {
61
+ return apply<double>(&ConfusionMatrix::precision);
62
+ }
63
+
64
+ double avg_recall() {
65
+ return apply<double>(&ConfusionMatrix::recall);
66
+ }
67
+
68
+ double avg_fscore() {
69
+ return apply<double>(&ConfusionMatrix::fscore);
70
+ }
71
+
72
+ protected:
73
+ template <class T, class Function>
74
+ double apply(Function func) {
75
+ T result = 0.0;
76
+ for(int category = 1; category <= data_set->categories_size(); category++)
77
+ result += (this->*func)(category);
78
+ return result / ((double)counts.size());
79
+ }
80
+ };
81
+
82
+ #endif
@@ -0,0 +1,29 @@
1
+ #include "model.h"
2
+
3
+ void Model::Model::train(DataSet::Example *example) {
4
+ }
5
+
6
+ void Model::Model::train_text(string text) {
7
+ }
8
+
9
+ int Model::Model::classify(DataSet::Example *example) {
10
+ return classifier->classify(example);
11
+ }
12
+
13
+ int Model::Model::classify_text(string text) {
14
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
15
+ int category = classifier->classify(example);
16
+ delete example;
17
+ return category;
18
+ }
19
+
20
+ vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
21
+ return classifier->rank(example);
22
+ }
23
+
24
+ vector<Classifier::Score> *Model::Model::rank_text(string text) {
25
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
26
+ vector<Classifier::Score> *ranks = classifier->rank(example);
27
+ delete example;
28
+ return ranks;
29
+ }
@@ -0,0 +1,50 @@
1
+ #ifndef __model_h__
2
+ #define __model_h__
3
+ #include "data_set/data_set.h"
4
+ #include "data_set/example.h"
5
+ #include "classifier/classifier.h"
6
+ #include "preprocessing/text/text_pipeline.h"
7
+
8
+ namespace Model {
9
+ class Model {
10
+ public:
11
+ DataSet::DataSet *data_set;
12
+ Classifier::Classifier *classifier;
13
+ Preprocessing::Text::TextPipeline *text_pipeline;
14
+
15
+ Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
16
+
17
+ void train(DataSet::Example *example);
18
+ void train_text(string text);
19
+ int classify(DataSet::Example *example);
20
+ int classify_text(string text);
21
+ vector<Classifier::Score> *rank(DataSet::Example *example);
22
+ vector<Classifier::Score> *rank_text(string example);
23
+
24
+ void set_data_set(DataSet::DataSet *ds) {
25
+ data_set = ds;
26
+ }
27
+
28
+ DataSet::DataSet *get_data_set() {
29
+ return data_set;
30
+ }
31
+
32
+ void set_classifier(Classifier::Classifier *c) {
33
+ classifier = c;
34
+ }
35
+
36
+ Classifier::Classifier *get_classifier() {
37
+ return classifier;
38
+ }
39
+
40
+ void set_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
41
+ text_pipeline = pipeline;
42
+ }
43
+
44
+ Preprocessing::Text::TextPipeline *get_text_pipeline() {
45
+ return text_pipeline;
46
+ }
47
+ };
48
+ }
49
+
50
+ #endif
@@ -0,0 +1,20 @@
1
+ #ifndef __example_preprocessor_h__
2
+ #define __example_preprocessor_h__
3
+ #include "data_set/example.h"
4
+
5
+ namespace Preprocessing {
6
+ namespace Examples {
7
+
8
+ class ExamplePreprocessor {
9
+ public:
10
+ virtual void process(DataSet::Example *example) {}
11
+ void process_data_set(DataSet::DataSet *data_set) {
12
+ for(vector<DataSet::Example *>::iterator example = data_set->examples.begin(); example != data_set->examples.end(); example++)
13
+ process(*example);
14
+ }
15
+ };
16
+
17
+ }
18
+ }
19
+
20
+ #endif