thera 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
@@ -0,0 +1,76 @@
1
+ #ifndef __nominal_feature_h__
2
+ #define __nominal_feature_h__
3
+ #include "data_set/example.h"
4
+ #include "data_set/feature.h"
5
+ #include <vector>
6
+ #include <map>
7
+ #include <iostream>
8
+
9
+ namespace DataSet {
10
+ class DataSet;
11
+
12
+ class NominalFeature : public Feature {
13
+ public:
14
+ NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
15
+ NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
16
+
17
+ NominalFeature *clone() {
18
+ return new NominalFeature(this);
19
+ }
20
+
21
+ void reset() {
22
+ frequencies.clear();
23
+ probabilities.clear();
24
+ category_frequencies.clear();
25
+ category_probabilities.clear();
26
+ examples_with_value.clear();
27
+ }
28
+
29
+ void print();
30
+
31
+ // nominal values are referenced by index in examples
32
+ // (e.g "CategoryA" -> 2; would be stored as 2 in an example)
33
+ map<string, int> indexes;
34
+ vector<string> names;
35
+ void add_value(string name) {
36
+ int index = indexes.size() + 1;
37
+ indexes[name] = index;
38
+ names.push_back(name);
39
+ }
40
+
41
+ int value_index(string name) {
42
+ int index = indexes[name];
43
+ if(index == 0) {
44
+ index = indexes.size();
45
+ indexes[name] = index;
46
+ names.push_back(name);
47
+ }
48
+ return index;
49
+ }
50
+
51
+ // counts
52
+ void prepare_for_counting(DataSet *data_set);
53
+ void count_example(double value, int category_index);
54
+ void finalise_counting(DataSet *data_set);
55
+
56
+ // counts for this feature over the entire data set
57
+ vector<int> frequencies;
58
+ vector<double> probabilities;
59
+ int value_frequency(int index) { return frequencies[index]; }
60
+ double value_probability(int index) { return probabilities[index]; }
61
+
62
+ // counts for this feature per category
63
+ vector<vector<int> > category_frequencies;
64
+ vector<vector<double> > category_probabilities;
65
+ int category_value_frequency(int category, int index) { return category_frequencies[category][index]; }
66
+ double category_value_probability(int category, int index) { return category_probabilities[category][index]; }
67
+
68
+ // indexes
69
+ void prepare_for_indexing(DataSet *data_set);
70
+ void index_example(double value, Example *example);
71
+ void finalise_indexing(DataSet *data_set);
72
+ vector<vector<Example *> > examples_with_value;
73
+ };
74
+ }
75
+
76
+ #endif
@@ -0,0 +1,69 @@
1
+ #include "data_set/data_set.h"
2
+ #include "data_set/example.h"
3
+ #include "numeric_feature.h"
4
+ #include "stdlib.h"
5
+
6
+ void DataSet::NumericFeature::prepare_for_counting(DataSet *data_set) {
7
+ category_counts = (Counts *) calloc(sizeof(Counts), data_set->categories_size() + 1);
8
+ }
9
+
10
+ void DataSet::NumericFeature::count_example(double value, int category_index) {
11
+ // non zero count
12
+ if(value != 0.0) {
13
+ counts.non_zero_count++;
14
+ category_counts[category_index].non_zero_count++;
15
+ }
16
+
17
+ // minima
18
+ if(value < counts.min)
19
+ counts.min = value;
20
+ if(value < category_counts[category_index].min)
21
+ category_counts[category_index].min = value;
22
+
23
+ // maxima
24
+ if(value > counts.max)
25
+ counts.max = value;
26
+ if(value > category_counts[category_index].max)
27
+ category_counts[category_index].max = value;
28
+
29
+ // sum
30
+ counts.sum += value;
31
+ category_counts[category_index].sum += value;
32
+
33
+ // squared sum
34
+ counts.sq_sum += (value * value);
35
+ category_counts[category_index].sq_sum += (value * value);
36
+ }
37
+
38
+ void DataSet::NumericFeature::finalise_counting(DataSet *data_set) {
39
+ int categories_count = data_set->categories_size();
40
+ int examples_count = data_set->examples.size();
41
+
42
+ // mean
43
+ counts.mean = counts.sum / examples_count;
44
+ for(int i = 1; i <= categories_count; i++)
45
+ category_counts[i].mean = category_counts[i].sum / data_set->category_feature()->value_frequency(i);
46
+
47
+ // variance
48
+ counts.variance = (counts.sq_sum / examples_count) - (counts.mean * counts.mean);
49
+ for(int i = 1; i <= categories_count; i++)
50
+ category_counts[i].variance = (category_counts[i].sq_sum / data_set->category_feature()->value_frequency(i)) - (category_counts[i].mean * category_counts[i].mean);
51
+ }
52
+
53
+ void DataSet::NumericFeature::prepare_for_indexing(DataSet *data_set) {}
54
+
55
+ void DataSet::NumericFeature::index_example(double value, Example *example) {
56
+ if(value != 0.0)
57
+ non_zero_examples.push_back(example);
58
+ }
59
+
60
+ void DataSet::NumericFeature::finalise_indexing(DataSet *data_set) {}
61
+
62
+ void DataSet::NumericFeature::print(DataSet::DataSet *data_set) {
63
+ cout << "F" << index << ", " << name << endl;
64
+ print_counts(&counts);
65
+ for(int i = 0; i < (data_set->categories_size() + 1); i++) {
66
+ cout << "C" << i << ":";
67
+ print_counts(&(category_counts[i]));
68
+ }
69
+ }
@@ -0,0 +1,78 @@
1
+ #ifndef __numeric_feature_h__
2
+ #define __numeric_feature_h__
3
+ #include "data_set/example.h"
4
+ #include "data_set/feature.h"
5
+ #include <iostream>
6
+
7
+ namespace DataSet {
8
+ class DataSet;
9
+
10
+ class NumericFeature : public Feature {
11
+ public:
12
+ NumericFeature(string name, int index) : Feature(name, index), category_counts(NULL), non_zero_examples() {
13
+ reset();
14
+ }
15
+
16
+ NumericFeature *clone() {
17
+ return new NumericFeature(name, index);
18
+ }
19
+
20
+ void reset() {
21
+ memset(&counts, 0, sizeof(Counts));
22
+ if(category_counts != NULL)
23
+ free(category_counts);
24
+ category_counts = NULL;
25
+ non_zero_examples.clear();
26
+ }
27
+
28
+ void prepare_for_counting(DataSet *data_set);
29
+ void count_example(double value, int category_index);
30
+ void finalise_counting(DataSet *data_set);
31
+
32
+ void prepare_for_indexing(DataSet *data_set);
33
+ void index_example(double value, Example *example);
34
+ void finalise_indexing(DataSet *data_set);
35
+
36
+ // counts
37
+ typedef struct {
38
+ int non_zero_count;
39
+ double sum;
40
+ double sq_sum;
41
+ double min;
42
+ double max;
43
+ double mean;
44
+ double variance;
45
+ } Counts;
46
+
47
+ // counts for this feature over the entire data set
48
+ Counts counts;
49
+ int non_zero_count() { return counts.non_zero_count; }
50
+ double sum() { return counts.sum; }
51
+ double sq_sum() { return counts.sq_sum; }
52
+ double min() { return counts.min; }
53
+ double max() { return counts.max; }
54
+ double mean() { return counts.mean; }
55
+ double variance() { return counts.variance; }
56
+
57
+ // counts for this feature per category
58
+ Counts *category_counts;
59
+ int category_non_zero_count(int index) { return category_counts[index].non_zero_count; }
60
+ double category_sum(int index) { return category_counts[index].sum; }
61
+ double category_sq_sum(int index) { return category_counts[index].sq_sum; }
62
+ double category_min(int index) { return category_counts[index].min; }
63
+ double category_max(int index) { return category_counts[index].max; }
64
+ double category_mean(int index) { return category_counts[index].mean; }
65
+ double category_variance(int index) { return category_counts[index].variance; }
66
+
67
+ void print(DataSet *data_set);
68
+
69
+ void print_counts(Counts *c) {
70
+ cout << c->non_zero_count << ";" << c->sum << ";" << c->sq_sum << ";" << c->min << ";" << c->max << ";" << c->mean << ";" << c->variance << endl;
71
+ }
72
+
73
+ // indexes
74
+ vector<Example *> non_zero_examples;
75
+ };
76
+ }
77
+
78
+ #endif
@@ -0,0 +1,40 @@
1
+ #ifndef __sparse_data_set_h__
2
+ #define __sparse_data_set_h__
3
+ #include "data_set/data_set.h"
4
+ #include "sparse_example.h"
5
+
6
+ namespace DataSet {
7
+ class SparseDataSet : public DataSet {
8
+ void perform_count() {
9
+ int example_category_index = 0;
10
+ SparseExample::Value *value;
11
+
12
+ for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
13
+ example_category_index = (int)((*example)->get_value(category_index));
14
+ for(int i = 0; i < (*example)->size; i++) {
15
+ value = &(((SparseExample *)(*example))->values[i]);
16
+ features[value->index]->count_example(value->value, example_category_index);
17
+ }
18
+ }
19
+ }
20
+
21
+ void perform_index() {
22
+ }
23
+
24
+ public:
25
+ SparseDataSet() : DataSet() {}
26
+ SparseDataSet(DataSet *other) : DataSet(other) {}
27
+
28
+ SparseDataSet *clone_without_examples() {
29
+ return new SparseDataSet(this);
30
+ }
31
+
32
+ SparseExample *new_example(int buffer_size = 0) {
33
+ SparseExample *example = new SparseExample(buffer_size);
34
+ examples.push_back(example);
35
+ return example;
36
+ }
37
+ };
38
+ }
39
+
40
+ #endif
@@ -0,0 +1,82 @@
1
+ #include "sparse_data_set.h"
2
+ #include "sparse_example.h"
3
+ #include <stdlib.h>
4
+
5
+ double DataSet::SparseExample::get_value(int feature_index) {
6
+ if(feature_index == 0 && size != 0)
7
+ return values[0].value;
8
+
9
+ int low = 0;
10
+ int high = size - 1;
11
+ int mid = high / 2;
12
+
13
+ // branch prediction makes this triple clause if statement faster
14
+ // than a double clause "single comparison" search. precondition
15
+ // loops also seem to be faster than post condition loops in GCC,
16
+ // really don't know why... this implementation ends up being
17
+ // around 30% faster than well known single comparison versions.
18
+ while(low <= high) {
19
+ if(values[mid].index < feature_index) {
20
+ low = mid + 1;
21
+ } else if(values[mid].index > feature_index) {
22
+ high = mid - 1;
23
+ } else {
24
+ return values[mid].value;
25
+ }
26
+ mid = (high + low) / 2;
27
+ }
28
+
29
+ return 0.0;
30
+ }
31
+
32
+ double DataSet::SparseExample::get_value(string feature_name, SparseDataSet *data_set) {
33
+ return get_value(data_set->get_feature_by_name(feature_name)->index);
34
+ }
35
+
36
+ void DataSet::SparseExample::set_value(int feature_index, double new_value) {
37
+ int i = 0;
38
+
39
+ for(; i < size; i++) {
40
+ if(values[i].index == feature_index) {
41
+ values[i].value = new_value;
42
+ return;
43
+ } else if(values[i].index > feature_index) {
44
+ break;
45
+ }
46
+ }
47
+
48
+ if(buffer_size == size)
49
+ values = (Value *) realloc(values, sizeof(Value) * (++buffer_size));
50
+
51
+ if(i != size)
52
+ memcpy(&values[i + 1], &values[i], (size - i) * sizeof(Value));
53
+
54
+ values[i].index = feature_index;
55
+ values[i].value = new_value;
56
+ size++;
57
+ }
58
+
59
+ void DataSet::SparseExample::append_value(int feature_index, double new_value) {
60
+ if(buffer_size == size)
61
+ values = (Value *) realloc(values, sizeof(Value) * (++buffer_size));
62
+ values[size].index = feature_index;
63
+ values[size].value = new_value;
64
+ size++;
65
+ }
66
+
67
+ double DataSet::SparseExample::euclidean_distance(Example *other_example) {
68
+ return 0.0;
69
+ }
70
+
71
+ double DataSet::SparseExample::cosine_distance(Example *other_example) {
72
+ return 0.0;
73
+ }
74
+
75
+ void DataSet::SparseExample::print() {
76
+ for(int i = 0; i < size; i++) {
77
+ cout << values[i].index << ":" << values[i].value;
78
+ if(i < (size - 1))
79
+ cout << ",";
80
+ }
81
+ cout << endl;
82
+ }
@@ -0,0 +1,38 @@
1
+ #ifndef __sparse_data_set_example_h__
2
+ #define __sparse_data_set_example_h__
3
+ #include "data_set/example.h"
4
+ #include <stdlib.h>
5
+ #include <string>
6
+ using namespace std;
7
+
8
+ namespace DataSet {
9
+ class SparseDataSet;
10
+
11
+ class SparseExample : public Example {
12
+ public:
13
+ typedef struct {
14
+ int index;
15
+ double value;
16
+ } Value;
17
+
18
+ Value *values;
19
+ int buffer_size;
20
+
21
+ SparseExample(int buffer_size = 0) : Example(0), buffer_size(buffer_size) {
22
+ if(buffer_size > 0)
23
+ values = (Value *) calloc(sizeof(Value), buffer_size);
24
+ else
25
+ values = NULL;
26
+ }
27
+
28
+ double get_value(int feature_index);
29
+ double get_value(string feature_name, SparseDataSet *data_set);
30
+ void set_value(int feature_index, double new_value);
31
+ void append_value(int feature_index, double new_value);
32
+ double euclidean_distance(Example *other_example);
33
+ double cosine_distance(Example *other_example);
34
+ void print();
35
+ };
36
+ }
37
+
38
+ #endif
@@ -0,0 +1,129 @@
1
+ #include "confusion_matrix.h"
2
+ #include <iostream>
3
+ const string ConfusionMatrix::average_row_name = "Average";
4
+
5
+ // TODO: CM should reference a classifier, not a data set
6
+
7
+ ConfusionMatrix::ConfusionMatrix(DataSet::DataSet *data_set) : incorrect(0), correct(0), data_set(data_set) {
8
+ int count = data_set->categories_size();
9
+ counts.reserve(count);
10
+ for(int i = 0; i < count; i++)
11
+ counts.push_back(valarray<int>(0, count));
12
+ }
13
+
14
+ void ConfusionMatrix::add(int predicted, int actual) {
15
+ // category indexes are 1 based
16
+ counts[predicted - 1][actual - 1] += 1;
17
+ if(predicted == actual)
18
+ correct++;
19
+ else
20
+ incorrect++;
21
+ }
22
+
23
+ double ConfusionMatrix::accuracy() {
24
+ return ((double)correct) / (correct + incorrect);
25
+ }
26
+
27
+ double ConfusionMatrix::error() {
28
+ return ((double)incorrect) / (correct + incorrect);
29
+ }
30
+
31
+ // true positive
32
+ int ConfusionMatrix::tp(int category) {
33
+ return counts[category - 1][category - 1];
34
+ }
35
+
36
+ // false positive
37
+ int ConfusionMatrix::fp(int category) {
38
+ return counts[category - 1].sum() - tp(category);
39
+ }
40
+
41
+ // true negative
42
+ int ConfusionMatrix::tn(int category) {
43
+ int sum = 0, count = data_set->categories_size();
44
+ for(int i = 1; i <= count; i++)
45
+ for(int j = 1; j <= count; j++)
46
+ if(i != category && j != category)
47
+ sum += counts[i - 1][j - 1];
48
+ return sum;
49
+ }
50
+
51
+ // false negative
52
+ int ConfusionMatrix::fn(int category) {
53
+ int sum = 0, count = data_set->categories_size();
54
+ for(int i = 1; i <= count; i++)
55
+ if(i != category)
56
+ sum += counts[i - 1][category - 1];
57
+ return sum;
58
+ }
59
+
60
+ double ConfusionMatrix::precision(int category) {
61
+ int denom = tp(category) + fp(category);
62
+ if(denom == 0)
63
+ return 0.0;
64
+ return ((double)tp(category)) / denom;
65
+ }
66
+
67
+ double ConfusionMatrix::recall(int category) {
68
+ int denom = tp(category) + fn(category);
69
+ if(denom == 0)
70
+ return 0.0;
71
+ return ((double)tp(category)) / denom;
72
+ }
73
+
74
+ double ConfusionMatrix::fscore(int category) {
75
+ double p = precision(category);
76
+ double r = recall(category);
77
+ if((p + r) == 0.0)
78
+ return 0.0;
79
+ return (2 * p * r) / (p + r);
80
+ }
81
+
82
+ void ConfusionMatrix::print_summary() {
83
+ // overall counts and summary
84
+ cout.precision(4);
85
+ cout << "== Summary ==" << endl;
86
+ cout << setw(23) <<"Correctly classified:" << setw(12) << right << correct << setw(10) << right << accuracy() * 100 << "%" << endl;
87
+ cout << setw(23) << "Incorrectly classified:" << setw(12) << right << incorrect << setw(10) << right << error() * 100 << "%" << endl;
88
+ cout << setw(23) << "Total classifications:" << setw(12) << right << correct + incorrect << endl << endl;
89
+
90
+ // determine the width of the left (category name) column
91
+ int max_name_length = 0;
92
+ for(int category = 1; category <= data_set->categories_size(); category++)
93
+ if(data_set->category_feature()->names[category].length() > max_name_length)
94
+ max_name_length = data_set->category_feature()->names[category].length();
95
+ if(average_row_name.length() > max_name_length)
96
+ max_name_length = average_row_name.length();
97
+ max_name_length += 1;
98
+
99
+ // detailed category information
100
+ cout << "== Category Performance ==" << endl;
101
+ cout << setw(max_name_length) << "";
102
+ cout << setw(9) << right << "True +";
103
+ cout << setw(9) << right << "False +";
104
+ cout << setw(9) << right << "True -";
105
+ cout << setw(9) << right << "False -";
106
+ cout << setw(9) << right << "Precis.";
107
+ cout << setw(9) << right << "Recall";
108
+ cout << setw(9) << right << "F-score" << endl;
109
+
110
+ for(int category = 1; category <= data_set->categories_size(); category++) {
111
+ cout << setw(max_name_length) << data_set->category_feature()->names[category];
112
+ cout << setw(9) << tp(category);
113
+ cout << setw(9) << fp(category);
114
+ cout << setw(9) << tn(category);
115
+ cout << setw(9) << fn(category);
116
+ cout << setw(8) << precision(category) * 100 << "%";
117
+ cout << setw(8) << recall(category) * 100 << "%";
118
+ cout << setw(8) << fscore(category) * 100 << "%" << endl;
119
+ }
120
+
121
+ cout << setw(max_name_length) << average_row_name;
122
+ cout << setw(9) << avg_tp();
123
+ cout << setw(9) << avg_fp();
124
+ cout << setw(9) << avg_tn();
125
+ cout << setw(9) << avg_fn();
126
+ cout << setw(8) << avg_precision() * 100 << "%";
127
+ cout << setw(8) << avg_recall() * 100 << "%";
128
+ cout << setw(8) << avg_fscore() * 100 << "%" << endl;
129
+ }
@@ -0,0 +1,82 @@
1
+ #ifndef __confusion_matrix__
2
+ #define __confusion_matrix__
3
+ #include "data_set/data_set.h"
4
+ #include <vector>
5
+ #include <valarray>
6
+ #include <iostream>
7
+ #include <iomanip>
8
+ using namespace std;
9
+
10
+ namespace DataSet {
11
+ class Category;
12
+ }
13
+
14
+ class ConfusionMatrix {
15
+ public:
16
+ static const string average_row_name;
17
+ DataSet::DataSet *data_set;
18
+ vector<valarray<int> > counts;
19
+ int correct;
20
+ int incorrect;
21
+
22
+ ConfusionMatrix(DataSet::DataSet *data_set);
23
+ void merge(ConfusionMatrix *other) {
24
+ incorrect += other->incorrect;
25
+ correct += other->correct;
26
+
27
+ for(unsigned int i = 0; i < counts.size(); i++)
28
+ counts[i] += other->counts[i];
29
+ }
30
+
31
+ void add(int predicted, int actual);
32
+ double accuracy();
33
+ double error();
34
+ int tp(int category);
35
+ int fp(int category);
36
+ int tn(int category);
37
+ int fn(int category);
38
+ double precision(int category);
39
+ double recall(int category);
40
+ double fscore(int category);
41
+ void print_summary();
42
+
43
+ // averages
44
+ double avg_tp() {
45
+ return apply<int>(&ConfusionMatrix::tp);
46
+ }
47
+
48
+ double avg_fp() {
49
+ return apply<int>(&ConfusionMatrix::fp);
50
+ }
51
+
52
+ double avg_tn() {
53
+ return apply<int>(&ConfusionMatrix::tn);
54
+ }
55
+
56
+ double avg_fn() {
57
+ return apply<int>(&ConfusionMatrix::fn);
58
+ }
59
+
60
+ double avg_precision() {
61
+ return apply<double>(&ConfusionMatrix::precision);
62
+ }
63
+
64
+ double avg_recall() {
65
+ return apply<double>(&ConfusionMatrix::recall);
66
+ }
67
+
68
+ double avg_fscore() {
69
+ return apply<double>(&ConfusionMatrix::fscore);
70
+ }
71
+
72
+ protected:
73
+ template <class T, class Function>
74
+ double apply(Function func) {
75
+ T result = 0.0;
76
+ for(int category = 1; category <= data_set->categories_size(); category++)
77
+ result += (this->*func)(category);
78
+ return result / ((double)counts.size());
79
+ }
80
+ };
81
+
82
+ #endif
@@ -0,0 +1,29 @@
1
+ #include "model.h"
2
+
3
+ void Model::Model::train(DataSet::Example *example) {
4
+ }
5
+
6
+ void Model::Model::train_text(string text) {
7
+ }
8
+
9
+ int Model::Model::classify(DataSet::Example *example) {
10
+ return classifier->classify(example);
11
+ }
12
+
13
+ int Model::Model::classify_text(string text) {
14
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
15
+ int category = classifier->classify(example);
16
+ delete example;
17
+ return category;
18
+ }
19
+
20
+ vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
21
+ return classifier->rank(example);
22
+ }
23
+
24
+ vector<Classifier::Score> *Model::Model::rank_text(string text) {
25
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
26
+ vector<Classifier::Score> *ranks = classifier->rank(example);
27
+ delete example;
28
+ return ranks;
29
+ }
@@ -0,0 +1,50 @@
1
+ #ifndef __model_h__
2
+ #define __model_h__
3
+ #include "data_set/data_set.h"
4
+ #include "data_set/example.h"
5
+ #include "classifier/classifier.h"
6
+ #include "preprocessing/text/text_pipeline.h"
7
+
8
+ namespace Model {
9
+ class Model {
10
+ public:
11
+ DataSet::DataSet *data_set;
12
+ Classifier::Classifier *classifier;
13
+ Preprocessing::Text::TextPipeline *text_pipeline;
14
+
15
+ Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
16
+
17
+ void train(DataSet::Example *example);
18
+ void train_text(string text);
19
+ int classify(DataSet::Example *example);
20
+ int classify_text(string text);
21
+ vector<Classifier::Score> *rank(DataSet::Example *example);
22
+ vector<Classifier::Score> *rank_text(string example);
23
+
24
+ void set_data_set(DataSet::DataSet *ds) {
25
+ data_set = ds;
26
+ }
27
+
28
+ DataSet::DataSet *get_data_set() {
29
+ return data_set;
30
+ }
31
+
32
+ void set_classifier(Classifier::Classifier *c) {
33
+ classifier = c;
34
+ }
35
+
36
+ Classifier::Classifier *get_classifier() {
37
+ return classifier;
38
+ }
39
+
40
+ void set_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
41
+ text_pipeline = pipeline;
42
+ }
43
+
44
+ Preprocessing::Text::TextPipeline *get_text_pipeline() {
45
+ return text_pipeline;
46
+ }
47
+ };
48
+ }
49
+
50
+ #endif
@@ -0,0 +1,20 @@
1
+ #ifndef __example_preprocessor_h__
2
+ #define __example_preprocessor_h__
3
+ #include "data_set/example.h"
4
+
5
+ namespace Preprocessing {
6
+ namespace Examples {
7
+
8
+ class ExamplePreprocessor {
9
+ public:
10
+ virtual void process(DataSet::Example *example) {}
11
+ void process_data_set(DataSet::DataSet *data_set) {
12
+ for(vector<DataSet::Example *>::iterator example = data_set->examples.begin(); example != data_set->examples.end(); example++)
13
+ process(*example);
14
+ }
15
+ };
16
+
17
+ }
18
+ }
19
+
20
+ #endif