thera 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
@@ -0,0 +1,40 @@
1
+ #include "multinomial_bayes_classifier.h"
2
+ #include "data_set/data_set.h"
3
+ #include <math.h>
4
+
5
+ double Classifier::MultinomialBayesClassifier::score(DataSet::Category *category, DataSet::Example *example) {
6
+
7
+ }
8
+
9
+ void Classifier::MultinomialBayesClassifier::prepare() {
10
+ numeric_feature_probabilities.resize(data_set->categories_size() + 1);
11
+ nominal_feature_probabilities.resize(data_set->categories_size() + 1);
12
+ DataSet::NumericFeature *numeric_feature = NULL;
13
+ DataSet::NominalFeature *nominal_feature = NULL;
14
+ int feature_count = data_set->features.size();
15
+ double category_sum = 0.0;
16
+
17
+ data_set->count();
18
+
19
+ // determine the category probabilities for each feature
20
+ for(int i = 1; i <= data_set->categories_size(); i++) {
21
+ numeric_feature_probabilities[i].reserve(feature_count);
22
+ nominal_feature_probabilities[i].reserve(feature_count);
23
+
24
+ // sum the counts of each numeric feature for this category
25
+ category_sum = 0.0
26
+ for(int j = 0; j < numeric_features.size(); j++)
27
+ category_sum += numeric_features[j]->category_sum(i);
28
+ category_sum += numeric_features.size();
29
+
30
+ // weight each numeric feature only by the number of other numeric features; nominal features are handled separately
31
+ for(int j = 0; j < numeric_features.size(); j++)
32
+ numeric_feature_probabilities[i][j] = (1.0 + numeric_features[j]->category_sum(i)) / (category_sum);
33
+
34
+ // each value of a nominal feature is treated as if it were another feature in itself
35
+ for(int j = 0; j < nominal_features.size(); j++) {
36
+ nominal_feature = nominal_features[j];
37
+ nominal_feature_probabilities[j].resize(nominal_feature->values.size());
38
+ }
39
+ }
40
+ }
@@ -0,0 +1,18 @@
1
+ #ifndef __multinomial_bayes_classifier_h__
2
+ #define __multinomial_bayes_classifier_h__
3
+ #include "classifier/classifier.h"
4
+ #include <vector>
5
+
6
namespace Classifier {
  // Multinomial naive Bayes classifier. prepare() fills the probability
  // tables from the data set's counts; score() evaluates one example
  // against one category.
  class MultinomialBayesClassifier : public Classifier {
    // [category][numeric feature] -> probability of the feature in the category.
    // NOTE(review): layout inferred from prepare(); confirm against the .cpp.
    vector<vector<double> > numeric_feature_probabilities;
    // [category][nominal feature][value] -> probability of the value in the category
    vector<vector<vector<double> > > nominal_feature_probabilities;

  public:
    // Does not take ownership of data_set.
    MultinomialBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
    // Score of example under category; requires prepare() to have run.
    double score(DataSet::Category *category, DataSet::Example *example);
    // Builds the probability tables; call before score().
    void prepare();
  };
}
17
+
18
+ #endif
@@ -0,0 +1,80 @@
1
+ #include "naive_bayes_classifier.h"
2
+ #include "data_set/data_set.h"
3
+ #include "data_set/dense/dense_data_set.h"
4
+ #include "storage/binary.h"
5
+ #include <math.h>
6
+
7
// Creates a fresh, unprepared NaiveBayesClassifier bound to new_data_set;
// the caller owns the returned object.
Classifier::NaiveBayesClassifier *Classifier::NaiveBayesClassifier::clone(DataSet::DataSet *new_data_set) {
  return new NaiveBayesClassifier(new_data_set);
}
10
+
11
+
12
// Precomputes the per-(category, numeric feature) Gaussian terms used by
// score(): denom = 2 * variance and lhs = 1 / sqrt(2 * PI * variance),
// plus the category prior probabilities. Category ids are 1-based, so the
// cache has one unused row at index 0.
void Classifier::NaiveBayesClassifier::prepare() {
  feature_caches.resize(data_set->categories_size() + 1);
  int features_size = data_set->features_size();
  DataSet::NumericFeature *feature = NULL;

  data_set->count();
  category_probabilities = data_set->category_feature()->probabilities;

  for(int i = 1; i <= data_set->categories_size(); i++) {
    feature_caches[i].resize(features_size);

    for(int j = 0; j < features_size; j++) {
      // only numeric features have Gaussian caches; nominal ones are skipped
      if(!numeric_features[j])
        continue;
      feature = (DataSet::NumericFeature *) data_set->features[j];
      feature_caches[i][j].denom = 2 * feature->category_variance(i);
      // NOTE(review): a zero variance makes lhs infinite here; score() guards
      // on denom != 0.0 so the value is never used in that case — confirm.
      feature_caches[i][j].lhs = 1 / sqrt(2 * M_PI * feature->category_variance(i));
    }
  }
}
32
+
33
+
34
// Unnormalised naive Bayes score of `example` under `category`: the product
// of per-feature likelihoods times the category prior. The probability
// accumulator starts at 0.0 and is promoted to 1.0 by the first scored
// feature (see score_numeric_feature), so an example with no scorable
// features returns 0.
double Classifier::NaiveBayesClassifier::score(int category, DataSet::Example *example) {
  DataSet::SparseExample *sparse_example = NULL;
  DataSet::SparseExample::Value *sparse_value = NULL;
  DataSet::DenseExample *dense_example = NULL;
  double dense_value = 0.0, probability = 0.0;

  // sparse examples iterate only their stored values; dense examples visit
  // every feature slot
  if(typeid(*example) == typeid(DataSet::SparseExample)) {
    sparse_example = (DataSet::SparseExample *) example;
    for(int i = 0; i < example->size; i++) {
      sparse_value = &(sparse_example->values[i]);
      if(numeric_features[sparse_value->index])
        score_numeric_feature(sparse_value->index, sparse_value->value, category, &probability);
      else if(sparse_value->index != data_set->category_index)  // never score the label column itself
        score_nominal_feature(sparse_value->index, sparse_value->value, category, &probability);
    }

  } else {
    dense_example = (DataSet::DenseExample *) example;
    for(int i = 0; i < example->size; i++) {
      dense_value = dense_example->get_value(i);
      if(numeric_features[i])
        score_numeric_feature(i, dense_value, category, &probability);
      else if(i != data_set->category_index)  // never score the label column itself
        score_nominal_feature(i, dense_value, category, &probability);
    }
  }

  return probability * category_probabilities[category];
}
63
+
64
+ void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
65
+ int category_count = feature_caches.size();
66
+ file->write_int(category_count);
67
+ file->write_vector<double>(&category_probabilities);
68
+
69
+ for(int i = 1; i <= category_count; i++)
70
+ file->write_vector<NumericFeatureCache>(&feature_caches[i]);
71
+ }
72
+
73
// Restores state written by write_binary(): category count, category priors,
// then one cache vector per category (1-based; slot 0 left empty).
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
  int category_count = file->read_int();
  feature_caches.resize(category_count + 1);
  // NOTE(review): if read_vector() returns a heap-allocated vector that the
  // caller owns, these copy-and-discard uses leak the originals — confirm
  // Storage::Binary's ownership contract.
  category_probabilities = *(file->read_vector<double>());

  for(int i = 1; i <= category_count; i++)
    feature_caches[i] = *(file->read_vector<NumericFeatureCache>());
}
@@ -0,0 +1,52 @@
1
+ #ifndef __naive_bayes_classifier_h__
2
+ #define __naive_bayes_classifier_h__
3
+ #include "classifier/classifier.h"
4
+ #include <vector>
5
+
6
namespace Classifier {
  // Naive Bayes classifier: nominal features contribute per-category value
  // probabilities; numeric features contribute a Gaussian density whose
  // per-(category, feature) terms are cached by prepare().
  class NaiveBayesClassifier : public Classifier {
    // Cached Gaussian terms for one (category, numeric feature) pair.
    typedef struct {
      double denom;  // 2 * variance of the feature within the category
      double lhs;    // 1 / sqrt(2 * PI * variance)
    } NumericFeatureCache;
    vector<vector<NumericFeatureCache> > feature_caches; // features_caches[category_index][numeric_feature]
    vector<double> category_probabilities;  // prior probability of each category (1-based)

    // Multiplies *probability by the Gaussian density of `value`:
    // (1 / sqrt(2PI * var)) * e^(-((value - mean) ^ 2) / (2 * var))
    void score_numeric_feature(int index, double value, int category, double *probability) {
      DataSet::NumericFeature *numeric_feature = (DataSet::NumericFeature *) data_set->features[index];
      // a zero denom means zero variance: skip the feature instead of dividing by zero
      if(feature_caches[category][index].denom != 0.0) {
        double numerator = -1 * pow(value - numeric_feature->category_mean(category), 2);
        // promote the accumulator from "no factors yet" (0.0) to the
        // multiplicative identity before applying the first factor
        if(*probability == 0.0)
          *probability = 1.0;
        *probability = *probability * (feature_caches[category][index].lhs * exp(numerator / feature_caches[category][index].denom));
      }
    }

    // Multiplies *probability by P(value | category) for a nominal feature.
    void score_nominal_feature(int index, double value, int category, double *probability) {
      DataSet::NominalFeature *nominal_feature = (DataSet::NominalFeature *) data_set->features[index];
      *probability = *probability * nominal_feature->category_value_probability(category, (int)value);
    }

  public:
    // Type marker written into serialised model files to identify this classifier.
    static const uint32_t file_mark = 'naiv';
    // Does not take ownership of data_set.
    NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
    // Fresh, unprepared classifier over new_data_set; caller owns the result.
    NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);

    // Unnormalised P(category | example); see the .cpp for details.
    double score(int category, DataSet::Example *example);
    // Builds the Gaussian caches and category priors; call before score().
    void prepare();
    void write_binary(Storage::Binary *file);
    void read_binary(Storage::Binary *file);
    uint32_t mark() { return file_mark; }

    // Dumps the cached Gaussian terms for every category/feature pair.
    void print() {
      cout << "NB:" << endl;
      for(unsigned int i = 0; i < feature_caches.size(); i++) {
        for(unsigned int j = 0; j < feature_caches[i].size(); j++)
          cout << "C" << i << "F" << j << ":" << feature_caches[i][j].denom << ", l:" << feature_caches[i][j].lhs << endl;
      }
    }
  };
}
51
+
52
+ #endif
@@ -0,0 +1,130 @@
1
+ #include <iostream>
2
+ #include "data_set.h"
3
+ #include "classifier/classifier.h"
4
+ #include "metrics/confusion_matrix.h"
5
+ #include "dense/dense_data_set.h"
6
+ #include "dense/dense_example.h"
7
+ #include "sparse/sparse_data_set.h"
8
+ #include "sparse/sparse_example.h"
9
+
10
+ DataSet::NominalFeature *DataSet::DataSet::new_nominal_feature(string name) {
11
+ NominalFeature *feature = new NominalFeature(name, features.size());
12
+ feature_names[name] = feature;
13
+ features.push_back(feature);
14
+ return feature;
15
+ }
16
+
17
+
18
+ DataSet::NumericFeature *DataSet::DataSet::new_numeric_feature(string name) {
19
+ NumericFeature *feature = new NumericFeature(name, features.size());
20
+ feature_names[name] = feature;
21
+ features.push_back(feature);
22
+ return feature;
23
+ }
24
+
25
+
26
+ void DataSet::DataSet::set_category_index(int index) {
27
+ category_index = index;
28
+ for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
29
+ (*feature)->reset();
30
+ counted = false;
31
+ indexed = false;
32
+ }
33
+
34
+
35
+ void DataSet::DataSet::count() {
36
+ if(counted)
37
+ return;
38
+
39
+ // initialise each feature for counting
40
+ for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
41
+ (*feature)->prepare_for_counting(this);
42
+
43
+ // implementation optimised count
44
+ perform_count();
45
+
46
+ // calculate and finalise counts
47
+ for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
48
+ (*feature)->finalise_counting(this);
49
+
50
+ counted = true;
51
+ }
52
+
53
+
54
+ void DataSet::DataSet::index() {
55
+ if(indexed)
56
+ return;
57
+
58
+ // initialise each feature for indexing
59
+ for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
60
+ (*feature)->prepare_for_indexing(this);
61
+
62
+ // index
63
+ for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
64
+ for(int i = 0; i < features.size(); i++)
65
+ features[i]->index_example((*example)->get_value(i), *example);
66
+ }
67
+
68
+ // finalise indexing
69
+ for(vector<Feature *>::iterator feature = features.begin(); feature < features.end(); feature++)
70
+ (*feature)->finalise_indexing(this);
71
+
72
+ indexed = true;
73
+ }
74
+
75
+
76
+ // TODO: this isn't really stratification; categories should be proportionally represented
77
+ vector<vector<DataSet::Example *> > *DataSet::DataSet::stratify(int number_of_folds) {
78
+ int examples_per_fold = examples.size() / number_of_folds;
79
+ Example *example;
80
+ vector<vector<Example *> > *folds = new vector<vector<Example *> >(number_of_folds, vector<Example *>(examples_per_fold, NULL));
81
+
82
+ for(int fold = 0; fold < number_of_folds; fold++) {
83
+ for(int i = 0; i < examples_per_fold; i++) {
84
+ (*folds)[fold][i] = examples[fold + (i * number_of_folds)];
85
+ }
86
+ }
87
+
88
+ return folds;
89
+ }
90
+
91
+
92
+ ConfusionMatrix *DataSet::DataSet::cross_fold_validation(Classifier::Classifier *classifier, int number_of_folds) {
93
+ vector<vector<Example *> > *folds = stratify(number_of_folds);
94
+ ConfusionMatrix *overall_matrix = new ConfusionMatrix(this);
95
+ Classifier::Classifier *test_classifier = NULL;
96
+ ConfusionMatrix *result = NULL;
97
+ DataSet *test_set = NULL;
98
+ Example *example = NULL;
99
+ int predicted = 0;
100
+
101
+ int examples_per_fold = examples.size() / number_of_folds;
102
+ int examples_per_test = examples_per_fold * (number_of_folds - 1);
103
+
104
+ for(int fold = 0; fold < number_of_folds; fold++) {
105
+ cout << "Running fold " << fold << endl;
106
+ test_set = clone_without_examples();
107
+ test_set->examples.reserve(examples_per_test);
108
+ for(int i = 0; i < number_of_folds; i++) {
109
+ if(i != fold)
110
+ test_set->examples.insert(test_set->examples.begin() + (examples_per_fold * (i > fold ? i - 1 : i)), (*folds)[i].begin(), (*folds)[i].end());
111
+ }
112
+
113
+ test_classifier = classifier->clone(test_set);
114
+ result = new ConfusionMatrix(test_set);
115
+ test_classifier->prepare();
116
+
117
+ for(int i = 0; i < examples_per_fold; i++) {
118
+ example = (*folds)[fold][i];
119
+ predicted = test_classifier->classify(example);
120
+ result->add(predicted, (int)example->get_value(category_index));
121
+ }
122
+
123
+ overall_matrix->merge(result);
124
+ delete test_classifier;
125
+ delete test_set;
126
+ delete result;
127
+ }
128
+
129
+ return overall_matrix;
130
+ }
@@ -0,0 +1,78 @@
1
+ #ifndef __data_set_h__
2
+ #define __data_set_h__
3
+ #include <tr1/unordered_map>
4
+ #include <vector>
5
+ #include <string>
6
+ #include "example.h"
7
+ #include "feature.h"
8
+ #include "features/numeric_feature.h"
9
+ #include "features/nominal_feature.h"
10
+ using namespace std;
11
+
12
+ namespace Classifier { class Classifier; }
13
+ class ConfusionMatrix;
14
+
15
namespace DataSet {
  // In-memory data set: a schema (ordered features) plus examples, with
  // per-feature counts and value indexes built lazily by count() / index().
  class DataSet {
    // Storage-specific hooks overridden by the dense/sparse subclasses.
    virtual void perform_count() {}
    virtual void perform_index() {}
  public:
    DataSet() : category_index(-1), counted(false), indexed(false) {}
    // Copies the name and schema (cloned features) from `other` but none of
    // its examples; counts/indexes start invalidated.
    DataSet(DataSet *other) : name(other->name), category_index(other->category_index), counted(false), indexed(false) {
      Feature *feature = NULL;
      for(unsigned int i = 0; i < other->features.size(); i++) {
        feature = other->features[i]->clone();
        feature_names[feature->name] = feature;
        features.push_back(feature);
      }
    }
    // Overridden by subclasses to return an empty set sharing this schema.
    virtual DataSet *clone_without_examples() { return NULL; }

    tr1::unordered_map<string, Feature *> feature_names;  // name -> feature lookup
    vector<Feature *> features;   // schema, ordered by feature index
    vector<Example *> examples;
    string name;
    int category_index;           // schema index of the class-label feature; -1 if unset
    bool counted;                 // whether count() results are current
    bool indexed;                 // whether index() results are current

    NumericFeature *new_numeric_feature(string name);
    NominalFeature *new_nominal_feature(string name);
    // Overridden by subclasses to allocate a storage-appropriate example.
    virtual Example *new_example() { return NULL; }

    // Lazily build per-feature statistics / per-value example indexes.
    void count();
    void index();

    // Fold splitting and k-fold evaluation; see the .cpp. Callers own the
    // returned containers/matrices.
    vector<vector<Example *> > *stratify(int number_of_folds);
    ConfusionMatrix *cross_fold_validation(Classifier::Classifier *classifier, int number_of_folds);

    string get_name() { return name; }
    void set_name(string new_name) { name = new_name; }
    int get_category_index() { return category_index; }
    int features_size() { return features.size(); }
    int examples_size() { return examples.size(); }
    // Number of distinct category values (category ids are 1-based).
    int categories_size() { return ((NominalFeature *)features[category_index])->indexes.size(); }
    NominalFeature *category_feature() { return (NominalFeature *)features[category_index]; }
    // Changes the label feature and invalidates cached counts/indexes.
    void set_category_index(int index);

    Feature *get_feature_by_name(string name) { return feature_names[name]; }
    Feature *get_feature_by_index(int index) { return features[index]; }
    Example *get_example_by_index(int index) { return examples[index]; }

    // Finds the named numeric feature, creating it if absent.
    NumericFeature *get_or_create_numeric_feature_by_name(string name) {
      NumericFeature *feature = (NumericFeature *)feature_names[name];
      if(feature == NULL)
        feature = new_numeric_feature(name);
      return feature;
    }

    // Finds the named nominal feature, creating it if absent.
    NominalFeature *get_or_create_nominal_feature_by_name(string name) {
      NominalFeature *feature = (NominalFeature *)feature_names[name];
      if(feature == NULL)
        feature = new_nominal_feature(name);
      return feature;
    }
  };
}
77
+
78
+ #endif
@@ -0,0 +1,39 @@
1
+ #ifndef __dense_data_set_h__
2
+ #define __dense_data_set_h__
3
+ #include "dense_example.h"
4
+
5
+ namespace DataSet {
6
+ class DenseDataSet : public DataSet {
7
+ void perform_count() {
8
+ int example_category_index = 0;
9
+ double value = 0.0;
10
+
11
+ for(vector<Example *>::iterator example = examples.begin(); example < examples.end(); example++) {
12
+ example_category_index = (int)((*example)->get_value(category_index));
13
+ for(unsigned int i = 0; i < features.size(); i++) {
14
+ value = (*example)->get_value(i);
15
+ features[i]->count_example(value, example_category_index);
16
+ }
17
+ }
18
+ }
19
+
20
+ void perform_index() {
21
+ }
22
+
23
+ public:
24
+ DenseDataSet() : DataSet() {}
25
+ DenseDataSet(DataSet *other) : DataSet(other) {}
26
+
27
+ DenseDataSet *clone_without_examples() {
28
+ return new DenseDataSet(this);
29
+ }
30
+
31
+ DenseExample *new_example() {
32
+ DenseExample *example = new DenseExample(features.size());
33
+ examples.push_back(example);
34
+ return example;
35
+ }
36
+ };
37
+ }
38
+
39
+ #endif
@@ -0,0 +1,44 @@
1
+ #include <stdexcept>
2
+ #ifndef __dense_data_set_example_h__
3
+ #define __dense_data_set_example_h__
4
+ #include "dense_data_set.h"
5
+ #include <iostream>
6
+
7
namespace DataSet {
  // Example stored as a flat array of doubles, one slot per feature.
  class DenseExample : public Example {
  public:
    double *values;  // owned C array of `size` doubles

    DenseExample(int size) : Example(size) {
      // calloc zero-fills, so every feature value starts at 0.0
      values = (double *) calloc(size, sizeof(double));
    }

    ~DenseExample() {
      free(values);
    }

    // NOTE(review): `values` is a raw owning pointer and no copy
    // constructor/assignment is defined, so copying a DenseExample would
    // double-free the array — confirm instances are only handled by pointer.

    double get_value(int index) {
      return values[index];
    }

    void set_value(int index, double new_value) {
      values[index] = new_value;
    }

    // Distance metrics are placeholders: always 0.0 for dense examples.
    double euclidean_distance(Example *other_example) {
      return 0.0;
    }

    double cosine_distance(Example *other_example) {
      return 0.0;
    }

    // Writes all values comma-separated on a single line.
    void print() {
      for(int i = 0; i < size; i++)
        cout << values[i] << ",";
      cout << endl;
    }
  };
}
43
+
44
+ #endif
@@ -0,0 +1,10 @@
1
+ #include "data_set.h"
2
+ #include "example.h"
3
+
4
// Returns the example's category id: its value at the data set's label column.
int DataSet::Example::category_index(DataSet *data_set) {
  return (int)get_value(data_set->category_index);
}
7
+
8
// Stores `index` as the example's category id in the data set's label column.
void DataSet::Example::set_category_index(DataSet *data_set, int index) {
  set_value(data_set->category_index, index);
}
@@ -0,0 +1,23 @@
1
+ #ifndef __example_h__
2
+ #define __example_h__
3
+
4
namespace DataSet {
  class DataSet;
  // Abstract base for one data row; dense/sparse subclasses decide how the
  // `size` feature values are stored.
  class Example {
  public:
    int size;  // number of feature slots

    Example(int size) : size(size) {}
    virtual ~Example() {}

    // Category-id helpers that read/write the data set's label column.
    int category_index(DataSet *data_set);
    void set_category_index(DataSet *data_set, int index);
    // Value accessors; the base implementations are no-op placeholders.
    virtual double get_value(int index) { return 0.0; }
    virtual void set_value(int index, double new_value) {}
    // Distance metrics between examples (0.0 in the base class).
    virtual double euclidean_distance(Example *other_example) { return 0.0; }
    virtual double cosine_distance(Example *other_example) { return 0.0; }
    virtual void print() {}
  };
}
22
+
23
+ #endif
@@ -0,0 +1,36 @@
1
+ #ifndef __feature_h__
2
+ #define __feature_h__
3
+ #include <string>
4
+ using namespace std;
5
+
6
+ namespace DataSet {
7
+ class DataSet;
8
+
9
+ class Feature {
10
+ public:
11
+ string name;
12
+ int index;
13
+
14
+ string get_name() { return name; }
15
+ int get_index() { return index; }
16
+ void set_name(string new_name) { name = new_name; }
17
+ void set_index(int new_index) { index = new_index; }
18
+
19
+ Feature(string name, int index) : name(name), index(index) {}
20
+ virtual Feature *clone() { return NULL; }
21
+ virtual void reset() {}
22
+ virtual void print() {}
23
+
24
+ // counting
25
+ virtual void prepare_for_counting(DataSet *data_set) {}
26
+ virtual void count_example(double value, int category_index) {}
27
+ virtual void finalise_counting(DataSet *data_set) {}
28
+
29
+ // indexing
30
+ virtual void prepare_for_indexing(DataSet *data_set) {}
31
+ virtual void index_example(double value, Example *example) {}
32
+ virtual void finalise_indexing(DataSet *data_set) {}
33
+ };
34
+ }
35
+
36
+ #endif
@@ -0,0 +1,57 @@
1
+ #include "nominal_feature.h"
2
+ #include "data_set/data_set.h"
3
+ #include "data_set/example.h"
4
+
5
+
6
// Sizes the frequency/probability tables: one row per category (category ids
// are 1-based, so row 0 is allocated but unused) with one slot per known value.
void DataSet::NominalFeature::prepare_for_counting(DataSet *data_set) {
  int categories_count = data_set->categories_size();
  int values_count = names.size();

  category_frequencies.resize(categories_count + 1);
  category_probabilities.resize(categories_count + 1);
  probabilities.resize(values_count);
  frequencies.resize(values_count);

  for(int i = 1; i <= categories_count; i++) {
    category_frequencies[i].resize(values_count);
    category_probabilities[i].resize(values_count);
  }
}
20
+
21
+ void DataSet::NominalFeature::count_example(double value, int category_index) {
22
+ frequencies[(int)value] += 1;
23
+ category_frequencies[category_index][(int)value] += 1;
24
+ }
25
+
26
// Converts raw frequencies into probabilities with add-one (Laplace)
// smoothing on the numerators.
// NOTE(review): the denominators are not increased by the value count, so the
// smoothed probabilities do not sum exactly to 1 — confirm this is intended.
void DataSet::NominalFeature::finalise_counting(DataSet *data_set) {
  int categories_count = data_set->categories_size();
  int examples_count = data_set->examples.size();
  int values_count = names.size();

  // overall value probabilities
  for(int i = 0; i < values_count; i++)
    probabilities[i] = ((double)frequencies[i] + 1) / examples_count;

  // value probabilities per category; the category feature itself gets no
  // per-category distribution
  if(index != data_set->category_index) {
    for(int i = 1; i <= categories_count; i++) {
      for(int j = 0; j < values_count; j++)
        category_probabilities[i][j] = ((double)category_frequencies[i][j] + 1) / data_set->category_feature()->value_frequency(i);
    }
  }
}
43
+
44
// Allocates one example bucket per known value before indexing begins.
void DataSet::NominalFeature::prepare_for_indexing(DataSet *data_set) {
  examples_with_value.resize(names.size());
}
47
+
48
// Records that `example` carries this nominal value (nominal values arrive
// as doubles but are really integer value ids).
void DataSet::NominalFeature::index_example(double value, Example *example) {
  examples_with_value[(int)value].push_back(example);
}
51
+
52
// Nominal features need no post-indexing pass.
void DataSet::NominalFeature::finalise_indexing(DataSet *data_set) {}
53
+
54
+ void DataSet::NominalFeature::print() {
55
+ for(int i = 1; i < names.size(); i++)
56
+ cout << i << ": " << names[i] << endl;
57
+ }