RubyGems - thera - Versions diffs - 0.0.1 - Mend

thera 0.0.1

Files changed (89) hide show

data/.document +5 -0
data/.gitignore +56 -0
data/Gemfile +2 -0
data/Gemfile.lock +20 -0
data/LICENSE.txt +1 -0
data/README.rdoc +8 -0
data/Rakefile +1 -0
data/ext/Makefile +225 -0
data/ext/extconf.rb +29 -0
data/ext/quarry/quarry_toolkit.cpp +148 -0
data/lib/quarry/Makefile.linux +2 -0
data/lib/quarry/Makefile.osx +6 -0
data/lib/quarry/Makefile.targets +23 -0
data/lib/quarry/obj/.gitkeep +0 -0
data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
data/lib/quarry/src/classifier/classifier.cpp +32 -0
data/lib/quarry/src/classifier/classifier.h +59 -0
data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
data/lib/quarry/src/data_set/data_set.cpp +130 -0
data/lib/quarry/src/data_set/data_set.h +78 -0
data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
data/lib/quarry/src/data_set/example.cpp +10 -0
data/lib/quarry/src/data_set/example.h +23 -0
data/lib/quarry/src/data_set/feature.h +36 -0
data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
data/lib/quarry/src/model/model.cpp +29 -0
data/lib/quarry/src/model/model.h +50 -0
data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
data/lib/quarry/src/quarry.cpp +1 -0
data/lib/quarry/src/quarry.h +29 -0
data/lib/quarry/src/storage/arff.cpp +198 -0
data/lib/quarry/src/storage/arff.h +26 -0
data/lib/quarry/src/storage/binary.cpp +457 -0
data/lib/quarry/src/storage/binary.h +79 -0
data/lib/quarry/src/storage/folders.cpp +98 -0
data/lib/quarry/src/storage/folders.h +25 -0
data/lib/quarry/src/storage/storage.h +19 -0
data/lib/quarry/src/test.cpp +6 -0
data/lib/quarry_rb/classifier/classifier.rb +22 -0
data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
data/lib/quarry_rb/confusion_matrix.rb +58 -0
data/lib/quarry_rb/data_set/data_set.rb +42 -0
data/lib/quarry_rb/data_set/example.rb +33 -0
data/lib/quarry_rb/data_set/feature.rb +28 -0
data/lib/quarry_rb/enumerable_helper.rb +32 -0
data/lib/quarry_rb/model/model.rb +56 -0
data/lib/quarry_rb/storage/arff.rb +11 -0
data/lib/quarry_rb/storage/binary.rb +23 -0
data/lib/quarry_rb/storage/folders.rb +11 -0
data/lib/quarry_rb/text_pipeline.rb +16 -0
data/lib/thera.rb +20 -0
data/test/helper.rb +19 -0
data/test/test_quarry.rb +33 -0
data/thera.gemspec +21 -0
metadata +148 -0

data/lib/quarry/src/data_set/features/nominal_feature.h ADDED Viewed

@@ -0,0 +1,76 @@
+#ifndef __nominal_feature_h__
+#define __nominal_feature_h__
+#include "data_set/example.h"
+#include "data_set/feature.h"
+#include <vector>
+#include <map>
+#include <iostream>
+namespace DataSet {
+  class DataSet;
+  // A feature whose values come from a set of named categories. Examples store
+  // each value as a 1 based integer index; this class maps between value names
+  // and indexes and holds the frequency tables and the example index that are
+  // filled by the counting and indexing passes.
+  class NominalFeature : public Feature {
+  public:
+    // names starts with one empty placeholder so index 0 never refers to a real value
+    NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
+    // copies the name, index and value mappings of other; count tables are not copied
+    NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
+    // returns a heap allocated copy built with the constructor above; the caller owns it
+    NominalFeature *clone() {
+      return new NominalFeature(this);
+    }
+    // discards all counts and the example index; the value mappings are kept
+    void reset() {
+      frequencies.clear();
+      probabilities.clear();
+      category_frequencies.clear();
+      category_probabilities.clear();
+      examples_with_value.clear();
+    }
+    void print();
+    // nominal values are referenced by index in examples
+    // (e.g. "CategoryA" -> 2 would be stored as 2 in an example)
+    map<string, int>  indexes;
+    vector<string>    names;
+    // registers name under the next free index. a name that is already
+    // registered keeps its index: re-adding it previously appended a duplicate
+    // to names and assigned an index that the next new value received as well
+    void add_value(string name) {
+      if(indexes.find(name) != indexes.end())
+        return;
+      int index = indexes.size() + 1;
+      indexes[name] = index;
+      names.push_back(name);
+    }
+    // returns the index of name, registering it first when it is unknown.
+    // operator[] inserts a 0 entry for an unknown name, so 0 doubles as the
+    // "not registered" marker and indexes.size() already counts the new entry
+    int value_index(string name) {
+      int index = indexes[name];
+      if(index == 0) {
+        index = indexes.size();
+        indexes[name] = index;
+        names.push_back(name);
+      }
+      return index;
+    }
+    // counts
+    void prepare_for_counting(DataSet *data_set);
+    void count_example(double value, int category_index);
+    void finalise_counting(DataSet *data_set);
+    // counts for this feature over the entire data set, addressed by value index (not bounds checked)
+    vector<int>     frequencies;
+    vector<double>  probabilities;
+    int    value_frequency(int index)   { return frequencies[index]; }
+    double value_probability(int index) { return probabilities[index]; }
+    // counts for this feature per category, addressed as [category][value index] (not bounds checked)
+    vector<vector<int> >    category_frequencies;
+    vector<vector<double> > category_probabilities;
+    int    category_value_frequency(int category, int index)   { return category_frequencies[category][index]; }
+    double category_value_probability(int category, int index) { return category_probabilities[category][index]; }
+    // indexes
+    void prepare_for_indexing(DataSet *data_set);
+    void index_example(double value, Example *example);
+    void finalise_indexing(DataSet *data_set);
+    // presumably one list of examples per value index, filled by index_example; defined outside this header
+    vector<vector<Example *> > examples_with_value;
+  };
+}
+#endif

data/lib/quarry/src/data_set/features/numeric_feature.cpp ADDED Viewed

@@ -0,0 +1,69 @@
+#include "data_set/data_set.h"
+#include "data_set/example.h"
+#include "numeric_feature.h"
+#include "stdlib.h"
+// Allocates the zero initialised per category accumulators: one Counts slot
+// for every index from 0 up to and including data_set->categories_size().
+void DataSet::NumericFeature::prepare_for_counting(DataSet *data_set) {
+  // release a table left over from an earlier counting pass so that calling
+  // this twice without reset() does not leak (free(NULL) is a no-op)
+  free(category_counts);
+  // calloc takes the element count first and the element size second
+  category_counts = (Counts *) calloc(data_set->categories_size() + 1, sizeof(Counts));
+}
+// Folds one observed value into the running totals, both for the whole data
+// set (counts) and for the given category (category_counts[category_index]).
+// NOTE(review): the accumulators start zeroed (memset in reset, calloc in
+// prepare_for_counting), so min can never end above 0 and max never below 0;
+// confirm that this is intended.
+void DataSet::NumericFeature::count_example(double value, int category_index) {
+  Counts &category = category_counts[category_index];
+  const double squared = value * value;
+  // occurrences of a non zero value
+  if(value != 0.0) {
+    ++counts.non_zero_count;
+    ++category.non_zero_count;
+  }
+  // totals over the entire data set
+  if(value < counts.min)
+    counts.min = value;
+  if(value > counts.max)
+    counts.max = value;
+  counts.sum += value;
+  counts.sq_sum += squared;
+  // totals for this category only
+  if(value < category.min)
+    category.min = value;
+  if(value > category.max)
+    category.max = value;
+  category.sum += value;
+  category.sq_sum += squared;
+}
+// Derives mean and variance from the accumulated sums, for the whole data set
+// and for every category 1..categories_size(). The variance is computed as
+// E[x^2] - E[x]^2. A data set without examples, or a category without
+// examples, gets a mean and variance of 0 rather than the NaN a division by
+// zero would produce.
+void DataSet::NumericFeature::finalise_counting(DataSet *data_set) {
+  int categories_count = data_set->categories_size();
+  int examples_count = data_set->examples.size();
+  // entire data set
+  if(examples_count > 0) {
+    counts.mean = counts.sum / examples_count;
+    counts.variance = (counts.sq_sum / examples_count) - (counts.mean * counts.mean);
+  } else {
+    counts.mean = 0.0;
+    counts.variance = 0.0;
+  }
+  // per category; the divisor is the number of examples in that category
+  for(int i = 1; i <= categories_count; i++) {
+    int frequency = data_set->category_feature()->value_frequency(i);
+    if(frequency > 0) {
+      category_counts[i].mean = category_counts[i].sum / frequency;
+      category_counts[i].variance = (category_counts[i].sq_sum / frequency) - (category_counts[i].mean * category_counts[i].mean);
+    } else {
+      category_counts[i].mean = 0.0;
+      category_counts[i].variance = 0.0;
+    }
+  }
+}
+// Indexing needs no set up for numeric features.
+void DataSet::NumericFeature::prepare_for_indexing(DataSet *data_set) {
+  // intentionally empty
+}
+// Remembers example in non_zero_examples when its value for this feature is
+// non zero; examples with a value of exactly 0.0 are not indexed.
+void DataSet::NumericFeature::index_example(double value, Example *example) {
+  if(value == 0.0)
+    return;
+  non_zero_examples.push_back(example);
+}
+// Indexing needs no clean up for numeric features.
+void DataSet::NumericFeature::finalise_indexing(DataSet *data_set) {
+  // intentionally empty
+}
+// Writes "F<index>, <name>" to cout, then the counts for the whole data set,
+// then one "C<i>:" line of counts for every slot 0..categories_size().
+// category_counts must have been allocated by prepare_for_counting.
+void DataSet::NumericFeature::print(DataSet::DataSet *data_set) {
+  cout << "F" << index << ", " << name << endl;
+  print_counts(&counts);
+  int slot = 0;
+  while(slot <= data_set->categories_size()) {
+    cout << "C" << slot << ":";
+    print_counts(category_counts + slot);
+    slot++;
+  }
+}

data/lib/quarry/src/data_set/features/numeric_feature.h ADDED Viewed

@@ -0,0 +1,78 @@
+#ifndef __numeric_feature_h__
+#define __numeric_feature_h__
+#include "data_set/example.h"
+#include "data_set/feature.h"
+#include <iostream>
+namespace DataSet {
+  class DataSet;
+  // A feature holding real valued data. Keeps summary statistics (non zero
+  // count, sum, squared sum, min, max, mean, variance) for the whole data set
+  // and per category, plus a list of the examples whose value is non zero.
+  // The per category table is a malloc family buffer owned by this object.
+  // NOTE(review): this header uses memset, free and vector but includes none
+  // of <string.h>, <stdlib.h> or <vector> itself; presumably they arrive
+  // through data_set/example.h or data_set/feature.h. confirm.
+  class NumericFeature : public Feature {
+  public:
+    NumericFeature(string name, int index) : Feature(name, index), category_counts(NULL), non_zero_examples() {
+      reset();
+    }
+    // releases the per category table, which previously leaked when a feature
+    // was destroyed without a prior reset(). instances must not be copied by
+    // value: the implicit copy would share category_counts and free it twice
+    ~NumericFeature() {
+      free(category_counts);
+    }
+    // returns a new heap allocated feature with the same name and index and
+    // empty statistics; the caller owns it
+    NumericFeature *clone() {
+      return new NumericFeature(name, index);
+    }
+    // zeroes the overall counts, frees the per category table and empties the
+    // example index
+    void reset() {
+      memset(&counts, 0, sizeof(Counts));
+      // free(NULL) is a no-op, so no check is needed
+      free(category_counts);
+      category_counts = NULL;
+      non_zero_examples.clear();
+    }
+    void prepare_for_counting(DataSet *data_set);
+    void count_example(double value, int category_index);
+    void finalise_counting(DataSet *data_set);
+    void prepare_for_indexing(DataSet *data_set);
+    void index_example(double value, Example *example);
+    void finalise_indexing(DataSet *data_set);
+    // counts
+    typedef struct {
+      int    non_zero_count;
+      double sum;
+      double sq_sum;
+      double min;
+      double max;
+      double mean;
+      double variance;
+    } Counts;
+    // counts for this feature over the entire data set
+    Counts counts;
+    int    non_zero_count()                   { return counts.non_zero_count; }
+    double sum()                              { return counts.sum; }
+    double sq_sum()                           { return counts.sq_sum; }
+    double min()                              { return counts.min; }
+    double max()                              { return counts.max; }
+    double mean()                             { return counts.mean; }
+    double variance()                         { return counts.variance; }
+    // counts for this feature per category; NULL until prepare_for_counting
+    // has run, and the accessors below do not check the pointer or the index
+    Counts *category_counts;
+    int    category_non_zero_count(int index) { return category_counts[index].non_zero_count; }
+    double category_sum(int index)            { return category_counts[index].sum; }
+    double category_sq_sum(int index)         { return category_counts[index].sq_sum; }
+    double category_min(int index)            { return category_counts[index].min; }
+    double category_max(int index)            { return category_counts[index].max; }
+    double category_mean(int index)           { return category_counts[index].mean; }
+    double category_variance(int index)       { return category_counts[index].variance; }
+    void print(DataSet *data_set);
+    // writes the seven fields of c to cout, separated by ";", followed by a newline
+    void print_counts(Counts *c) {
+      cout << c->non_zero_count << ";" << c->sum << ";" << c->sq_sum << ";" << c->min << ";" << c->max << ";" << c->mean << ";" << c->variance << endl;
+    }
+    // indexes
+    vector<Example *> non_zero_examples;
+  };
+}
+#endif

data/lib/quarry/src/data_set/sparse/sparse_data_set.h ADDED Viewed

@@ -0,0 +1,40 @@
+#ifndef __sparse_data_set_h__
+#define __sparse_data_set_h__
+#include "data_set/data_set.h"
+#include "sparse_example.h"
+namespace DataSet {
+  // A data set whose examples are SparseExample instances, i.e. examples that
+  // store (feature index, value) pairs rather than one value per feature.
+  class SparseDataSet : public DataSet {
+    // passes every stored (index, value) pair of every example to the
+    // count_example of the feature with that index, together with the
+    // example's value for the category feature cast to an int
+    void perform_count() {
+      for(vector<Example *>::iterator it = examples.begin(); it != examples.end(); ++it) {
+        Example *current = *it;
+        int example_category_index = (int)(current->get_value(category_index));
+        for(int slot = 0; slot < current->size; slot++) {
+          SparseExample::Value &entry = ((SparseExample *)current)->values[slot];
+          features[entry.index]->count_example(entry.value, example_category_index);
+        }
+      }
+    }
+    // nothing is indexed for sparse data sets
+    void perform_index() {
+    }
+  public:
+    SparseDataSet() : DataSet() {}
+    SparseDataSet(DataSet *other) : DataSet(other) {}
+    // returns a new heap allocated data set built from this one through the
+    // DataSet(DataSet *) constructor; the caller owns it
+    SparseDataSet *clone_without_examples() {
+      SparseDataSet *copy = new SparseDataSet(this);
+      return copy;
+    }
+    // creates an example with room for buffer_size values, appends it to
+    // examples and returns it
+    SparseExample *new_example(int buffer_size = 0) {
+      SparseExample *created = new SparseExample(buffer_size);
+      examples.push_back(created);
+      return created;
+    }
+  };
+}
+#endif

data/lib/quarry/src/data_set/sparse/sparse_example.cpp ADDED Viewed

@@ -0,0 +1,82 @@
+#include "sparse_data_set.h"
+#include "sparse_example.h"
+#include <stdlib.h>
+// Returns the value stored for feature_index, or 0.0 when this example holds
+// no entry for it. The lookup is a binary search, so values must be sorted by
+// ascending index.
+double DataSet::SparseExample::get_value(int feature_index) {
+  // fast path for feature 0, which sorts first whenever it is stored. the
+  // index check stops an example that does not store feature 0 from
+  // returning the value of whichever feature happens to occupy slot 0
+  if(feature_index == 0 && size != 0 && values[0].index == 0)
+    return values[0].value;
+  int low = 0;
+  int high = size - 1;
+  int mid = high / 2;
+  // branch prediction makes this triple clause if statement faster
+  // than a double clause "single comparison" search. precondition
+  // loops also seem to be faster than post condition loops in GCC,
+  // really don't know why... this implementation ends up being
+  // around 30% faster than well known single comparison versions.
+  while(low <= high) {
+    if(values[mid].index < feature_index) {
+      low = mid + 1;
+    } else if(values[mid].index > feature_index) {
+      high = mid - 1;
+    } else {
+      return values[mid].value;
+    }
+    mid = (high + low) / 2;
+  }
+  // not stored, which a sparse example treats as zero
+  return 0.0;
+}
+// Looks the feature up by name in data_set and returns this example's value
+// for it. The result of get_feature_by_name is dereferenced without a check.
+double DataSet::SparseExample::get_value(string feature_name, SparseDataSet *data_set) {
+  int feature_index = data_set->get_feature_by_name(feature_name)->index;
+  return get_value(feature_index);
+}
+// Stores new_value for feature_index, keeping values sorted by ascending
+// index: an existing entry is overwritten, otherwise a new entry is inserted
+// at its sorted position.
+void DataSet::SparseExample::set_value(int feature_index, double new_value) {
+  // find the entry for feature_index, or the first entry with a larger index
+  int i = 0;
+  for(; i < size; i++) {
+    if(values[i].index == feature_index) {
+      values[i].value = new_value;
+      return;
+    } else if(values[i].index > feature_index) {
+      break;
+    }
+  }
+  // grow the buffer by one slot when it is full
+  if(buffer_size == size)
+    values = (Value *) realloc(values, sizeof(Value) * (++buffer_size));
+  // shift the tail one slot to the right. source and destination overlap, so
+  // this has to be memmove: memcpy is undefined for overlapping ranges
+  if(i != size)
+    memmove(&values[i + 1], &values[i], (size - i) * sizeof(Value));
+  values[i].index = feature_index;
+  values[i].value = new_value;
+  size++;
+}
+// Adds an entry after the last stored one without searching or sorting; the
+// buffer grows by one slot when it is full. Keeping the entries in ascending
+// index order is left to the caller.
+void DataSet::SparseExample::append_value(int feature_index, double new_value) {
+  if(size == buffer_size) {
+    buffer_size += 1;
+    values = (Value *) realloc(values, buffer_size * sizeof(Value));
+  }
+  Value &slot = values[size];
+  slot.index = feature_index;
+  slot.value = new_value;
+  size += 1;
+}
+// Stub: no distance is computed, the result is always 0.0.
+double DataSet::SparseExample::euclidean_distance(Example *other_example) {
+  // other_example is not used
+  return 0.0;
+}
+// Stub: no distance is computed, the result is always 0.0.
+double DataSet::SparseExample::cosine_distance(Example *other_example) {
+  // other_example is not used
+  return 0.0;
+}
+// Writes the stored entries to cout as "index:value" pairs separated by
+// commas, followed by a newline.
+void DataSet::SparseExample::print() {
+  int i = 0;
+  while(i < size) {
+    // the separator goes in front of every entry except the first
+    if(i > 0)
+      cout << ",";
+    cout << values[i].index << ":" << values[i].value;
+    i++;
+  }
+  cout << endl;
+}

data/lib/quarry/src/data_set/sparse/sparse_example.h ADDED Viewed

@@ -0,0 +1,38 @@
+#ifndef __sparse_data_set_example_h__
+#define __sparse_data_set_example_h__
+#include "data_set/example.h"
+#include <stdlib.h>
+#include <string>
+using namespace std;
+namespace DataSet {
+  class SparseDataSet;
+  // An example that stores only some of its feature values, as an array of
+  // (feature index, value) pairs. get_value treats a feature without an entry
+  // as 0.0. The values buffer comes from calloc/realloc and is owned by this
+  // object.
+  class SparseExample : public Example {
+  public:
+    // one stored entry: the index of a feature and this example's value for it
+    typedef struct {
+      int     index;
+      double  value;
+    } Value;
+    // the stored entries; NULL while buffer_size is 0
+    Value *values;
+    // number of Value slots allocated in values
+    int   buffer_size;
+    // creates an example with no entries and room for buffer_size of them
+    SparseExample(int buffer_size = 0) : Example(0), buffer_size(buffer_size) {
+      // calloc takes the element count first and the element size second
+      if(buffer_size > 0)
+        values = (Value *) calloc(buffer_size, sizeof(Value));
+      else
+        values = NULL;
+    }
+    // releases the values buffer, which previously leaked whenever an example
+    // was deleted. instances must not be copied by value: the implicit copy
+    // would share the buffer and free it twice
+    ~SparseExample() {
+      free(values);
+    }
+    double get_value(int feature_index);
+    double get_value(string feature_name, SparseDataSet *data_set);
+    void set_value(int feature_index, double new_value);
+    void append_value(int feature_index, double new_value);
+    double euclidean_distance(Example *other_example);
+    double cosine_distance(Example *other_example);
+    void print();
+  };
+}
+#endif

data/lib/quarry/src/metrics/confusion_matrix.cpp ADDED Viewed

@@ -0,0 +1,129 @@
+#include "confusion_matrix.h"
+#include <iostream>
+const string ConfusionMatrix::average_row_name = "Average";
+// TODO: CM should reference a classifier, not a data set
+// Builds an empty matrix with one row and one column per category of
+// data_set, every cell set to zero. The initialisers are listed in member
+// declaration order, which is the order they run in regardless.
+ConfusionMatrix::ConfusionMatrix(DataSet::DataSet *data_set) : data_set(data_set), correct(0), incorrect(0) {
+  const int categories = data_set->categories_size();
+  counts.assign(categories, valarray<int>(0, categories));
+}
+// Records one classification: bumps the cell in row predicted, column actual,
+// and the matching correct or incorrect tally. Neither argument is range
+// checked.
+void ConfusionMatrix::add(int predicted, int actual) {
+  // category indexes are 1 based, matrix cells are 0 based
+  valarray<int> &row = counts[predicted - 1];
+  row[actual - 1] += 1;
+  if(predicted != actual)
+    incorrect++;
+  else
+    correct++;
+}
+// Fraction of recorded classifications that were correct, in [0, 1]. Returns
+// 0.0 when nothing has been recorded, the same convention precision, recall
+// and fscore use for a zero denominator, instead of the NaN that 0/0 gives.
+double ConfusionMatrix::accuracy() {
+  int total = correct + incorrect;
+  if(total == 0)
+    return 0.0;
+  return ((double)correct) / total;
+}
+// Fraction of recorded classifications that were incorrect, in [0, 1].
+// Returns 0.0 when nothing has been recorded, the same convention precision,
+// recall and fscore use for a zero denominator, instead of the NaN that 0/0
+// gives.
+double ConfusionMatrix::error() {
+  int total = correct + incorrect;
+  if(total == 0)
+    return 0.0;
+  return ((double)incorrect) / total;
+}
+// true positive
+// Number of examples of category (1 based) that were predicted as category:
+// the diagonal cell.
+int ConfusionMatrix::tp(int category) {
+  const int cell = category - 1;
+  return counts[cell][cell];
+}
+// false positive
+// Number of examples predicted as category (1 based) that belong to another
+// category: the row total without the diagonal cell.
+int ConfusionMatrix::fp(int category) {
+  const int predicted_as_category = counts[category - 1].sum();
+  return predicted_as_category - tp(category);
+}
+// true negative
+// Number of examples that neither belong to category (1 based) nor were
+// predicted as it: every cell outside that category's row and column.
+int ConfusionMatrix::tn(int category) {
+  const int categories = data_set->categories_size();
+  int total = 0;
+  for(int predicted = 1; predicted <= categories; predicted++) {
+    if(predicted == category)
+      continue;
+    for(int actual = 1; actual <= categories; actual++) {
+      if(actual == category)
+        continue;
+      total += counts[predicted - 1][actual - 1];
+    }
+  }
+  return total;
+}
+// false negative
+// Number of examples of category (1 based) that were predicted as another
+// category: the column total without the diagonal cell.
+int ConfusionMatrix::fn(int category) {
+  const int categories = data_set->categories_size();
+  const int column = category - 1;
+  int total = 0;
+  for(int predicted = 1; predicted <= categories; predicted++) {
+    if(predicted == category)
+      continue;
+    total += counts[predicted - 1][column];
+  }
+  return total;
+}
+// tp / (tp + fp) for category, or 0.0 when nothing was predicted as category.
+double ConfusionMatrix::precision(int category) {
+  const int hits = tp(category);
+  const int predicted_total = hits + fp(category);
+  return predicted_total == 0 ? 0.0 : ((double)hits) / predicted_total;
+}
+// tp / (tp + fn) for category, or 0.0 when no example of category was seen.
+double ConfusionMatrix::recall(int category) {
+  const int hits = tp(category);
+  const int actual_total = hits + fn(category);
+  return actual_total == 0 ? 0.0 : ((double)hits) / actual_total;
+}
+// 2pr / (p + r) with p the precision and r the recall of category, or 0.0
+// when both are zero.
+double ConfusionMatrix::fscore(int category) {
+  const double p = precision(category);
+  const double r = recall(category);
+  const double denom = p + r;
+  if(denom == 0.0)
+    return 0.0;
+  return (2 * p * r) / denom;
+}
+// Prints a two part report to cout: overall correct / incorrect / total
+// counts with percentages, then a table with one row per category and a
+// closing average row. The precision of 4 and the right alignment set here
+// stay in effect on cout after the call.
+void ConfusionMatrix::print_summary() {
+  const int categories = data_set->categories_size();
+  // overall counts and summary
+  cout.precision(4);
+  cout << "== Summary ==" << endl;
+  cout << setw(23) << "Correctly classified:" << setw(12) << right << correct << setw(10) << right << accuracy() * 100 << "%" << endl;
+  cout << setw(23) << "Incorrectly classified:" << setw(12) << right << incorrect << setw(10) << right << error() * 100 << "%" << endl;
+  cout << setw(23) << "Total classifications:" << setw(12) << right << correct + incorrect << endl << endl;
+  // the left column is one wider than the longest label, counting the
+  // category names and the label of the average row
+  int label_width = (int)average_row_name.length();
+  for(int category = 1; category <= categories; category++) {
+    const int length = (int)data_set->category_feature()->names[category].length();
+    if(length > label_width)
+      label_width = length;
+  }
+  label_width += 1;
+  // detailed category information
+  static const char *const column_titles[] = { "True +", "False +", "True -", "False -", "Precis.", "Recall", "F-score" };
+  const int column_count = sizeof(column_titles) / sizeof(column_titles[0]);
+  cout << "== Category Performance ==" << endl;
+  cout << setw(label_width) << "";
+  for(int column = 0; column < column_count; column++)
+    cout << setw(9) << right << column_titles[column];
+  cout << endl;
+  // one row per category; percentages are one narrower to leave room for "%"
+  for(int category = 1; category <= categories; category++) {
+    cout << setw(label_width) << data_set->category_feature()->names[category];
+    cout << setw(9) << tp(category) << setw(9) << fp(category);
+    cout << setw(9) << tn(category) << setw(9) << fn(category);
+    cout << setw(8) << precision(category) * 100 << "%";
+    cout << setw(8) << recall(category) * 100 << "%";
+    cout << setw(8) << fscore(category) * 100 << "%" << endl;
+  }
+  // averages over all categories
+  cout << setw(label_width) << average_row_name;
+  cout << setw(9) << avg_tp() << setw(9) << avg_fp();
+  cout << setw(9) << avg_tn() << setw(9) << avg_fn();
+  cout << setw(8) << avg_precision() * 100 << "%";
+  cout << setw(8) << avg_recall() * 100 << "%";
+  cout << setw(8) << avg_fscore() * 100 << "%" << endl;
+}

data/lib/quarry/src/metrics/confusion_matrix.h ADDED Viewed

@@ -0,0 +1,82 @@
+#ifndef __confusion_matrix__
+#define __confusion_matrix__
+#include "data_set/data_set.h"
+#include <vector>
+#include <valarray>
+#include <iostream>
+#include <iomanip>
+using namespace std;
+namespace DataSet {
+  class Category;
+}
+// Square table of classification outcomes. counts[p - 1][a - 1] is the number
+// of examples with actual category a that were predicted as category p
+// (category indexes are 1 based). Also keeps running totals of correct and
+// incorrect classifications and derives per category and averaged metrics.
+class ConfusionMatrix {
+public:
+  // label of the closing row printed by print_summary
+  static const string average_row_name;
+  // supplies the number of categories and their names; not owned
+  DataSet::DataSet        *data_set;
+  vector<valarray<int> >  counts;
+  int                     correct;
+  int                     incorrect;
+  ConfusionMatrix(DataSet::DataSet *data_set);
+  // adds the totals and every cell of other to this matrix. other must have
+  // at least as many rows as this matrix, each of the same length; neither is
+  // checked
+  void merge(ConfusionMatrix *other) {
+    incorrect += other->incorrect;
+    correct += other->correct;
+    for(unsigned int i = 0; i < counts.size(); i++)
+      counts[i] += other->counts[i];
+  }
+  void add(int predicted, int actual);
+  double accuracy();
+  double error();
+  int tp(int category);
+  int fp(int category);
+  int tn(int category);
+  int fn(int category);
+  double precision(int category);
+  double recall(int category);
+  double fscore(int category);
+  void print_summary();
+  // averages of the per category metrics over all categories
+  double avg_tp() {
+    return apply<int>(&ConfusionMatrix::tp);
+  }
+  double avg_fp() {
+    return apply<int>(&ConfusionMatrix::fp);
+  }
+  double avg_tn() {
+    return apply<int>(&ConfusionMatrix::tn);
+  }
+  double avg_fn() {
+    return apply<int>(&ConfusionMatrix::fn);
+  }
+  double avg_precision() {
+    return apply<double>(&ConfusionMatrix::precision);
+  }
+  double avg_recall() {
+    return apply<double>(&ConfusionMatrix::recall);
+  }
+  double avg_fscore() {
+    return apply<double>(&ConfusionMatrix::fscore);
+  }
+protected:
+  // sums the member function func over categories 1..categories_size() in an
+  // accumulator of type T and divides by the number of rows. returns 0.0 for
+  // a matrix without rows, the same convention precision and recall use for a
+  // zero denominator, instead of the NaN that 0/0 gives
+  template <class T, class Function>
+  double apply(Function func) {
+    if(counts.empty())
+      return 0.0;
+    T result = T();
+    for(int category = 1; category <= data_set->categories_size(); category++)
+      result += (this->*func)(category);
+    return result / ((double)counts.size());
+  }
+};
+#endif

data/lib/quarry/src/model/model.cpp ADDED Viewed

@@ -0,0 +1,29 @@
+#include "model.h"
+// Stub: training on a single example is not implemented.
+void Model::Model::train(DataSet::Example *example) {
+  // intentionally empty
+}
+// Stub: training on raw text is not implemented.
+void Model::Model::train_text(string text) {
+  // intentionally empty
+}
+// Returns whatever the classifier's classify returns for example. A
+// classifier must have been set; the pointer is not checked.
+int Model::Model::classify(DataSet::Example *example) {
+  int category = classifier->classify(example);
+  return category;
+}
+// Turns text into a sparse example with the text pipeline, classifies it and
+// returns the classifier's result. The temporary example is deleted before
+// returning. data_set, classifier and text_pipeline must have been set; none
+// of the pointers is checked.
+int Model::Model::classify_text(string text) {
+  // process_text takes a mutable char *. writing through the result of
+  // c_str() is undefined, so the pipeline gets its own NUL terminated copy
+  vector<char> buffer(text.begin(), text.end());
+  buffer.push_back('\0');
+  DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, &buffer[0]);
+  int category = classifier->classify(example);
+  delete example;
+  return category;
+}
+// Returns whatever the classifier's rank returns for example. A classifier
+// must have been set; the pointer is not checked.
+vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
+  vector<Classifier::Score> *ranks = classifier->rank(example);
+  return ranks;
+}
+// Turns text into a sparse example with the text pipeline and returns the
+// classifier's ranking for it. The temporary example is deleted before
+// returning. data_set, classifier and text_pipeline must have been set; none
+// of the pointers is checked.
+vector<Classifier::Score> *Model::Model::rank_text(string text) {
+  // process_text takes a mutable char *. writing through the result of
+  // c_str() is undefined, so the pipeline gets its own NUL terminated copy
+  vector<char> buffer(text.begin(), text.end());
+  buffer.push_back('\0');
+  DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, &buffer[0]);
+  vector<Classifier::Score> *ranks = classifier->rank(example);
+  delete example;
+  return ranks;
+}

data/lib/quarry/src/model/model.h ADDED Viewed

@@ -0,0 +1,50 @@
+#ifndef __model_h__
+#define __model_h__
+#include "data_set/data_set.h"
+#include "data_set/example.h"
+#include "classifier/classifier.h"
+#include "preprocessing/text/text_pipeline.h"
+namespace Model {
+  // Ties together the three parts needed to classify: a data set, a
+  // classifier and a text pipeline. All three are plain pointers that start
+  // as NULL; this class neither allocates nor deletes them.
+  class Model {
+  public:
+    DataSet::DataSet *data_set;
+    Classifier::Classifier *classifier;
+    Preprocessing::Text::TextPipeline *text_pipeline;
+    // every part starts unset
+    Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
+    // training, for a prepared example or for raw text
+    void train(DataSet::Example *example);
+    void train_text(string text);
+    // classification, for a prepared example or for raw text
+    int classify(DataSet::Example *example);
+    int classify_text(string text);
+    // ranking, for a prepared example or for raw text
+    vector<Classifier::Score> *rank(DataSet::Example *example);
+    vector<Classifier::Score> *rank_text(string example);
+    // data set accessors
+    DataSet::DataSet *get_data_set() { return data_set; }
+    void set_data_set(DataSet::DataSet *ds) { data_set = ds; }
+    // classifier accessors
+    Classifier::Classifier *get_classifier() { return classifier; }
+    void set_classifier(Classifier::Classifier *c) { classifier = c; }
+    // text pipeline accessors
+    Preprocessing::Text::TextPipeline *get_text_pipeline() { return text_pipeline; }
+    void set_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) { text_pipeline = pipeline; }
+  };
+}
+#endif

data/lib/quarry/src/preprocessing/examples/example_preprocessor.h ADDED Viewed

@@ -0,0 +1,20 @@
+#ifndef __example_preprocessor_h__
+#define __example_preprocessor_h__
+#include "data_set/example.h"
+namespace Preprocessing {
+  namespace Examples {
+    // Base class for operations applied to examples one at a time.
+    // Subclasses override process; process_data_set applies it to every
+    // example of a data set.
+    // NOTE(review): this header uses DataSet::DataSet and vector but only
+    // includes data_set/example.h; presumably data_set/data_set.h is pulled
+    // in by whoever includes this file. confirm.
+    class ExamplePreprocessor {
+    public:
+      // the class has a virtual function, so it needs a virtual destructor
+      // for subclasses to be deleted safely through a base pointer
+      virtual ~ExamplePreprocessor() {}
+      // hook applied to a single example; does nothing by default
+      virtual void process(DataSet::Example *example) {}
+      // calls process on each example of data_set, in stored order
+      void process_data_set(DataSet::DataSet *data_set) {
+        for(vector<DataSet::Example *>::iterator example = data_set->examples.begin(); example != data_set->examples.end(); ++example)
+          process(*example);
+      }
+    };
+  }
+}
+#endif