thera 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
#include "binary.h"
|
|
2
|
+
#include <stdexcept>
|
|
3
|
+
#include <vector>
|
|
4
|
+
|
|
5
|
+
// ------------------------------------------
|
|
6
|
+
// integer 'magic marks' are used to identify
|
|
7
|
+
// binary files, delineate sections of the
|
|
8
|
+
// file, and can be used to test endianess
|
|
9
|
+
// ------------------------------------------
|
|
10
|
+
static const uint32_t file_mark = 'quar';
|
|
11
|
+
static const uint32_t none_mark = 'none';
|
|
12
|
+
static const uint32_t classifier_mark = 'clas';
|
|
13
|
+
static const uint32_t text_pipeline_mark = 'texp';
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
// ------------------------------------------
|
|
17
|
+
// low level read and write operations
|
|
18
|
+
// ------------------------------------------
|
|
19
|
+
// Writes a native-endian int to the stream as raw bytes (matched by read_int).
void Storage::Binary::write_int(int number) {
    const char *bytes = reinterpret_cast<const char *>(&number);
    file.write(bytes, sizeof(number));
}
|
|
22
|
+
|
|
23
|
+
// Reads a raw native-endian int previously written by write_int.
// Returns 0 if the stream has no more data.
int Storage::Binary::read_int() {
    int result = 0;
    file.read(reinterpret_cast<char *>(&result), sizeof(result));
    return result;
}
|
|
28
|
+
|
|
29
|
+
// Writes a 4-byte section marker used to delineate and sanity-check the file.
void Storage::Binary::write_mark(uint32_t mark) {
    const char *bytes = reinterpret_cast<const char *>(&mark);
    file.write(bytes, sizeof(mark));
}
|
|
32
|
+
|
|
33
|
+
// Reads a 4-byte section marker written by write_mark.
// Returns 0 if the stream has no more data.
uint32_t Storage::Binary::read_mark() {
    uint32_t result = 0;
    file.read(reinterpret_cast<char *>(&result), sizeof(result));
    return result;
}
|
|
38
|
+
|
|
39
|
+
// Serialises a bool as a single byte: 1 for true, 0 for false.
void Storage::Binary::write_bool(bool value) {
    const char byte = value ? 1 : 0;
    file.write(&byte, sizeof(byte));
}
|
|
43
|
+
|
|
44
|
+
// Reads a single-byte bool written by write_bool; any non-zero byte is true.
bool Storage::Binary::read_bool() {
    char byte = 0;
    file.read(&byte, sizeof(byte));
    return byte != 0;
}
|
|
49
|
+
|
|
50
|
+
// Writes the string followed by its NUL terminator; the terminator is the
// delimiter read_string stops at. Embedded NULs would truncate on read.
void Storage::Binary::write_string(string str) {
    file.write(str.c_str(), static_cast<std::streamsize>(str.size() + 1));
}
|
|
53
|
+
|
|
54
|
+
// Reads characters up to (and consuming) the next NUL terminator,
// reversing write_string.
string Storage::Binary::read_string() {
    string result;
    std::getline(file, result, '\0');
    return result;
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
// ------------------------------------------
|
|
62
|
+
// data set
|
|
63
|
+
// ------------------------------------------
|
|
64
|
+
// Reads a complete data set from the binary stream: header, features
// (with optional cached counts), then optional examples. The layout must
// mirror write_data_set exactly. Returns a newly-allocated data set owned
// by the caller.
// NOTE(review): raw structs and vectors are read as native-endian bytes, so
// files are only portable between machines with the same layout/endianness.
DataSet::DataSet *Storage::Binary::read_data_set() {
    DataSet::DataSet *data_set = NULL;
    bool sparse = read_bool();

    // determine the type of data set to create
    if(sparse)
        data_set = new DataSet::SparseDataSet();
    else
        data_set = new DataSet::DenseDataSet();

    // initialise the data set
    data_set->name = read_string();
    data_set->category_index = read_int();
    data_set->counted = read_bool();
    data_set->indexed = read_bool();

    // initialise the data set's features
    DataSet::NominalFeature *nominal_feature;
    DataSet::NumericFeature *numeric_feature;
    int index = 0, count = 0;
    bool nominal = false;
    string name;

    // determine the number of features to read; count caches need to know
    // the number of categories up front
    int num_features = read_int();
    int num_categories = read_int();

    for(int i = 0; i < num_features; i++) {
        nominal = read_bool();
        index = read_int();
        name = read_string();

        if(nominal) {
            nominal_feature = data_set->new_nominal_feature(name);
            nominal_feature->index = index;

            // read the nominal category names (distinct loop variable: the
            // original shadowed the outer feature counter here)
            count = read_int();
            for(int j = 0; j < count; j++)
                nominal_feature->add_value(read_string());

            // read cached frequencies and probabilities if present.
            // read_vector returns a heap-allocated vector: copy it into the
            // feature then delete it (the original leaked one vector per call)
            if(data_set->counted) {
                vector<int> *frequencies = read_vector<int>();
                nominal_feature->frequencies = *frequencies;
                delete frequencies;

                vector<double> *probabilities = read_vector<double>();
                nominal_feature->probabilities = *probabilities;
                delete probabilities;

                // slot 0 is unused; categories are indexed from 1
                nominal_feature->category_frequencies.resize(num_categories + 1);
                nominal_feature->category_probabilities.resize(num_categories + 1);

                for(int j = 1; j <= num_categories; j++) {
                    vector<int> *category_frequencies = read_vector<int>();
                    nominal_feature->category_frequencies[j] = *category_frequencies;
                    delete category_frequencies;
                }

                for(int j = 1; j <= num_categories; j++) {
                    vector<double> *category_probabilities = read_vector<double>();
                    nominal_feature->category_probabilities[j] = *category_probabilities;
                    delete category_probabilities;
                }
            }

            // TODO: read cached indexes
            if(data_set->indexed) {
            }

        } else {
            numeric_feature = data_set->new_numeric_feature(name);
            numeric_feature->index = index;

            // cached counts: raw struct images, matching write_data_set
            if(data_set->counted) {
                file.read((char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
                numeric_feature->category_counts = (DataSet::NumericFeature::Counts *) malloc(sizeof(DataSet::NumericFeature::Counts) * (num_categories + 1));
                for(int j = 1; j <= num_categories; j++)
                    file.read((char *)&(numeric_feature->category_counts[j]), sizeof(DataSet::NumericFeature::Counts));
            }

            // TODO: cached indexes
            if(data_set->indexed) {
            }
        }
    }

    // read examples if present
    should_write_examples = read_bool();
    if(should_write_examples) {
        int num_examples = read_int();

        if(sparse) {
            DataSet::SparseExample *sparse_example;

            for(int i = 0; i < num_examples; i++) {
                // number of non-zero values
                count = read_int();

                // construct & read the example
                sparse_example = ((DataSet::SparseDataSet *) data_set)->new_example(count);
                file.read((char *)sparse_example->values, count * sizeof(DataSet::SparseExample::Value));
                sparse_example->size = count;
            }

        } else {
            // each dense example stores the same number of values
            count = read_int();

            // read each example
            DataSet::DenseExample *dense_example;
            for(int i = 0; i < num_examples; i++) {
                dense_example = ((DataSet::DenseDataSet *) data_set)->new_example();
                file.read((char *)dense_example->values, count * sizeof(double));
            }
        }
    }

    return data_set;
}
|
|
174
|
+
|
|
175
|
+
// Writes a complete data set to the binary stream: header, features
// (with optional cached counts), then optionally all examples (controlled
// by should_write_examples). Layout must mirror read_data_set exactly.
void Storage::Binary::write_data_set(DataSet::DataSet *data_set) {
    bool sparse = (typeid(*data_set) == typeid(DataSet::SparseDataSet));
    int num_categories = data_set->categories_size();
    int num_features = data_set->features_size();
    int num_examples = data_set->examples_size();

    // data set header
    write_bool(sparse);
    write_string(data_set->name);
    write_int(data_set->category_index);
    write_bool(data_set->counted);
    write_bool(data_set->indexed);
    write_int(num_features);
    write_int(num_categories);

    // features
    DataSet::NominalFeature *nominal_feature;
    DataSet::NumericFeature *numeric_feature;
    DataSet::Feature *feature;
    uint32_t count = 0;
    bool nominal;

    for(int i = 0; i < num_features; i++) {
        feature = data_set->features[i];
        nominal = (typeid(*feature) == typeid(DataSet::NominalFeature));
        write_bool(nominal);
        write_int(feature->index);
        write_string(feature->name);

        if(nominal) {
            nominal_feature = (DataSet::NominalFeature *)feature;

            // category names: slot 0 is an unused sentinel so the stored
            // count excludes it (unsigned loop variable; the original
            // shadowed the outer counter and mixed signedness here)
            count = nominal_feature->names.size();
            write_int(count - 1);
            for(uint32_t j = 1; j < count; j++)
                write_string(nominal_feature->names.at(j));

            // cached counts
            if(data_set->counted) {
                write_vector<int>(&(nominal_feature->frequencies));
                write_vector<double>(&(nominal_feature->probabilities));

                for(int j = 1; j <= num_categories; j++)
                    write_vector<int>(&(nominal_feature->category_frequencies.at(j)));

                for(int j = 1; j <= num_categories; j++)
                    write_vector<double>(&(nominal_feature->category_probabilities.at(j)));
            }

            // TODO: cached indexes
            if(data_set->indexed) {
            }

        } else {
            numeric_feature = (DataSet::NumericFeature *)feature;

            // cached counts: raw struct images, matching read_data_set
            if(data_set->counted) {
                file.write((const char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
                for(int j = 1; j <= num_categories; j++)
                    file.write((const char *)&(numeric_feature->category_counts[j]), sizeof(DataSet::NumericFeature::Counts));
            }

            // TODO: cached indexes
            if(data_set->indexed) {
            }
        }
    }

    // examples
    write_bool(should_write_examples);
    if(should_write_examples) {
        write_int(num_examples);

        if(sparse) {
            DataSet::SparseExample *example;
            for(int i = 0; i < num_examples; i++) {
                example = (DataSet::SparseExample *) data_set->examples[i];
                count = example->size;
                write_int(count);
                file.write((char *)(example->values), count * sizeof(DataSet::SparseExample::Value));
            }

        } else {
            // each dense example stores the same number of values; guard the
            // empty case (the original indexed examples[0] unconditionally,
            // which is out-of-bounds when there are no examples)
            count = (num_examples > 0) ? data_set->examples[0]->size : 0;
            write_int(count);

            // write each example
            for(int i = 0; i < num_examples; i++)
                file.write((char *)((DataSet::DenseExample *)data_set->examples[i])->values, count * sizeof(double));
        }
    }
}
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
// ------------------------------------------
|
|
273
|
+
// classifiers
|
|
274
|
+
// ------------------------------------------
|
|
275
|
+
// Reads the classifier section. Returns NULL if the file recorded no
// classifier; throws if the section marker is not recognised. The returned
// classifier is newly allocated and owned by the caller.
Classifier::Classifier *Storage::Binary::read_classifier(DataSet::DataSet *data_set) {
    uint32_t section = read_mark();
    if(section == none_mark)
        return NULL;
    if(section != classifier_mark)
        throw runtime_error("Expected classifier section");

    // instantiate the concrete classifier identified by its type mark;
    // unknown types yield NULL
    Classifier::Classifier *classifier = NULL;
    uint32_t type = read_mark();
    if(type == Classifier::NaiveBayesClassifier::file_mark)
        classifier = new Classifier::NaiveBayesClassifier(data_set);

    // let the classifier deserialise its own state
    if(classifier)
        classifier->read_binary(this);
    return classifier;
}
|
|
295
|
+
|
|
296
|
+
// Writes the classifier section: a 'none' marker when there is no
// classifier, otherwise the section marker, the classifier's type mark, and
// its serialised state.
void Storage::Binary::write_classifier(Classifier::Classifier *classifier) {
    if(classifier == NULL) {
        write_mark(none_mark);
        return;
    }

    write_mark(classifier_mark);
    write_mark(classifier->mark());
    classifier->write_binary(this);
}
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
// ------------------------------------------
|
|
308
|
+
// text pipeline
|
|
309
|
+
// ------------------------------------------
|
|
310
|
+
// Reads the text pipeline section: tokeniser, inplace processors, token
// selectors, and example generator, each identified by a 4-byte type mark.
// Returns NULL if the file recorded no pipeline; throws if the section
// marker is not recognised. The returned pipeline is owned by the caller.
// NOTE(review): unrecognised type marks are silently skipped, leaving the
// corresponding pipeline slot NULL/empty — confirm this is intended.
Preprocessing::Text::TextPipeline *Storage::Binary::read_text_pipeline() {
    uint32_t mark = read_mark();
    if(mark == none_mark)
        return NULL;
    else if(mark != text_pipeline_mark)
        throw runtime_error("Expected text pipeline section");

    Preprocessing::Text::TextPipeline *pipeline = new Preprocessing::Text::TextPipeline();

    // tokeniser: a single type mark
    switch(read_mark()) {
        case Preprocessing::Text::SimpleTokeniser::file_mark:
            pipeline->tokeniser = new Preprocessing::Text::SimpleTokeniser(pipeline);
            break;
    }

    // inplace processors: a count followed by one type mark each
    int count = read_int();
    for(int i = 0; i < count; i++) {
        switch(read_mark()) {
            case Preprocessing::Text::Downcase::file_mark:
                pipeline->processors.push_back(new Preprocessing::Text::Downcase());
                break;
            case Preprocessing::Text::PorterStemmer::file_mark:
                pipeline->processors.push_back(new Preprocessing::Text::PorterStemmer());
                break;
        }
    }

    // token selectors: same layout as processors
    count = read_int();
    for(int i = 0; i < count; i++) {
        switch(read_mark()) {
            case Preprocessing::Text::StopWords::file_mark:
                pipeline->selectors.push_back(new Preprocessing::Text::StopWords());
                break;
            case Preprocessing::Text::POSTagSelector::file_mark:
                pipeline->selectors.push_back(new Preprocessing::Text::POSTagSelector());
                break;
        }
    }

    // example generator: a single type mark
    switch(read_mark()) {
        case Preprocessing::Text::TokenCounter::file_mark:
            pipeline->generator = new Preprocessing::Text::TokenCounter();
            break;
    }

    return pipeline;
}
|
|
361
|
+
|
|
362
|
+
// Writes the text pipeline section: a 'none' marker when there is no
// pipeline, otherwise the section marker followed by the type marks of the
// tokeniser, each processor, each selector, and the example generator
// (mirrored by read_text_pipeline).
void Storage::Binary::write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
    if(pipeline == NULL) {
        write_mark(none_mark);
        return;
    }
    write_mark(text_pipeline_mark);

    // tokeniser
    write_mark(pipeline->tokeniser->mark());

    // inplace processors: count, then one mark per processor
    int processor_count = pipeline->processors.size();
    write_int(processor_count);
    for(int i = 0; i < processor_count; i++)
        write_mark(pipeline->processors[i]->mark());

    // token selectors: same layout
    int selector_count = pipeline->selectors.size();
    write_int(selector_count);
    for(int i = 0; i < selector_count; i++)
        write_mark(pipeline->selectors[i]->mark());

    // example generator
    write_mark(pipeline->generator->mark());
}
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
// ------------------------------------------
|
|
391
|
+
// helpers
|
|
392
|
+
// ------------------------------------------
|
|
393
|
+
void Storage::Binary::open_for_reading() {
|
|
394
|
+
// open file
|
|
395
|
+
file.open(path.c_str(), fstream::in | fstream::binary);
|
|
396
|
+
|
|
397
|
+
// ensure file is ok for reading
|
|
398
|
+
if(!file.good())
|
|
399
|
+
throw runtime_error("Error opening binary file for reading");
|
|
400
|
+
|
|
401
|
+
// quick sanity check
|
|
402
|
+
if(read_mark() != file_mark)
|
|
403
|
+
throw runtime_error("Binary file mark is invalid");
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// Opens (creating or truncating) the file at `path` for binary writing and
// emits the leading file mark that open_for_reading later validates.
// Throws runtime_error if the file cannot be opened.
void Storage::Binary::open_for_writing() {
    file.open(path.c_str(), fstream::out | fstream::binary);
    if(!file.good())
        throw runtime_error("Error opening binary file for writing");

    // write the file marker so reads can test the file format
    write_mark(file_mark);
}
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
// ------------------------------------------
|
|
420
|
+
// public read & write methods
|
|
421
|
+
// ------------------------------------------
|
|
422
|
+
// Public entry point: opens the file, reads a single data set, and closes
// the file. The returned data set is owned by the caller.
DataSet::DataSet *Storage::Binary::read() {
    open_for_reading();
    DataSet::DataSet *result = read_data_set();
    file.close();
    return result;
}
|
|
428
|
+
|
|
429
|
+
// Public entry point: opens the file, writes a single data set, and closes
// the file.
void Storage::Binary::write(DataSet::DataSet *data_set) {
    open_for_writing();
    write_data_set(data_set);
    file.close();
}
|
|
434
|
+
|
|
435
|
+
// Public entry point: reads a full model (data set, classifier, text
// pipeline — in that order, matching write_model). The returned model and
// its components are owned by the caller.
Model::Model *Storage::Binary::read_model() {
    open_for_reading();

    Model::Model *model = new Model::Model();
    model->data_set = read_data_set();
    // the classifier needs the data set it was trained on
    model->classifier = read_classifier(model->data_set);
    model->text_pipeline = read_text_pipeline();

    file.close();
    return model;
}
|
|
447
|
+
|
|
448
|
+
// Public entry point: writes a full model (data set, classifier, text
// pipeline — in that order, matching read_model).
void Storage::Binary::write_model(Model::Model *model) {
    open_for_writing();

    write_data_set(model->data_set);
    write_classifier(model->classifier);
    write_text_pipeline(model->text_pipeline);

    file.close();
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#ifndef __binary_h__
|
|
2
|
+
#define __binary_h__
|
|
3
|
+
#include "storage/storage.h"
|
|
4
|
+
#include "data_set/dense/dense_data_set.h"
|
|
5
|
+
#include "data_set/sparse/sparse_data_set.h"
|
|
6
|
+
#include "classifier/naive_bayes/naive_bayes_classifier.h"
|
|
7
|
+
#include <fstream>
|
|
8
|
+
using namespace std;
|
|
9
|
+
|
|
10
|
+
namespace Storage {
|
|
11
|
+
// Binary file storage: serialises data sets and trained models to a compact
// native-endian binary format. Sections are delimited by 4-byte marks so
// readers can validate the file. Files are not portable across machines
// with different endianness or struct layout.
class Binary : public Storage {
  string path;
  fstream file;

  // helpers: open `path` and write/validate the leading file mark
  void open_for_reading();
  void open_for_writing();

  // low level IO: each write_* has a matching read_* with the same layout
  void write_string(string str);
  string read_string();
  void write_int(int number);
  int read_int();
  void write_mark(uint32_t mark);
  uint32_t read_mark();
  void write_bool(bool value);
  bool read_bool();

  // these templated functions are used outside this class,
  // so their definition needs to be in this header file for
  // each version of the function to be generated

  // Reads a length-prefixed vector of raw T values. Returns a heap-allocated
  // vector: the CALLER OWNS the result and must delete it.
  template<class T> vector<T> *read_vector() {
    vector<T> *values = new vector<T>();
    int size = read_int();
    values->reserve(size);
    T value;

    for(int i = 0; i < size; i++) {
      file.read((char *)(&value), sizeof(T));
      values->push_back(value);
    }

    return values;
  }

  // Writes a vector as a length prefix followed by raw T values.
  template<class T> void write_vector(vector<T> *values) {
    uint32_t size = values->size();
    write_int(size);

    for(int i = 0; i < size; i++)
      file.write((char *)(&values->at(i)), sizeof(T));
  }

  // serialisation: one read/write pair per model component
  DataSet::DataSet *read_data_set();
  void write_data_set(DataSet::DataSet *data_set);
  Classifier::Classifier *read_classifier(DataSet::DataSet *data_set);
  void write_classifier(Classifier::Classifier *classifier);
  Preprocessing::Text::TextPipeline *read_text_pipeline();
  void write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline);

public:
  // when true, write() / write_model() also serialise every example
  bool should_write_examples;
  Binary(string path) : path(path), should_write_examples(false) {}
  bool get_write_examples() { return should_write_examples; }
  void set_write_examples(bool write) { should_write_examples = write;}

  DataSet::DataSet *read();
  Model::Model *read_model();
  void write(DataSet::DataSet *data_set);
  void write_model(Model::Model *model);

  // friends use the private low-level IO and templated vector helpers
  friend class Preprocessing::Text::TextPipeline;
  friend class Classifier::Classifier;
  friend class Classifier::NaiveBayesClassifier;
};
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
#endif
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#include "folders.h"
|
|
2
|
+
#include <fstream>
|
|
3
|
+
#include <iostream>
|
|
4
|
+
#include <stdlib.h>
|
|
5
|
+
#include <dirent.h>
|
|
6
|
+
#include <sys/stat.h>
|
|
7
|
+
using namespace std;
|
|
8
|
+
|
|
9
|
+
static char *file_data = NULL;
|
|
10
|
+
static int file_data_size = 0;
|
|
11
|
+
static int file_count = 0;
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_set, int category_index) {
|
|
15
|
+
DataSet::SparseExample *example;
|
|
16
|
+
DIR *dir = opendir(path.c_str());
|
|
17
|
+
struct dirent *dp;
|
|
18
|
+
char *name;
|
|
19
|
+
string newpath;
|
|
20
|
+
struct stat info;
|
|
21
|
+
FILE *file;
|
|
22
|
+
int file_length;
|
|
23
|
+
|
|
24
|
+
while((dp = readdir(dir))) {
|
|
25
|
+
// ignore files starting with a dot
|
|
26
|
+
name = dp->d_name;
|
|
27
|
+
if(*name == '.')
|
|
28
|
+
continue;
|
|
29
|
+
|
|
30
|
+
// ensure this is a file, not a folder
|
|
31
|
+
newpath = path + "/" + name;
|
|
32
|
+
stat(newpath.c_str(), &info);
|
|
33
|
+
if(info.st_mode & S_IFDIR)
|
|
34
|
+
continue;
|
|
35
|
+
|
|
36
|
+
// determine if the file_data buffer is large enough to hold this file
|
|
37
|
+
file = fopen(newpath.c_str(), "rb");
|
|
38
|
+
fseek(file, 0, SEEK_END);
|
|
39
|
+
file_length = ftell(file) + 1;
|
|
40
|
+
rewind(file);
|
|
41
|
+
|
|
42
|
+
if(file_data_size < file_length) {
|
|
43
|
+
if(file_data != NULL)
|
|
44
|
+
free(file_data);
|
|
45
|
+
file_data = (char *)malloc(file_length);
|
|
46
|
+
file_data_size = file_length;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// read into the buffer
|
|
50
|
+
fread(file_data, 1, file_length - 1, file);
|
|
51
|
+
file_data[file_length - 1] = 0;
|
|
52
|
+
fclose(file);
|
|
53
|
+
|
|
54
|
+
// insert a new example into the dataset
|
|
55
|
+
example = pipeline->process_text(data_set, file_data);
|
|
56
|
+
example->set_category_index(data_set, category_index);
|
|
57
|
+
|
|
58
|
+
file_count++;
|
|
59
|
+
if((file_count % 10000) == 0)
|
|
60
|
+
cout << "Read " << file_count << endl;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
closedir(dir);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// Builds a sparse data set from a directory tree: each immediate
// subdirectory of `path` is a category, and every file inside it becomes
// one example. Returns a newly-allocated (possibly empty) data set owned by
// the caller.
DataSet::DataSet *Storage::Folders::read() {
    DataSet::SparseDataSet *data_set = new DataSet::SparseDataSet();
    DIR *dir = opendir(path.c_str());
    struct dirent *dp;
    char *name;
    string newpath;
    struct stat info;

    // create an initial feature "Category" used as the class label
    DataSet::NominalFeature *categories = data_set->new_nominal_feature("Category");
    data_set->set_category_index(0);
    int category_index = 0;

    // the original passed a NULL dir straight to readdir (undefined
    // behaviour) when the directory could not be opened
    if(dir == NULL)
        return data_set;

    while((dp = readdir(dir))) {
        // ignore files starting with a dot
        name = dp->d_name;
        if(*name == '.')
            continue;

        // ensure this is a folder; its name becomes the category label
        newpath = path + "/" + name;
        if(stat(newpath.c_str(), &info) != 0)
            continue;
        if(info.st_mode & S_IFDIR) {
            category_index = categories->value_index(string(name));
            load_directory(newpath, data_set, category_index);
        }
    }

    // the original leaked the directory handle here
    closedir(dir);
    return data_set;
}
|
|
96
|
+
|
|
97
|
+
// Writing a data set back out as a folder tree is not supported; this
// override exists only to satisfy the Storage interface and is a no-op.
void Storage::Folders::write(DataSet::DataSet *data_set) {
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#ifndef __folders_h__
|
|
2
|
+
#define __folders_h__
|
|
3
|
+
#include "preprocessing/text/text_pipeline.h"
|
|
4
|
+
#include "data_set/data_set.h"
|
|
5
|
+
#include "storage/storage.h"
|
|
6
|
+
#include <algorithm>
|
|
7
|
+
#include <cctype>
|
|
8
|
+
#include <string>
|
|
9
|
+
using namespace std;
|
|
10
|
+
|
|
11
|
+
namespace Storage {
|
|
12
|
+
// Folder-tree storage: reads a corpus laid out as one subdirectory per
// category, with one text file per example. Files are converted to sparse
// examples via the supplied text pipeline. Writing is not supported.
class Folders : public Storage {
  // reads every file in `path` into `data_set` under `category_index`
  void load_directory(string path, DataSet::SparseDataSet *data_set, int category_index);

public:
  string path;                                 // root directory of the corpus
  Preprocessing::Text::TextPipeline *pipeline; // converts raw text to examples (not owned)

  Folders(string path, Preprocessing::Text::TextPipeline *pipeline) : path(path), pipeline(pipeline) {}
  DataSet::DataSet *read();
  void write(DataSet::DataSet *data_set);
};
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
#endif
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#ifndef __storage_h__
|
|
2
|
+
#define __storage_h__
|
|
3
|
+
#include "data_set/data_set.h"
|
|
4
|
+
#include "model/model.h"
|
|
5
|
+
|
|
6
|
+
namespace Storage {
|
|
7
|
+
class Storage {
|
|
8
|
+
public:
|
|
9
|
+
// all storage implementations must be able to read and write data sets
|
|
10
|
+
virtual DataSet::DataSet *read() = 0;
|
|
11
|
+
virtual void write(DataSet::DataSet *data_set) = 0;
|
|
12
|
+
|
|
13
|
+
// some implementations can read and write trained models
|
|
14
|
+
virtual Model::Model *read_model() { return NULL; }
|
|
15
|
+
virtual void write_model(Model::Model *model) {}
|
|
16
|
+
};
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
#endif
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module Quarry
  module Classifier
    # Thin Ruby wrapper around a native classifier object (stored in
    # @classifier). Subclasses are expected to assign @classifier; this base
    # class only holds the data set and delegates calls to the native object.
    class Classifier
      # the underlying native classifier instance
      attr_reader :classifier

      def initialize(data_set)
        @data_set = data_set
      end

      # Delegates training/preparation to the native classifier.
      def prepare
        @classifier.prepare
      end

      # Classifies a wrapped example and maps the native category index back
      # to the data set's category label.
      def classify(example)
        @data_set.categories[@classifier.classify_to_index(example.example)]
      end

      # Returns the native classifier's ranking for a wrapped example.
      def rank(example)
        @classifier.rank(example.example)
      end
    end
  end
end
|