thera 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/quarry/quarry_toolkit.cpp +20 -0
- data/lib/quarry/src/classifier/classifier.h +2 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +8 -3
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +1 -0
- data/lib/quarry/src/data_set/data_set.h +9 -1
- data/lib/quarry/src/data_set/dense/dense_data_set.h +3 -2
- data/lib/quarry/src/data_set/feature.h +1 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +2 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +5 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +3 -2
- data/lib/quarry/src/data_set/sparse/sparse_example.h +6 -0
- data/lib/quarry/src/model/model.h +9 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +1 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +1 -1
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +1 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +14 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +3 -7
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +6 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +1 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +1 -0
- data/lib/quarry/src/storage/binary.cpp +16 -6
- data/lib/quarry/src/storage/folders.cpp +1 -0
- data/thera.gemspec +1 -1
- metadata +4 -4
@@ -45,6 +45,24 @@ Object model_rank_text_names(Object self, Object text) {
|
|
45
45
|
return names;
|
46
46
|
}
|
47
47
|
|
48
|
+
Object quarry_rank_text_names_from_binary_model(Object path, Object text) {
|
49
|
+
string model_path = from_ruby<string>(path);
|
50
|
+
string example_text = from_ruby<string>(text);
|
51
|
+
Array names;
|
52
|
+
|
53
|
+
Storage::Binary reader(model_path);
|
54
|
+
Model::Model *model = reader.read_model();
|
55
|
+
|
56
|
+
vector<Classifier::Score> *ranks = model->rank_text(example_text);
|
57
|
+
DataSet::NominalFeature *categories = model->data_set->category_feature();
|
58
|
+
for(unsigned int i = 0; i < ranks->size(); i++)
|
59
|
+
names.push(categories->names[ranks->at(i).category]);
|
60
|
+
|
61
|
+
delete ranks;
|
62
|
+
delete model;
|
63
|
+
return names;
|
64
|
+
}
|
65
|
+
|
48
66
|
|
49
67
|
extern "C" {
|
50
68
|
|
@@ -55,6 +73,8 @@ extern "C" {
|
|
55
73
|
Module rb_mPreprocessing = define_module_under(rb_mQuarry, "Preprocessing");
|
56
74
|
Module rb_mText = define_module_under(rb_mPreprocessing, "Text");
|
57
75
|
|
76
|
+
// quarry helper
|
77
|
+
rb_mQuarry.define_module_function("rank_text_names_from_binary_model", &quarry_rank_text_names_from_binary_model);
|
58
78
|
|
59
79
|
// text pipeline
|
60
80
|
rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
|
@@ -74,8 +74,13 @@ void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
|
|
74
74
|
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
|
75
75
|
int category_count = file->read_int();
|
76
76
|
feature_caches.resize(category_count + 1);
|
77
|
-
|
77
|
+
vector<double> *probabilities = file->read_vector<double>();
|
78
|
+
category_probabilities = *probabilities;
|
79
|
+
delete probabilities;
|
78
80
|
|
79
|
-
for(int i = 1; i <= category_count; i++)
|
80
|
-
|
81
|
+
for(int i = 1; i <= category_count; i++) {
|
82
|
+
vector<NumericFeatureCache> *caches = file->read_vector<NumericFeatureCache>();
|
83
|
+
feature_caches[i] = *caches;
|
84
|
+
delete caches;
|
85
|
+
}
|
81
86
|
}
|
@@ -33,6 +33,7 @@ namespace Classifier {
|
|
33
33
|
static const uint32_t file_mark = 'naiv';
|
34
34
|
NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
|
35
35
|
NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);
|
36
|
+
virtual ~NaiveBayesClassifier() {}
|
36
37
|
|
37
38
|
double score(int category, DataSet::Example *example);
|
38
39
|
void prepare();
|
@@ -26,6 +26,14 @@ namespace DataSet {
|
|
26
26
|
features.push_back(feature);
|
27
27
|
}
|
28
28
|
}
|
29
|
+
|
30
|
+
virtual ~DataSet() {
|
31
|
+
for(unsigned int i = 0; i < features.size(); i++)
|
32
|
+
delete features[i];
|
33
|
+
for(unsigned int i = 0; i < examples.size(); i++)
|
34
|
+
delete examples[i];
|
35
|
+
}
|
36
|
+
|
29
37
|
virtual DataSet *clone_without_examples() { return NULL; }
|
30
38
|
|
31
39
|
tr1::unordered_map<string, Feature *> feature_names;
|
@@ -38,7 +46,7 @@ namespace DataSet {
|
|
38
46
|
|
39
47
|
NumericFeature *new_numeric_feature(string name);
|
40
48
|
NominalFeature *new_nominal_feature(string name);
|
41
|
-
virtual Example *new_example() { return NULL; }
|
49
|
+
virtual Example *new_example(bool add_to_data_set = true) { return NULL; }
|
42
50
|
|
43
51
|
void count();
|
44
52
|
void index();
|
@@ -28,9 +28,10 @@ namespace DataSet {
|
|
28
28
|
return new DenseDataSet(this);
|
29
29
|
}
|
30
30
|
|
31
|
-
DenseExample *new_example() {
|
31
|
+
DenseExample *new_example(bool add_to_data_set = true) {
|
32
32
|
DenseExample *example = new DenseExample(features.size());
|
33
|
-
|
33
|
+
if(add_to_data_set)
|
34
|
+
examples.push_back(example);
|
34
35
|
return example;
|
35
36
|
}
|
36
37
|
};
|
@@ -17,6 +17,7 @@ namespace DataSet {
|
|
17
17
|
void set_index(int new_index) { index = new_index; }
|
18
18
|
|
19
19
|
Feature(string name, int index) : name(name), index(index) {}
|
20
|
+
virtual ~Feature() {}
|
20
21
|
virtual Feature *clone() { return NULL; }
|
21
22
|
virtual void reset() {}
|
22
23
|
virtual void print() {}
|
@@ -14,6 +14,8 @@ namespace DataSet {
|
|
14
14
|
NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
|
15
15
|
NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
|
16
16
|
|
17
|
+
~NominalFeature() {}
|
18
|
+
|
17
19
|
NominalFeature *clone() {
|
18
20
|
return new NominalFeature(this);
|
19
21
|
}
|
@@ -29,9 +29,10 @@ namespace DataSet {
|
|
29
29
|
return new SparseDataSet(this);
|
30
30
|
}
|
31
31
|
|
32
|
-
SparseExample *new_example(int buffer_size = 0) {
|
32
|
+
SparseExample *new_example(int buffer_size = 0, bool add_to_data_set = true) {
|
33
33
|
SparseExample *example = new SparseExample(buffer_size);
|
34
|
-
|
34
|
+
if(add_to_data_set)
|
35
|
+
examples.push_back(example);
|
35
36
|
return example;
|
36
37
|
}
|
37
38
|
};
|
@@ -3,6 +3,7 @@
|
|
3
3
|
#include "data_set/example.h"
|
4
4
|
#include <stdlib.h>
|
5
5
|
#include <string>
|
6
|
+
#include <iostream>
|
6
7
|
using namespace std;
|
7
8
|
|
8
9
|
namespace DataSet {
|
@@ -25,6 +26,11 @@ namespace DataSet {
|
|
25
26
|
values = NULL;
|
26
27
|
}
|
27
28
|
|
29
|
+
~SparseExample() {
|
30
|
+
if(values != NULL)
|
31
|
+
free(values);
|
32
|
+
}
|
33
|
+
|
28
34
|
double get_value(int feature_index);
|
29
35
|
double get_value(string feature_name, SparseDataSet *data_set);
|
30
36
|
void set_value(int feature_index, double new_value);
|
@@ -4,6 +4,7 @@
|
|
4
4
|
#include "data_set/example.h"
|
5
5
|
#include "classifier/classifier.h"
|
6
6
|
#include "preprocessing/text/text_pipeline.h"
|
7
|
+
#include <iostream>
|
7
8
|
|
8
9
|
namespace Model {
|
9
10
|
class Model {
|
@@ -13,6 +14,14 @@ namespace Model {
|
|
13
14
|
Preprocessing::Text::TextPipeline *text_pipeline;
|
14
15
|
|
15
16
|
Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
|
17
|
+
~Model() {
|
18
|
+
if(data_set)
|
19
|
+
delete data_set;
|
20
|
+
if(classifier)
|
21
|
+
delete classifier;
|
22
|
+
if(text_pipeline)
|
23
|
+
delete text_pipeline;
|
24
|
+
}
|
16
25
|
|
17
26
|
void train(DataSet::Example *example);
|
18
27
|
void train_text(string text);
|
@@ -9,6 +9,7 @@ namespace Preprocessing {
|
|
9
9
|
class ExampleGenerator {
|
10
10
|
public:
|
11
11
|
ExampleGenerator() {}
|
12
|
+
virtual ~ExampleGenerator() {}
|
12
13
|
virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
|
13
14
|
virtual uint32_t mark() = 0;
|
14
15
|
};
|
@@ -37,7 +37,7 @@ namespace Preprocessing {
|
|
37
37
|
}
|
38
38
|
|
39
39
|
// construct the example
|
40
|
-
DataSet::SparseExample *example = data_set->new_example(token_counts.size());
|
40
|
+
DataSet::SparseExample *example = data_set->new_example(token_counts.size(), false);
|
41
41
|
DataSet::Feature *feature = NULL;
|
42
42
|
|
43
43
|
for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
|
@@ -28,6 +28,20 @@ namespace Preprocessing {
|
|
28
28
|
TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
|
29
29
|
DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
|
30
30
|
void process_token(char *start, char *end);
|
31
|
+
|
32
|
+
~TextPipeline() {
|
33
|
+
// tokens stores ptrs to offsets of a string which is handled externally,
|
34
|
+
// so doesn't need to be released here
|
35
|
+
if(tokeniser)
|
36
|
+
delete tokeniser;
|
37
|
+
if(generator)
|
38
|
+
delete generator;
|
39
|
+
|
40
|
+
for(unsigned int i = 0; i < processors.size(); i++)
|
41
|
+
delete processors[i];
|
42
|
+
for(unsigned int i = 0; i < selectors.size(); i++)
|
43
|
+
delete selectors[i];
|
44
|
+
}
|
31
45
|
};
|
32
46
|
|
33
47
|
TextPipeline *StandardPipeline();
|
@@ -4,7 +4,6 @@
|
|
4
4
|
using namespace std;
|
5
5
|
using namespace tr1;
|
6
6
|
|
7
|
-
static unordered_set<string> *stop_words = NULL;
|
8
7
|
static int stop_word_count = 586;
|
9
8
|
static string stop_word_list[] = {
|
10
9
|
"a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
|
@@ -69,14 +68,11 @@ static string stop_word_list[] = {
|
|
69
68
|
};
|
70
69
|
|
71
70
|
Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
|
72
|
-
|
73
|
-
stop_words = new unordered_set<string>();
|
74
|
-
for(int i = 0; i < stop_word_count; i++)
|
75
|
-
stop_words->insert(stop_word_list[i]);
|
76
|
-
}
|
71
|
+
for(int i = 0; i < stop_word_count; i++)
|
72
|
+
stop_words.insert(stop_word_list[i]);
|
77
73
|
}
|
78
74
|
|
79
75
|
bool Preprocessing::Text::StopWords::select(char *start, char *end) {
|
80
76
|
string token = string(start, (end - start) + 1);
|
81
|
-
return stop_words->count(token) == 0;
|
77
|
+
return stop_words.count(token) == 0;
|
82
78
|
}
|
@@ -1,6 +1,10 @@
|
|
1
1
|
#ifndef __stop_words_h__
|
2
2
|
#define __stop_words_h__
|
3
3
|
#include "token_selector.h"
|
4
|
+
#include <tr1/unordered_set>
|
5
|
+
using namespace std;
|
6
|
+
using namespace tr1;
|
7
|
+
|
4
8
|
|
5
9
|
namespace Preprocessing {
|
6
10
|
namespace Text {
|
@@ -9,8 +13,10 @@ namespace Preprocessing {
|
|
9
13
|
public:
|
10
14
|
static const uint32_t file_mark = 'stop';
|
11
15
|
uint32_t mark() { return file_mark; }
|
16
|
+
unordered_set<string> stop_words;
|
12
17
|
|
13
18
|
StopWords();
|
19
|
+
~StopWords() {}
|
14
20
|
bool select(char *start, char *end);
|
15
21
|
};
|
16
22
|
|
@@ -104,16 +104,26 @@ DataSet::DataSet *Storage::Binary::read_data_set() {
|
|
104
104
|
|
105
105
|
// read cached frequencies and probabilities if present
|
106
106
|
if(data_set->counted) {
|
107
|
-
|
108
|
-
|
107
|
+
vector<int> *frequencies = read_vector<int>();
|
108
|
+
vector<double> *probabilities = read_vector<double>();
|
109
|
+
nominal_feature->frequencies = *frequencies;
|
110
|
+
nominal_feature->probabilities = *probabilities;
|
109
111
|
nominal_feature->category_frequencies.resize(num_categories + 1);
|
110
112
|
nominal_feature->category_probabilities.resize(num_categories + 1);
|
113
|
+
delete frequencies;
|
114
|
+
delete probabilities;
|
111
115
|
|
112
|
-
for(int i = 1; i <= num_categories; i++)
|
113
|
-
|
116
|
+
for(int i = 1; i <= num_categories; i++) {
|
117
|
+
frequencies = read_vector<int>();
|
118
|
+
nominal_feature->category_frequencies[i] = *frequencies;
|
119
|
+
delete frequencies;
|
120
|
+
}
|
114
121
|
|
115
|
-
for(int i = 1; i <= num_categories; i++)
|
116
|
-
|
122
|
+
for(int i = 1; i <= num_categories; i++) {
|
123
|
+
probabilities = read_vector<double>();
|
124
|
+
nominal_feature->category_probabilities[i] = *probabilities;
|
125
|
+
delete probabilities;
|
126
|
+
}
|
117
127
|
}
|
118
128
|
|
119
129
|
// TODO: read cached indexes
|
@@ -55,6 +55,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
|
|
55
55
|
// insert a new example into the dataset
|
56
56
|
example = pipeline->process_text(data_set, file_data, true);
|
57
57
|
example->set_category_index(data_set, category_index);
|
58
|
+
data_set->examples.push_back(example);
|
58
59
|
|
59
60
|
file_count++;
|
60
61
|
if((file_count % 10000) == 0)
|
data/thera.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.description = "C++ Data Mining Library for Ruby"
|
10
10
|
s.email = "me@willcannings.com"
|
11
11
|
s.authors = ["Will Cannings"]
|
12
|
-
s.version = '0.0.7'
|
12
|
+
s.version = '0.0.8'
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-01-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rice
|
16
|
-
requirement: &
|
16
|
+
requirement: &70151526602040 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70151526602040
|
25
25
|
description: C++ Data Mining Library for Ruby
|
26
26
|
email: me@willcannings.com
|
27
27
|
executables: []
|