thera 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,6 +45,24 @@ Object model_rank_text_names(Object self, Object text) {
   return names;
 }
 
+Object quarry_rank_text_names_from_binary_model(Object path, Object text) {
+  string model_path = from_ruby<string>(path);
+  string example_text = from_ruby<string>(text);
+  Array names;
+
+  Storage::Binary reader(model_path);
+  Model::Model *model = reader.read_model();
+
+  vector<Classifier::Score> *ranks = model->rank_text(example_text);
+  DataSet::NominalFeature *categories = model->data_set->category_feature();
+  for(unsigned int i = 0; i < ranks->size(); i++)
+    names.push(categories->names[ranks->at(i).category]);
+
+  delete ranks;
+  delete model;
+  return names;
+}
+
 
 extern "C" {
 
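Note: the new helper reads the model from disk on every call and frees everything it allocates before returning. Since this release is mostly about plugging leaks, a purely illustrative variant is sketched below (not part of the gem; rank_text_names_raii is a hypothetical name) showing how the two deletes could be made exception-safe with std::auto_ptr, the smart pointer available to this C++03/tr1 codebase. It assumes the same Quarry API and Rice conversions used above.

// Illustrative sketch only; not in the release.
#include <memory>  // std::auto_ptr (C++03)

Object rank_text_names_raii(Object path, Object text) {
  string model_path = from_ruby<string>(path);
  string example_text = from_ruby<string>(text);
  Array names;

  Storage::Binary reader(model_path);
  auto_ptr<Model::Model> model(reader.read_model());

  // rank_text returns a heap-allocated vector the caller must free;
  // auto_ptr releases it even if an exception unwinds this frame
  auto_ptr<vector<Classifier::Score> > ranks(model->rank_text(example_text));
  DataSet::NominalFeature *categories = model->data_set->category_feature();
  for(unsigned int i = 0; i < ranks->size(); i++)
    names.push(categories->names[ranks->at(i).category]);
  return names;
}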
@@ -55,6 +73,8 @@ extern "C" {
   Module rb_mPreprocessing = define_module_under(rb_mQuarry, "Preprocessing");
   Module rb_mText = define_module_under(rb_mPreprocessing, "Text");
 
+  // quarry helper
+  rb_mQuarry.define_module_function("rank_text_names_from_binary_model", &quarry_rank_text_names_from_binary_model);
 
   // text pipeline
   rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
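Note: registered on rb_mQuarry rather than one of its submodules, the binding is presumably invoked from Ruby as Quarry.rank_text_names_from_binary_model(model_path, text) (assuming rb_mQuarry maps to a top-level Quarry module), returning the category names ordered from best to worst score.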
@@ -46,6 +46,8 @@ namespace Classifier {
       }
     }
 
+    virtual ~Classifier() {}
+
     virtual void prepare() {};
     virtual Classifier *clone(DataSet::DataSet *new_data_set) = 0;
     virtual double score(int category, DataSet::Example *example) = 0;
@@ -74,8 +74,13 @@ void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
 void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
   int category_count = file->read_int();
   feature_caches.resize(category_count + 1);
-  category_probabilities = *(file->read_vector<double>());
+  vector<double> *probabilities = file->read_vector<double>();
+  category_probabilities = *probabilities;
+  delete probabilities;
 
-  for(int i = 1; i <= category_count; i++)
-    feature_caches[i] = *(file->read_vector<NumericFeatureCache>());
+  for(int i = 1; i <= category_count; i++) {
+    vector<NumericFeatureCache> *caches = file->read_vector<NumericFeatureCache>();
+    feature_caches[i] = *caches;
+    delete caches;
+  }
 }
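Note: read_vector<T> returns a heap-allocated vector, so the old one-liner copied the contents and leaked the temporary; the fix names the pointer, copies, then deletes. A hypothetical helper template (not part of the gem; name and placement are assumptions) could centralize the pattern:

// Hypothetical sketch: wraps Storage::Binary::read_vector<T>, copying the
// result into `out` and freeing the temporary in one place.
template <typename T>
void read_vector_into(Storage::Binary *file, vector<T> &out) {
  vector<T> *tmp = file->read_vector<T>();  // heap-allocated by read_vector
  out = *tmp;                               // copy-assign the contents
  delete tmp;                               // release the temporary
}

With it, each copy-then-delete block in this release would collapse to a single call such as read_vector_into(file, category_probabilities).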
@@ -33,6 +33,7 @@ namespace Classifier {
     static const uint32_t file_mark = 'naiv';
     NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
     NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);
+    virtual ~NaiveBayesClassifier() {}
 
     double score(int category, DataSet::Example *example);
     void prepare();
@@ -26,6 +26,14 @@ namespace DataSet {
         features.push_back(feature);
       }
     }
+
+    virtual ~DataSet() {
+      for(unsigned int i = 0; i < features.size(); i++)
+        delete features[i];
+      for(unsigned int i = 0; i < examples.size(); i++)
+        delete examples[i];
+    }
+
     virtual DataSet *clone_without_examples() { return NULL; }
 
     tr1::unordered_map<string, Feature *> feature_names;
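Note: the virtual destructor is what makes the rest of this release's cleanup safe, because datasets are deleted through base-class pointers (Model::~Model below does delete data_set; on a DataSet::DataSet*). A minimal sketch of the rule involved, using only types from this diff:

// Sketch: deleting a derived object through a base pointer is undefined
// behavior unless the base destructor is virtual. With `virtual ~DataSet()`
// above, this dispatches to ~SparseDataSet()/~DenseDataSet() first.
void destroy(DataSet::DataSet *data_set) {
  delete data_set;
}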
@@ -38,7 +46,7 @@ namespace DataSet {
 
     NumericFeature *new_numeric_feature(string name);
     NominalFeature *new_nominal_feature(string name);
-    virtual Example *new_example() { return NULL; }
+    virtual Example *new_example(bool add_to_data_set = true) { return NULL; }
 
     void count();
     void index();
@@ -28,9 +28,10 @@ namespace DataSet {
       return new DenseDataSet(this);
     }
 
-    DenseExample *new_example() {
+    DenseExample *new_example(bool add_to_data_set = true) {
       DenseExample *example = new DenseExample(features.size());
-      examples.push_back(example);
+      if(add_to_data_set)
+        examples.push_back(example);
       return example;
     }
   };
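Note: the flag also changes ownership. With add_to_data_set false the example never enters `examples`, so ~DataSet will not free it and the caller must. A hedged usage sketch:

// Sketch: a scratch example for scoring that the data set must not own.
DataSet::DenseExample *example = data_set->new_example(false);
// ... set feature values, classify ...
delete example;  // caller-owned because add_to_data_set was false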
@@ -17,6 +17,7 @@ namespace DataSet {
     void set_index(int new_index) { index = new_index; }
 
     Feature(string name, int index) : name(name), index(index) {}
+    virtual ~Feature() {}
     virtual Feature *clone() { return NULL; }
     virtual void reset() {}
     virtual void print() {}
@@ -14,6 +14,8 @@ namespace DataSet {
     NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
     NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
 
+    ~NominalFeature() {}
+
     NominalFeature *clone() {
       return new NominalFeature(this);
     }
@@ -15,6 +15,11 @@ namespace DataSet {
       reset();
     }
 
+    ~NumericFeature() {
+      if(category_counts != NULL)
+        free(category_counts);
+    }
+
     NumericFeature *clone() {
      return new NumericFeature(name, index);
     }
@@ -29,9 +29,10 @@ namespace DataSet {
      return new SparseDataSet(this);
    }
 
-    SparseExample *new_example(int buffer_size = 0) {
+    SparseExample *new_example(int buffer_size = 0, bool add_to_data_set = true) {
      SparseExample *example = new SparseExample(buffer_size);
-      examples.push_back(example);
+      if(add_to_data_set)
+        examples.push_back(example);
      return example;
    }
  };
@@ -3,6 +3,7 @@
 #include "data_set/example.h"
 #include <stdlib.h>
 #include <string>
+#include <iostream>
 using namespace std;
 
 namespace DataSet {
@@ -25,6 +26,11 @@ namespace DataSet {
      values = NULL;
    }
 
+    ~SparseExample() {
+      if(values != NULL)
+        free(values);
+    }
+
    double get_value(int feature_index);
    double get_value(string feature_name, SparseDataSet *data_set);
    void set_value(int feature_index, double new_value);
@@ -4,6 +4,7 @@
 #include "data_set/example.h"
 #include "classifier/classifier.h"
 #include "preprocessing/text/text_pipeline.h"
+#include <iostream>
 
 namespace Model {
   class Model {
@@ -13,6 +14,14 @@ namespace Model {
     Preprocessing::Text::TextPipeline *text_pipeline;
 
     Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
+    ~Model() {
+      if(data_set)
+        delete data_set;
+      if(classifier)
+        delete classifier;
+      if(text_pipeline)
+        delete text_pipeline;
+    }
 
     void train(DataSet::Example *example);
     void train_text(string text);
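Note: delete on a null pointer is already a no-op in C++, so the NULL guards are stylistic; the substantive change is that Model now owns and tears down its data set, classifier, and text pipeline, which is what lets the new rank_text_names_from_binary_model helper simply delete the model when it is done.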
@@ -9,6 +9,7 @@ namespace Preprocessing {
   class ExampleGenerator {
     public:
     ExampleGenerator() {}
+    virtual ~ExampleGenerator() {}
    virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
    virtual uint32_t mark() = 0;
  };
@@ -37,7 +37,7 @@ namespace Preprocessing {
   }
 
   // construct the example
-  DataSet::SparseExample *example = data_set->new_example(token_counts.size());
+  DataSet::SparseExample *example = data_set->new_example(token_counts.size(), false);
   DataSet::Feature *feature = NULL;
 
   for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
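Note: this `false` is the behavioral pivot of the release. Examples generated during classification used to be pushed into the data set and never freed (or, once ~DataSet started deleting examples, risked being freed twice). Generators now hand the example back unowned, and code paths that want the data set to keep it must add it explicitly, as the Storage::Folders hunk below does.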
@@ -7,6 +7,7 @@ namespace Preprocessing {
   class InplaceProcessor {
     public:
     InplaceProcessor() {}
+    virtual ~InplaceProcessor() {}
     virtual char *process(char *start, char *end) { return end; }
     virtual uint32_t mark() = 0;
   };
@@ -28,6 +28,20 @@ namespace Preprocessing {
     TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
     DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
     void process_token(char *start, char *end);
+
+    ~TextPipeline() {
+      // tokens stores ptrs to offsets of a string which is handled externally,
+      // so doesn't need to be released here
+      if(tokeniser)
+        delete tokeniser;
+      if(generator)
+        delete generator;
+
+      for(unsigned int i = 0; i < processors.size(); i++)
+        delete processors[i];
+      for(unsigned int i = 0; i < selectors.size(); i++)
+        delete selectors[i];
+    }
   };
 
   TextPipeline *StandardPipeline();
@@ -4,7 +4,6 @@
 using namespace std;
 using namespace tr1;
 
-static unordered_set<string> *stop_words = NULL;
 static int stop_word_count = 586;
 static string stop_word_list[] = {
   "a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
@@ -69,14 +68,11 @@ static string stop_word_list[] = {
 };
 
 Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
-  if(stop_words == NULL) {
-    stop_words = new unordered_set<string>();
-    for(int i = 0; i < stop_word_count; i++)
-      stop_words->insert(stop_word_list[i]);
-  }
+  for(int i = 0; i < stop_word_count; i++)
+    stop_words.insert(stop_word_list[i]);
 }
 
 bool Preprocessing::Text::StopWords::select(char *start, char *end) {
   string token = string(start, (end - start) + 1);
-  return stop_words->count(token) == 0;
+  return stop_words.count(token) == 0;
 }
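Note: this trades the old file-static set (shared across instances but leaked, never freed) for a per-instance member that is destroyed along with the object; the cost is that each StopWords instance now builds its own 586-entry set.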
@@ -1,6 +1,10 @@
 #ifndef __stop_words_h__
 #define __stop_words_h__
 #include "token_selector.h"
+#include <tr1/unordered_set>
+using namespace std;
+using namespace tr1;
+
 
 namespace Preprocessing {
   namespace Text {
@@ -9,8 +13,10 @@ namespace Preprocessing {
     public:
     static const uint32_t file_mark = 'stop';
     uint32_t mark() { return file_mark; }
+    unordered_set<string> stop_words;
 
     StopWords();
+    ~StopWords() {}
     bool select(char *start, char *end);
   };
 
@@ -7,6 +7,7 @@ namespace Preprocessing {
 
   class TokenSelector {
     public:
+    virtual ~TokenSelector() {}
    virtual bool select(char *start, char *end) { return true; }
    virtual uint32_t mark() = 0;
  };
@@ -9,6 +9,7 @@ namespace Preprocessing {
   public:
   TextPipeline *pipeline;
   Tokeniser(TextPipeline *pipeline) : pipeline(pipeline) {}
+  virtual ~Tokeniser() {}
   virtual void tokenise(char *text) {}
   virtual uint32_t mark() = 0;
 };
@@ -104,16 +104,26 @@ DataSet::DataSet *Storage::Binary::read_data_set() {
 
   // read cached frequencies and probabilities if present
   if(data_set->counted) {
-    nominal_feature->frequencies = *read_vector<int>();
-    nominal_feature->probabilities = *read_vector<double>();
+    vector<int> *frequencies = read_vector<int>();
+    vector<double> *probabilities = read_vector<double>();
+    nominal_feature->frequencies = *frequencies;
+    nominal_feature->probabilities = *probabilities;
     nominal_feature->category_frequencies.resize(num_categories + 1);
     nominal_feature->category_probabilities.resize(num_categories + 1);
+    delete frequencies;
+    delete probabilities;
 
-    for(int i = 1; i <= num_categories; i++)
-      nominal_feature->category_frequencies[i] = *read_vector<int>();
+    for(int i = 1; i <= num_categories; i++) {
+      frequencies = read_vector<int>();
+      nominal_feature->category_frequencies[i] = *frequencies;
+      delete frequencies;
+    }
 
-    for(int i = 1; i <= num_categories; i++)
-      nominal_feature->category_probabilities[i] = *read_vector<double>();
+    for(int i = 1; i <= num_categories; i++) {
+      probabilities = read_vector<double>();
+      nominal_feature->category_probabilities[i] = *probabilities;
+      delete probabilities;
+    }
   }
 
   // TODO: read cached indexes
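Note: the same copy-then-delete pattern as in NaiveBayesClassifier::read_binary above; the hypothetical read_vector_into helper sketched there would reduce each of these blocks to a single line.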
@@ -55,6 +55,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
   // insert a new example into the dataset
   example = pipeline->process_text(data_set, file_data, true);
   example->set_category_index(data_set, category_index);
+  data_set->examples.push_back(example);
 
   file_count++;
   if((file_count % 10000) == 0)
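Note: this is the counterpart to the generator change above. process_text now returns an unowned example, so the training loader explicitly hands it to the data set, preserving the old training behavior while leaving classification-time examples caller-owned.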
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
   s.description = "C++ Data Mining Library for Ruby"
   s.email = "me@willcannings.com"
   s.authors = ["Will Cannings"]
-  s.version = '0.0.7'
+  s.version = '0.0.8'
   s.extensions = ["ext/extconf.rb"]
 
   s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: thera
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-12-22 00:00:00.000000000Z
+date: 2012-01-21 00:00:00.000000000Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice
-  requirement: &70195705116960 !ruby/object:Gem::Requirement
+  requirement: &70151526602040 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70195705116960
+  version_requirements: *70151526602040
 description: C++ Data Mining Library for Ruby
 email: me@willcannings.com
 executables: []