thera 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -45,6 +45,24 @@ Object model_rank_text_names(Object self, Object text) {
45
45
  return names;
46
46
  }
47
47
 
48
+ Object quarry_rank_text_names_from_binary_model(Object path, Object text) {
49
+ string model_path = from_ruby<string>(path);
50
+ string example_text = from_ruby<string>(text);
51
+ Array names;
52
+
53
+ Storage::Binary reader(model_path);
54
+ Model::Model *model = reader.read_model();
55
+
56
+ vector<Classifier::Score> *ranks = model->rank_text(example_text);
57
+ DataSet::NominalFeature *categories = model->data_set->category_feature();
58
+ for(unsigned int i = 0; i < ranks->size(); i++)
59
+ names.push(categories->names[ranks->at(i).category]);
60
+
61
+ delete ranks;
62
+ delete model;
63
+ return names;
64
+ }
65
+
48
66
 
49
67
  extern "C" {
50
68
 
@@ -55,6 +73,8 @@ extern "C" {
55
73
  Module rb_mPreprocessing = define_module_under(rb_mQuarry, "Preprocessing");
56
74
  Module rb_mText = define_module_under(rb_mPreprocessing, "Text");
57
75
 
76
+ // quarry helper
77
+ rb_mQuarry.define_module_function("rank_text_names_from_binary_model", &quarry_rank_text_names_from_binary_model);
58
78
 
59
79
  // text pipeline
60
80
  rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
@@ -46,6 +46,8 @@ namespace Classifier {
46
46
  }
47
47
  }
48
48
 
49
+ virtual ~Classifier() {}
50
+
49
51
  virtual void prepare() {};
50
52
  virtual Classifier *clone(DataSet::DataSet *new_data_set) = 0;
51
53
  virtual double score(int category, DataSet::Example *example) = 0;
@@ -74,8 +74,13 @@ void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
74
74
  void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
75
75
  int category_count = file->read_int();
76
76
  feature_caches.resize(category_count + 1);
77
- category_probabilities = *(file->read_vector<double>());
77
+ vector<double> *probabilities = file->read_vector<double>();
78
+ category_probabilities = *probabilities;
79
+ delete probabilities;
78
80
 
79
- for(int i = 1; i <= category_count; i++)
80
- feature_caches[i] = *(file->read_vector<NumericFeatureCache>());
81
+ for(int i = 1; i <= category_count; i++) {
82
+ vector<NumericFeatureCache> *caches = file->read_vector<NumericFeatureCache>();
83
+ feature_caches[i] = *caches;
84
+ delete caches;
85
+ }
81
86
  }
@@ -33,6 +33,7 @@ namespace Classifier {
33
33
  static const uint32_t file_mark = 'naiv';
34
34
  NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
35
35
  NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);
36
+ virtual ~NaiveBayesClassifier() {}
36
37
 
37
38
  double score(int category, DataSet::Example *example);
38
39
  void prepare();
@@ -26,6 +26,14 @@ namespace DataSet {
26
26
  features.push_back(feature);
27
27
  }
28
28
  }
29
+
30
+ virtual ~DataSet() {
31
+ for(unsigned int i = 0; i < features.size(); i++)
32
+ delete features[i];
33
+ for(unsigned int i = 0; i < examples.size(); i++)
34
+ delete examples[i];
35
+ }
36
+
29
37
  virtual DataSet *clone_without_examples() { return NULL; }
30
38
 
31
39
  tr1::unordered_map<string, Feature *> feature_names;
@@ -38,7 +46,7 @@ namespace DataSet {
38
46
 
39
47
  NumericFeature *new_numeric_feature(string name);
40
48
  NominalFeature *new_nominal_feature(string name);
41
- virtual Example *new_example() { return NULL; }
49
+ virtual Example *new_example(bool add_to_data_set = true) { return NULL; }
42
50
 
43
51
  void count();
44
52
  void index();
@@ -28,9 +28,10 @@ namespace DataSet {
28
28
  return new DenseDataSet(this);
29
29
  }
30
30
 
31
- DenseExample *new_example() {
31
+ DenseExample *new_example(bool add_to_data_set = true) {
32
32
  DenseExample *example = new DenseExample(features.size());
33
- examples.push_back(example);
33
+ if(add_to_data_set)
34
+ examples.push_back(example);
34
35
  return example;
35
36
  }
36
37
  };
@@ -17,6 +17,7 @@ namespace DataSet {
17
17
  void set_index(int new_index) { index = new_index; }
18
18
 
19
19
  Feature(string name, int index) : name(name), index(index) {}
20
+ virtual ~Feature() {}
20
21
  virtual Feature *clone() { return NULL; }
21
22
  virtual void reset() {}
22
23
  virtual void print() {}
@@ -14,6 +14,8 @@ namespace DataSet {
14
14
  NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
15
15
  NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
16
16
 
17
+ ~NominalFeature() {}
18
+
17
19
  NominalFeature *clone() {
18
20
  return new NominalFeature(this);
19
21
  }
@@ -15,6 +15,11 @@ namespace DataSet {
15
15
  reset();
16
16
  }
17
17
 
18
+ ~NumericFeature() {
19
+ if(category_counts != NULL)
20
+ free(category_counts);
21
+ }
22
+
18
23
  NumericFeature *clone() {
19
24
  return new NumericFeature(name, index);
20
25
  }
@@ -29,9 +29,10 @@ namespace DataSet {
29
29
  return new SparseDataSet(this);
30
30
  }
31
31
 
32
- SparseExample *new_example(int buffer_size = 0) {
32
+ SparseExample *new_example(int buffer_size = 0, bool add_to_data_set = true) {
33
33
  SparseExample *example = new SparseExample(buffer_size);
34
- examples.push_back(example);
34
+ if(add_to_data_set)
35
+ examples.push_back(example);
35
36
  return example;
36
37
  }
37
38
  };
@@ -3,6 +3,7 @@
3
3
  #include "data_set/example.h"
4
4
  #include <stdlib.h>
5
5
  #include <string>
6
+ #include <iostream>
6
7
  using namespace std;
7
8
 
8
9
  namespace DataSet {
@@ -25,6 +26,11 @@ namespace DataSet {
25
26
  values = NULL;
26
27
  }
27
28
 
29
+ ~SparseExample() {
30
+ if(values != NULL)
31
+ free(values);
32
+ }
33
+
28
34
  double get_value(int feature_index);
29
35
  double get_value(string feature_name, SparseDataSet *data_set);
30
36
  void set_value(int feature_index, double new_value);
@@ -4,6 +4,7 @@
4
4
  #include "data_set/example.h"
5
5
  #include "classifier/classifier.h"
6
6
  #include "preprocessing/text/text_pipeline.h"
7
+ #include <iostream>
7
8
 
8
9
  namespace Model {
9
10
  class Model {
@@ -13,6 +14,14 @@ namespace Model {
13
14
  Preprocessing::Text::TextPipeline *text_pipeline;
14
15
 
15
16
  Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
17
+ ~Model() {
18
+ if(data_set)
19
+ delete data_set;
20
+ if(classifier)
21
+ delete classifier;
22
+ if(text_pipeline)
23
+ delete text_pipeline;
24
+ }
16
25
 
17
26
  void train(DataSet::Example *example);
18
27
  void train_text(string text);
@@ -9,6 +9,7 @@ namespace Preprocessing {
9
9
  class ExampleGenerator {
10
10
  public:
11
11
  ExampleGenerator() {}
12
+ virtual ~ExampleGenerator() {}
12
13
  virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
13
14
  virtual uint32_t mark() = 0;
14
15
  };
@@ -37,7 +37,7 @@ namespace Preprocessing {
37
37
  }
38
38
 
39
39
  // construct the example
40
- DataSet::SparseExample *example = data_set->new_example(token_counts.size());
40
+ DataSet::SparseExample *example = data_set->new_example(token_counts.size(), false);
41
41
  DataSet::Feature *feature = NULL;
42
42
 
43
43
  for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
@@ -7,6 +7,7 @@ namespace Preprocessing {
7
7
  class InplaceProcessor {
8
8
  public:
9
9
  InplaceProcessor() {}
10
+ virtual ~InplaceProcessor() {}
10
11
  virtual char *process(char *start, char *end) { return end; }
11
12
  virtual uint32_t mark() = 0;
12
13
  };
@@ -28,6 +28,20 @@ namespace Preprocessing {
28
28
  TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
29
29
  DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
30
30
  void process_token(char *start, char *end);
31
+
32
+ ~TextPipeline() {
33
+ // tokens stores ptrs to offsets of a string which is handled externally,
34
+ // so doesn't need to be released here
35
+ if(tokeniser)
36
+ delete tokeniser;
37
+ if(generator)
38
+ delete generator;
39
+
40
+ for(unsigned int i = 0; i < processors.size(); i++)
41
+ delete processors[i];
42
+ for(unsigned int i = 0; i < selectors.size(); i++)
43
+ delete selectors[i];
44
+ }
31
45
  };
32
46
 
33
47
  TextPipeline *StandardPipeline();
@@ -4,7 +4,6 @@
4
4
  using namespace std;
5
5
  using namespace tr1;
6
6
 
7
- static unordered_set<string> *stop_words = NULL;
8
7
  static int stop_word_count = 586;
9
8
  static string stop_word_list[] = {
10
9
  "a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
@@ -69,14 +68,11 @@ static string stop_word_list[] = {
69
68
  };
70
69
 
71
70
  Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
72
- if(stop_words == NULL) {
73
- stop_words = new unordered_set<string>();
74
- for(int i = 0; i < stop_word_count; i++)
75
- stop_words->insert(stop_word_list[i]);
76
- }
71
+ for(int i = 0; i < stop_word_count; i++)
72
+ stop_words.insert(stop_word_list[i]);
77
73
  }
78
74
 
79
75
  bool Preprocessing::Text::StopWords::select(char *start, char *end) {
80
76
  string token = string(start, (end - start) + 1);
81
- return stop_words->count(token) == 0;
77
+ return stop_words.count(token) == 0;
82
78
  }
@@ -1,6 +1,10 @@
1
1
  #ifndef __stop_words_h__
2
2
  #define __stop_words_h__
3
3
  #include "token_selector.h"
4
+ #include <tr1/unordered_set>
5
+ using namespace std;
6
+ using namespace tr1;
7
+
4
8
 
5
9
  namespace Preprocessing {
6
10
  namespace Text {
@@ -9,8 +13,10 @@ namespace Preprocessing {
9
13
  public:
10
14
  static const uint32_t file_mark = 'stop';
11
15
  uint32_t mark() { return file_mark; }
16
+ unordered_set<string> stop_words;
12
17
 
13
18
  StopWords();
19
+ ~StopWords() {}
14
20
  bool select(char *start, char *end);
15
21
  };
16
22
 
@@ -7,6 +7,7 @@ namespace Preprocessing {
7
7
 
8
8
  class TokenSelector {
9
9
  public:
10
+ virtual ~TokenSelector() {}
10
11
  virtual bool select(char *start, char *end) { return true; }
11
12
  virtual uint32_t mark() = 0;
12
13
  };
@@ -9,6 +9,7 @@ namespace Preprocessing {
9
9
  public:
10
10
  TextPipeline *pipeline;
11
11
  Tokeniser(TextPipeline *pipeline) : pipeline(pipeline) {}
12
+ virtual ~Tokeniser() {}
12
13
  virtual void tokenise(char *text) {}
13
14
  virtual uint32_t mark() = 0;
14
15
  };
@@ -104,16 +104,26 @@ DataSet::DataSet *Storage::Binary::read_data_set() {
104
104
 
105
105
  // read cached frequencies and probabilities if present
106
106
  if(data_set->counted) {
107
- nominal_feature->frequencies = *read_vector<int>();
108
- nominal_feature->probabilities = *read_vector<double>();
107
+ vector<int> *frequencies = read_vector<int>();
108
+ vector<double> *probabilities = read_vector<double>();
109
+ nominal_feature->frequencies = *frequencies;
110
+ nominal_feature->probabilities = *probabilities;
109
111
  nominal_feature->category_frequencies.resize(num_categories + 1);
110
112
  nominal_feature->category_probabilities.resize(num_categories + 1);
113
+ delete frequencies;
114
+ delete probabilities;
111
115
 
112
- for(int i = 1; i <= num_categories; i++)
113
- nominal_feature->category_frequencies[i] = *read_vector<int>();
116
+ for(int i = 1; i <= num_categories; i++) {
117
+ frequencies = read_vector<int>();
118
+ nominal_feature->category_frequencies[i] = *frequencies;
119
+ delete frequencies;
120
+ }
114
121
 
115
- for(int i = 1; i <= num_categories; i++)
116
- nominal_feature->category_probabilities[i] = *read_vector<double>();
122
+ for(int i = 1; i <= num_categories; i++) {
123
+ probabilities = read_vector<double>();
124
+ nominal_feature->category_probabilities[i] = *probabilities;
125
+ delete probabilities;
126
+ }
117
127
  }
118
128
 
119
129
  // TODO: read cached indexes
@@ -55,6 +55,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
55
55
  // insert a new example into the dataset
56
56
  example = pipeline->process_text(data_set, file_data, true);
57
57
  example->set_category_index(data_set, category_index);
58
+ data_set->examples.push_back(example);
58
59
 
59
60
  file_count++;
60
61
  if((file_count % 10000) == 0)
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.description = "C++ Data Mining Library for Ruby"
10
10
  s.email = "me@willcannings.com"
11
11
  s.authors = ["Will Cannings"]
12
- s.version = '0.0.7'
12
+ s.version = '0.0.8'
13
13
  s.extensions = ["ext/extconf.rb"]
14
14
 
15
15
  s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: thera
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-12-22 00:00:00.000000000Z
12
+ date: 2012-01-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rice
16
- requirement: &70195705116960 !ruby/object:Gem::Requirement
16
+ requirement: &70151526602040 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70195705116960
24
+ version_requirements: *70151526602040
25
25
  description: C++ Data Mining Library for Ruby
26
26
  email: me@willcannings.com
27
27
  executables: []