thera 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,20 @@ Object model_rank_text(Object self, Object text) {
31
31
  return indexes;
32
32
  }
33
33
 
34
+ Object model_rank_text_names(Object self, Object text) {
35
+ Model::Model *model = from_ruby<Model::Model *>(self);
36
+ string example_text = from_ruby<string>(text);
37
+ Array names;
38
+
39
+ vector<Classifier::Score> *ranks = model->rank_text(example_text);
40
+ DataSet::NominalFeature *categories = model->data_set->category_feature();
41
+ for(unsigned int i = 0; i < ranks->size(); i++)
42
+ names.push(categories->names[ranks->at(i).category]);
43
+
44
+ delete ranks;
45
+ return names;
46
+ }
47
+
34
48
 
35
49
  extern "C" {
36
50
 
@@ -45,8 +59,8 @@ extern "C" {
45
59
  // text pipeline
46
60
  rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
47
61
  Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
48
- .define_constructor(Constructor<Preprocessing::Text::TextPipeline>())
49
- .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
62
+ .define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
63
+ // .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
50
64
 
51
65
  // storage
52
66
  Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
@@ -83,7 +97,8 @@ extern "C" {
83
97
  .define_method("set_text_pipeline", &Model::Model::set_text_pipeline)
84
98
  .define_method("get_text_pipeline", &Model::Model::get_text_pipeline)
85
99
  .define_method("rank", &model_rank)
86
- .define_method("rank_text", &model_rank_text);
100
+ .define_method("rank_text", &model_rank_text)
101
+ .define_method("rank_text_names", &model_rank_text_names);
87
102
 
88
103
 
89
104
 
@@ -11,7 +11,7 @@ int Model::Model::classify(DataSet::Example *example) {
11
11
  }
12
12
 
13
13
  int Model::Model::classify_text(string text) {
14
- DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
14
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
15
15
  int category = classifier->classify(example);
16
16
  delete example;
17
17
  return category;
@@ -22,7 +22,7 @@ vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
22
22
  }
23
23
 
24
24
  vector<Classifier::Score> *Model::Model::rank_text(string text) {
25
- DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
25
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
26
26
  vector<Classifier::Score> *ranks = classifier->rank(example);
27
27
  delete example;
28
28
  return ranks;
@@ -9,7 +9,7 @@ namespace Preprocessing {
9
9
  class ExampleGenerator {
10
10
  public:
11
11
  ExampleGenerator() {}
12
- virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) { return NULL; }
12
+ virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
13
13
  virtual uint32_t mark() = 0;
14
14
  };
15
15
 
@@ -22,7 +22,7 @@ namespace Preprocessing {
22
22
 
23
23
  TokenCounter(TokenCounterWeight weight = Count) : ExampleGenerator(), token_counts(), weight(weight) {}
24
24
 
25
- DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) {
25
+ DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) {
26
26
  int max_count = 0, count = 0;
27
27
  double value = 0.0;
28
28
  token_counts.clear();
@@ -38,6 +38,8 @@ namespace Preprocessing {
38
38
 
39
39
  // construct the example
40
40
  DataSet::SparseExample *example = data_set->new_example(token_counts.size());
41
+ DataSet::Feature *feature = NULL;
42
+
41
43
  for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
42
44
  value = token_counts_it->second;
43
45
 
@@ -46,7 +48,13 @@ namespace Preprocessing {
46
48
  else if(weight == Binary)
47
49
  value = 1;
48
50
 
49
- example->set_value(data_set->get_or_create_numeric_feature_by_name(token_counts_it->first)->index, value);
51
+ if(create_features) {
52
+ example->set_value(data_set->get_or_create_numeric_feature_by_name(token_counts_it->first)->index, value);
53
+ } else {
54
+ feature = data_set->get_feature_by_name(token_counts_it->first);
55
+ if(feature)
56
+ example->set_value(feature->index, value);
57
+ }
50
58
  }
51
59
 
52
60
  return example;
@@ -1,10 +1,10 @@
1
1
  #include "text_pipeline.h"
2
2
  #include <iostream>
3
3
 
4
- DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text) {
4
+ DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features) {
5
5
  tokens.clear();
6
6
  tokeniser->tokenise(text);
7
- return generator->generate(data_set, &tokens);
7
+ return generator->generate(data_set, &tokens, create_features);
8
8
  }
9
9
 
10
10
  void Preprocessing::Text::TextPipeline::process_token(char *start, char *end) {
@@ -26,7 +26,7 @@ namespace Preprocessing {
26
26
  vector<char *> tokens;
27
27
 
28
28
  TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
29
- DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text);
29
+ DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
30
30
  void process_token(char *start, char *end);
31
31
  };
32
32
 
@@ -52,7 +52,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
52
52
  fclose(file);
53
53
 
54
54
  // insert a new example into the dataset
55
- example = pipeline->process_text(data_set, file_data);
55
+ example = pipeline->process_text(data_set, file_data, true);
56
56
  example->set_category_index(data_set, category_index);
57
57
 
58
58
  file_count++;
@@ -52,5 +52,9 @@ module Quarry
52
52
  def rank_text(text)
53
53
  @model.rank_text(text)
54
54
  end
55
+
56
+ def rank_text_names(text)
57
+ @model.rank_text_names(text)
58
+ end
55
59
  end
56
60
  end
@@ -5,8 +5,8 @@ module Quarry
5
5
  @text_pipeline = tp || Quarry::ImplTextPipeline.new
6
6
  end
7
7
 
8
- def process_text(data_set, text)
9
- Example.new(@text_pipeline.process_text(data_set, text))
8
+ def process_text(data_set, text, create_features = false)
9
+ Example.new(@text_pipeline.process_text(data_set, text, create_features))
10
10
  end
11
11
 
12
12
  def self.standard_pipeline
data/thera.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.description = "C++ Data Mining Library for Ruby"
10
10
  s.email = "me@willcannings.com"
11
11
  s.authors = ["Will Cannings"]
12
- s.version = '0.0.1'
12
+ s.version = '0.0.2'
13
13
  s.extensions = ["ext/extconf.rb"]
14
14
 
15
15
  s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: thera
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rice
16
- requirement: &70234812248380 !ruby/object:Gem::Requirement
16
+ requirement: &70171050802120 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70234812248380
24
+ version_requirements: *70171050802120
25
25
  description: C++ Data Mining Library for Ruby
26
26
  email: me@willcannings.com
27
27
  executables: []