thera 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -31,6 +31,20 @@ Object model_rank_text(Object self, Object text) {
31
31
  return indexes;
32
32
  }
33
33
 
34
+ Object model_rank_text_names(Object self, Object text) {
35
+ Model::Model *model = from_ruby<Model::Model *>(self);
36
+ string example_text = from_ruby<string>(text);
37
+ Array names;
38
+
39
+ vector<Classifier::Score> *ranks = model->rank_text(example_text);
40
+ DataSet::NominalFeature *categories = model->data_set->category_feature();
41
+ for(unsigned int i = 0; i < ranks->size(); i++)
42
+ names.push(categories->names[ranks->at(i).category]);
43
+
44
+ delete ranks;
45
+ return names;
46
+ }
47
+
34
48
 
35
49
  extern "C" {
36
50
 
@@ -45,8 +59,8 @@ extern "C" {
45
59
  // text pipeline
46
60
  rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
47
61
  Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
48
- .define_constructor(Constructor<Preprocessing::Text::TextPipeline>())
49
- .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
62
+ .define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
63
+ // .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
50
64
 
51
65
  // storage
52
66
  Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
@@ -83,7 +97,8 @@ extern "C" {
83
97
  .define_method("set_text_pipeline", &Model::Model::set_text_pipeline)
84
98
  .define_method("get_text_pipeline", &Model::Model::get_text_pipeline)
85
99
  .define_method("rank", &model_rank)
86
- .define_method("rank_text", &model_rank_text);
100
+ .define_method("rank_text", &model_rank_text)
101
+ .define_method("rank_text_names", &model_rank_text_names);
87
102
 
88
103
 
89
104
 
@@ -11,7 +11,7 @@ int Model::Model::classify(DataSet::Example *example) {
11
11
  }
12
12
 
13
13
  int Model::Model::classify_text(string text) {
14
- DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
14
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
15
15
  int category = classifier->classify(example);
16
16
  delete example;
17
17
  return category;
@@ -22,7 +22,7 @@ vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
22
22
  }
23
23
 
24
24
  vector<Classifier::Score> *Model::Model::rank_text(string text) {
25
- DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
25
+ DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
26
26
  vector<Classifier::Score> *ranks = classifier->rank(example);
27
27
  delete example;
28
28
  return ranks;
@@ -9,7 +9,7 @@ namespace Preprocessing {
9
9
  class ExampleGenerator {
10
10
  public:
11
11
  ExampleGenerator() {}
12
- virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) { return NULL; }
12
+ virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
13
13
  virtual uint32_t mark() = 0;
14
14
  };
15
15
 
@@ -22,7 +22,7 @@ namespace Preprocessing {
22
22
 
23
23
  TokenCounter(TokenCounterWeight weight = Count) : ExampleGenerator(), token_counts(), weight(weight) {}
24
24
 
25
- DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) {
25
+ DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) {
26
26
  int max_count = 0, count = 0;
27
27
  double value = 0.0;
28
28
  token_counts.clear();
@@ -38,6 +38,8 @@ namespace Preprocessing {
38
38
 
39
39
  // construct the example
40
40
  DataSet::SparseExample *example = data_set->new_example(token_counts.size());
41
+ DataSet::Feature *feature = NULL;
42
+
41
43
  for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
42
44
  value = token_counts_it->second;
43
45
 
@@ -46,7 +48,13 @@ namespace Preprocessing {
46
48
  else if(weight == Binary)
47
49
  value = 1;
48
50
 
49
- example->set_value(data_set->get_or_create_numeric_feature_by_name(token_counts_it->first)->index, value);
51
+ if(create_features) {
52
+ example->set_value(data_set->get_or_create_numeric_feature_by_name(token_counts_it->first)->index, value);
53
+ } else {
54
+ feature = data_set->get_feature_by_name(token_counts_it->first);
55
+ if(feature)
56
+ example->set_value(feature->index, value);
57
+ }
50
58
  }
51
59
 
52
60
  return example;
@@ -1,10 +1,10 @@
1
1
  #include "text_pipeline.h"
2
2
  #include <iostream>
3
3
 
4
- DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text) {
4
+ DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features) {
5
5
  tokens.clear();
6
6
  tokeniser->tokenise(text);
7
- return generator->generate(data_set, &tokens);
7
+ return generator->generate(data_set, &tokens, create_features);
8
8
  }
9
9
 
10
10
  void Preprocessing::Text::TextPipeline::process_token(char *start, char *end) {
@@ -26,7 +26,7 @@ namespace Preprocessing {
26
26
  vector<char *> tokens;
27
27
 
28
28
  TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
29
- DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text);
29
+ DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
30
30
  void process_token(char *start, char *end);
31
31
  };
32
32
 
@@ -52,7 +52,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
52
52
  fclose(file);
53
53
 
54
54
  // insert a new example into the dataset
55
- example = pipeline->process_text(data_set, file_data);
55
+ example = pipeline->process_text(data_set, file_data, true);
56
56
  example->set_category_index(data_set, category_index);
57
57
 
58
58
  file_count++;
@@ -52,5 +52,9 @@ module Quarry
52
52
  def rank_text(text)
53
53
  @model.rank_text(text)
54
54
  end
55
+
56
+ def rank_text_names(text)
57
+ @model.rank_text_names(text)
58
+ end
55
59
  end
56
60
  end
@@ -5,8 +5,8 @@ module Quarry
5
5
  @text_pipeline = tp || Quarry::ImplTextPipeline.new
6
6
  end
7
7
 
8
- def process_text(data_set, text)
9
- Example.new(@text_pipeline.process_text(data_set, text))
8
+ def process_text(data_set, text, create_features = false)
9
+ Example.new(@text_pipeline.process_text(data_set, text, create_features))
10
10
  end
11
11
 
12
12
  def self.standard_pipeline
data/thera.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.description = "C++ Data Mining Library for Ruby"
10
10
  s.email = "me@willcannings.com"
11
11
  s.authors = ["Will Cannings"]
12
- s.version = '0.0.1'
12
+ s.version = '0.0.2'
13
13
  s.extensions = ["ext/extconf.rb"]
14
14
 
15
15
  s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: thera
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rice
16
- requirement: &70234812248380 !ruby/object:Gem::Requirement
16
+ requirement: &70171050802120 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70234812248380
24
+ version_requirements: *70171050802120
25
25
  description: C++ Data Mining Library for Ruby
26
26
  email: me@willcannings.com
27
27
  executables: []