thera 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -60,7 +60,7 @@ extern "C" {
60
60
  rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
61
61
  Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
62
62
  .define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
63
- // .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
63
+ //.define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
64
64
 
65
65
  // storage
66
66
  Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
@@ -90,6 +90,8 @@ extern "C" {
90
90
  .define_method("train_text", &Model::Model::train_text)
91
91
  .define_method("classify", &Model::Model::classify)
92
92
  .define_method("classify_text", &Model::Model::classify_text)
93
+ .define_method("process_text", &Model::Model::process_text)
94
+ .define_method("add_text_example", &Model::Model::add_text_example)
93
95
  .define_method("set_data_set", &Model::Model::set_data_set)
94
96
  .define_method("get_data_set", &Model::Model::get_data_set)
95
97
  .define_method("set_classifier", &Model::Model::set_classifier)
@@ -110,6 +112,7 @@ extern "C" {
110
112
 
111
113
  Data_Type<DataSet::Example> rb_cDataSetExample = define_class_under<DataSet::Example>(rb_mDataSet, "ImplExample")
112
114
  .define_method("category_index", &DataSet::Example::category_index)
115
+ .define_method("set_category_index", &DataSet::Example::set_category_index)
113
116
  .define_method("get_value", &DataSet::Example::get_value)
114
117
  .define_method("set_value", &DataSet::Example::set_value)
115
118
  .define_constructor(Constructor<DataSet::Example, int>());
@@ -21,6 +21,7 @@ void Classifier::NaiveBayesClassifier::prepare() {
21
21
  feature_caches[i].resize(features_size);
22
22
 
23
23
  for(int j = 0; j < features_size; j++) {
24
+ // FIXME: need to reset numeric_features[j] here in case feature j was numeric on an earlier call to prepare() but no longer is on a later call
24
25
  if(!numeric_features[j])
25
26
  continue;
26
27
  feature = (DataSet::NumericFeature *) data_set->features[j];
@@ -62,12 +63,12 @@ double Classifier::NaiveBayesClassifier::score(int category, DataSet::Example *e
62
63
  }
63
64
 
64
65
  void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
65
- int category_count = feature_caches.size();
66
+ int category_count = feature_caches.size() - 1;
66
67
  file->write_int(category_count);
67
68
  file->write_vector<double>(&category_probabilities);
68
69
 
69
70
  for(int i = 1; i <= category_count; i++)
70
- file->write_vector<NumericFeatureCache>(&feature_caches[i]);
71
+ file->write_vector<NumericFeatureCache>(&(feature_caches[i]));
71
72
  }
72
73
 
73
74
  void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
@@ -6,6 +6,15 @@ void Model::Model::train(DataSet::Example *example) {
6
6
  void Model::Model::train_text(string text) {
7
7
  }
8
8
 
9
+ DataSet::Example *Model::Model::process_text(string text, bool create_features) {
10
+ return text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), create_features);
11
+ }
12
+
13
+ void Model::Model::add_text_example(string text, string category) {
14
+ DataSet::Example *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), true);
15
+ example->set_category_index(data_set, data_set->category_feature()->value_index(category));
16
+ }
17
+
9
18
  int Model::Model::classify(DataSet::Example *example) {
10
19
  return classifier->classify(example);
11
20
  }
@@ -16,6 +16,8 @@ namespace Model {
16
16
 
17
17
  void train(DataSet::Example *example);
18
18
  void train_text(string text);
19
+ DataSet::Example *process_text(string text, bool create_features);
20
+ void add_text_example(string text, string category);
19
21
  int classify(DataSet::Example *example);
20
22
  int classify_text(string text);
21
23
  vector<Classifier::Score> *rank(DataSet::Example *example);
@@ -448,7 +448,6 @@ Model::Model *Storage::Binary::read_model() {
448
448
  void Storage::Binary::write_model(Model::Model *model) {
449
449
  open_for_writing();
450
450
 
451
- // write the 3 model components
452
451
  write_data_set(model->data_set);
453
452
  write_classifier(model->classifier);
454
453
  write_text_pipeline(model->text_pipeline);
@@ -15,15 +15,15 @@ module Quarry
15
15
  end
16
16
 
17
17
  def examples
18
- @examples ||= EnumerableHelper.new(self, @data_set, Example, :examples_size, :get_example_by_index)
18
+ @examples ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Example, :examples_size, :get_example_by_index)
19
19
  end
20
20
 
21
21
  def categories
22
- @categories ||= EnumerableHelper.new(self, @data_set, Category, :categories_size, :get_category_by_index)
22
+ @categories ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Category, :categories_size, :get_category_by_index)
23
23
  end
24
24
 
25
25
  def features
26
- @features ||= EnumerableHelper.new(self, @data_set, Feature, :features_size, :get_feature_by_index)
26
+ @features ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Feature, :features_size, :get_feature_by_index)
27
27
  end
28
28
 
29
29
  def stratify(classifier, folds, skip_fold)
@@ -25,8 +25,7 @@ module Quarry
25
25
  end
26
26
 
27
27
  def category=(new_category)
28
- raise "new_category must be an instance of Quarry::DataSet::Category" unless new_category.is_a?(Category)
29
- @example.set_category(new_category.category)
28
+ @example.set_category_index(@data_set.data_set, new_category)
30
29
  end
31
30
  end
32
31
  end
@@ -6,11 +6,12 @@ module Quarry
6
6
  end
7
7
 
8
8
  def data_set
9
- Quarry::DataSet::DataSet.new(@model.get_data_set)
9
+ @data_set ||= Quarry::DataSet::DataSet.new(@model.get_data_set)
10
10
  end
11
11
 
12
12
  def data_set=(ds)
13
13
  @model.set_data_set(ds.data_set)
14
+ @data_set = ds
14
15
  end
15
16
 
16
17
  # def classifier
@@ -22,11 +23,12 @@ module Quarry
22
23
  end
23
24
 
24
25
  def text_pipeline
25
- TextPipeline.new(@model.get_text_pipeline)
26
+ @text_pipeline ||= TextPipeline.new(@model.get_text_pipeline)
26
27
  end
27
28
 
28
29
  def text_pipeline=(t)
29
30
  @model.set_text_pipeline(t.text_pipeline)
31
+ @text_pipeline = t
30
32
  end
31
33
 
32
34
  def train(example)
@@ -45,6 +47,14 @@ module Quarry
45
47
  @model.classify_text(text)
46
48
  end
47
49
 
50
+ def process_text(text, create_features = true)
51
+ DataSet::Example.new(@model.process_text(text, create_features), data_set)
52
+ end
53
+
54
+ def add_text_example(text, category_name)
55
+ @model.add_text_example(text, category_name)
56
+ end
57
+
48
58
  def rank(example)
49
59
  @model.rank(example)
50
60
  end
@@ -6,7 +6,7 @@ module Quarry
6
6
  end
7
7
 
8
8
  def process_text(data_set, text, create_features = false)
9
- Example.new(@text_pipeline.process_text(data_set, text, create_features))
9
+ ::Quarry::DataSet::Example.new(@text_pipeline.process_text(data_set.data_set, text, create_features))
10
10
  end
11
11
 
12
12
  def self.standard_pipeline
data/thera.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
9
9
  s.description = "C++ Data Mining Library for Ruby"
10
10
  s.email = "me@willcannings.com"
11
11
  s.authors = ["Will Cannings"]
12
- s.version = '0.0.2'
12
+ s.version = '0.0.3'
13
13
  s.extensions = ["ext/extconf.rb"]
14
14
 
15
15
  s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: thera
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rice
16
- requirement: &70171050802120 !ruby/object:Gem::Requirement
16
+ requirement: &70129254365400 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70171050802120
24
+ version_requirements: *70129254365400
25
25
  description: C++ Data Mining Library for Ruby
26
26
  email: me@willcannings.com
27
27
  executables: []