thera 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -60,7 +60,7 @@ extern "C" {
  rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
  Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
  .define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
- // .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
+ //.define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);

  // storage
  Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
@@ -90,6 +90,8 @@ extern "C" {
  .define_method("train_text", &Model::Model::train_text)
  .define_method("classify", &Model::Model::classify)
  .define_method("classify_text", &Model::Model::classify_text)
+ .define_method("process_text", &Model::Model::process_text)
+ .define_method("add_text_example", &Model::Model::add_text_example)
  .define_method("set_data_set", &Model::Model::set_data_set)
  .define_method("get_data_set", &Model::Model::get_data_set)
  .define_method("set_classifier", &Model::Model::set_classifier)
@@ -110,6 +112,7 @@ extern "C" {

  Data_Type<DataSet::Example> rb_cDataSetExample = define_class_under<DataSet::Example>(rb_mDataSet, "ImplExample")
  .define_method("category_index", &DataSet::Example::category_index)
+ .define_method("set_category_index", &DataSet::Example::set_category_index)
  .define_method("get_value", &DataSet::Example::get_value)
  .define_method("set_value", &DataSet::Example::set_value)
  .define_constructor(Constructor<DataSet::Example, int>());
@@ -21,6 +21,7 @@ void Classifier::NaiveBayesClassifier::prepare() {
  feature_caches[i].resize(features_size);

  for(int j = 0; j < features_size; j++) {
+ // FIXME: need to wipe numeric_features[j] here in case category j was, now isn't numeric on a second call to prepare
  if(!numeric_features[j])
  continue;
  feature = (DataSet::NumericFeature *) data_set->features[j];
@@ -62,12 +63,12 @@ double Classifier::NaiveBayesClassifier::score(int category, DataSet::Example *e
  }

  void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
- int category_count = feature_caches.size();
+ int category_count = feature_caches.size() - 1;
  file->write_int(category_count);
  file->write_vector<double>(&category_probabilities);

  for(int i = 1; i <= category_count; i++)
- file->write_vector<NumericFeatureCache>(&feature_caches[i]);
+ file->write_vector<NumericFeatureCache>(&(feature_caches[i]));
  }

  void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
@@ -6,6 +6,15 @@ void Model::Model::train(DataSet::Example *example) {
  void Model::Model::train_text(string text) {
  }

+ DataSet::Example *Model::Model::process_text(string text, bool create_features) {
+ return text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), create_features);
+ }
+
+ void Model::Model::add_text_example(string text, string category) {
+ DataSet::Example *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), true);
+ example->set_category_index(data_set, data_set->category_feature()->value_index(category));
+ }
+
  int Model::Model::classify(DataSet::Example *example) {
  return classifier->classify(example);
  }
@@ -16,6 +16,8 @@ namespace Model {

  void train(DataSet::Example *example);
  void train_text(string text);
+ DataSet::Example *process_text(string text, bool create_features);
+ void add_text_example(string text, string category);
  int classify(DataSet::Example *example);
  int classify_text(string text);
  vector<Classifier::Score> *rank(DataSet::Example *example);
@@ -448,7 +448,6 @@ Model::Model *Storage::Binary::read_model() {
  void Storage::Binary::write_model(Model::Model *model) {
  open_for_writing();

- // write the 3 model components
  write_data_set(model->data_set);
  write_classifier(model->classifier);
  write_text_pipeline(model->text_pipeline);
@@ -15,15 +15,15 @@ module Quarry
  end

  def examples
- @examples ||= EnumerableHelper.new(self, @data_set, Example, :examples_size, :get_example_by_index)
+ @examples ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Example, :examples_size, :get_example_by_index)
  end

  def categories
- @categories ||= EnumerableHelper.new(self, @data_set, Category, :categories_size, :get_category_by_index)
+ @categories ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Category, :categories_size, :get_category_by_index)
  end

  def features
- @features ||= EnumerableHelper.new(self, @data_set, Feature, :features_size, :get_feature_by_index)
+ @features ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Feature, :features_size, :get_feature_by_index)
  end

  def stratify(classifier, folds, skip_fold)
@@ -25,8 +25,7 @@ module Quarry
  end

  def category=(new_category)
- raise "new_category must be an instance of Quarry::DataSet::Category" unless new_category.is_a?(Category)
- @example.set_category(new_category.category)
+ @example.set_category_index(@data_set.data_set, new_category)
  end
  end
  end
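A note on the category= change above: the setter no longer accepts a Quarry::DataSet::Category wrapper and instead forwards its argument straight to the new set_category_index binding along with the underlying data set. A minimal sketch of the new call, assuming the argument is now a numeric category value index and that example was obtained from a Quarry::DataSet::DataSet (both assumptions; this usage is not shown in the diff):

  # hypothetical usage of the new setter; 2 stands in for a category value index
  example.category = 2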
@@ -6,11 +6,12 @@ module Quarry
  end

  def data_set
- Quarry::DataSet::DataSet.new(@model.get_data_set)
+ @data_set ||= Quarry::DataSet::DataSet.new(@model.get_data_set)
  end

  def data_set=(ds)
  @model.set_data_set(ds.data_set)
+ @data_set = ds
  end

  # def classifier
@@ -22,11 +23,12 @@ module Quarry
  end

  def text_pipeline
- TextPipeline.new(@model.get_text_pipeline)
+ @text_pipeline ||= TextPipeline.new(@model.get_text_pipeline)
  end

  def text_pipeline=(t)
  @model.set_text_pipeline(t.text_pipeline)
+ @text_pipeline = t
  end

  def train(example)
@@ -45,6 +47,14 @@ module Quarry
  @model.classify_text(text)
  end

+ def process_text(text, create_features = true)
+ DataSet::Example.new(@model.process_text(text, create_features), data_set)
+ end
+
+ def add_text_example(text, category_name)
+ @model.add_text_example(text, category_name)
+ end
+
  def rank(example)
  @model.rank(example)
  end
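Together with the binding and C++ changes earlier in this diff, these two wrappers are the user-facing additions in 0.0.3: process_text turns a string into a wrapped DataSet::Example without assigning a category, while add_text_example processes a string and files it under a named category in one call. A minimal usage sketch, assuming the surrounding wrapper class is reachable as Quarry::Model (a hypothetical name; its definition and constructor are outside this diff) and that the data set's category feature already knows the given category name:

  model = Quarry::Model.new                              # assumed constructor, not shown in this diff
  model.add_text_example("cheap pills, buy now!", "spam")
  example = model.process_text("lunch tomorrow?")        # returns a wrapped DataSet::Example
  puts model.classify_text("free offer inside")          # classify_text wrapper appears earlier in this file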
@@ -6,7 +6,7 @@ module Quarry
  end

  def process_text(data_set, text, create_features = false)
- Example.new(@text_pipeline.process_text(data_set, text, create_features))
+ ::Quarry::DataSet::Example.new(@text_pipeline.process_text(data_set.data_set, text, create_features))
  end

  def self.standard_pipeline
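The wrapper change above means TextPipeline#process_text now expects the high-level data set wrapper (it unwraps data_set.data_set itself) and returns a fully qualified ::Quarry::DataSet::Example. A short sketch of calling it directly, assuming model is the model wrapper from the previous file and using its text_pipeline and data_set readers shown earlier in this diff:

  pipeline = model.text_pipeline
  example  = pipeline.process_text(model.data_set, "some document text", true)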
data/thera.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
  s.description = "C++ Data Mining Library for Ruby"
  s.email = "me@willcannings.com"
  s.authors = ["Will Cannings"]
- s.version = '0.0.2'
+ s.version = '0.0.3'
  s.extensions = ["ext/extconf.rb"]

  s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: thera
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  prerelease:
  platform: ruby
  authors:
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rice
- requirement: &70171050802120 !ruby/object:Gem::Requirement
+ requirement: &70129254365400 !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
  version: '0'
  type: :runtime
  prerelease: false
- version_requirements: *70171050802120
+ version_requirements: *70129254365400
  description: C++ Data Mining Library for Ruby
  email: me@willcannings.com
  executables: []