thera 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/quarry/quarry_toolkit.cpp +4 -1
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +3 -2
- data/lib/quarry/src/model/model.cpp +9 -0
- data/lib/quarry/src/model/model.h +2 -0
- data/lib/quarry/src/storage/binary.cpp +0 -1
- data/lib/quarry_rb/data_set/data_set.rb +3 -3
- data/lib/quarry_rb/data_set/example.rb +1 -2
- data/lib/quarry_rb/model/model.rb +12 -2
- data/lib/quarry_rb/text_pipeline.rb +1 -1
- data/thera.gemspec +1 -1
- metadata +3 -3
@@ -60,7 +60,7 @@ extern "C" {
|
|
60
60
|
rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
|
61
61
|
Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
|
62
62
|
.define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
|
63
|
-
|
63
|
+
//.define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
|
64
64
|
|
65
65
|
// storage
|
66
66
|
Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
|
@@ -90,6 +90,8 @@ extern "C" {
|
|
90
90
|
.define_method("train_text", &Model::Model::train_text)
|
91
91
|
.define_method("classify", &Model::Model::classify)
|
92
92
|
.define_method("classify_text", &Model::Model::classify_text)
|
93
|
+
.define_method("process_text", &Model::Model::process_text)
|
94
|
+
.define_method("add_text_example", &Model::Model::add_text_example)
|
93
95
|
.define_method("set_data_set", &Model::Model::set_data_set)
|
94
96
|
.define_method("get_data_set", &Model::Model::get_data_set)
|
95
97
|
.define_method("set_classifier", &Model::Model::set_classifier)
|
@@ -110,6 +112,7 @@ extern "C" {
|
|
110
112
|
|
111
113
|
Data_Type<DataSet::Example> rb_cDataSetExample = define_class_under<DataSet::Example>(rb_mDataSet, "ImplExample")
|
112
114
|
.define_method("category_index", &DataSet::Example::category_index)
|
115
|
+
.define_method("set_category_index", &DataSet::Example::set_category_index)
|
113
116
|
.define_method("get_value", &DataSet::Example::get_value)
|
114
117
|
.define_method("set_value", &DataSet::Example::set_value)
|
115
118
|
.define_constructor(Constructor<DataSet::Example, int>());
|
@@ -21,6 +21,7 @@ void Classifier::NaiveBayesClassifier::prepare() {
|
|
21
21
|
feature_caches[i].resize(features_size);
|
22
22
|
|
23
23
|
for(int j = 0; j < features_size; j++) {
|
24
|
+
// FIXME: need to wipe numeric_features[j] here in case category j was, now isn't numeric on a second call to prepare
|
24
25
|
if(!numeric_features[j])
|
25
26
|
continue;
|
26
27
|
feature = (DataSet::NumericFeature *) data_set->features[j];
|
@@ -62,12 +63,12 @@ double Classifier::NaiveBayesClassifier::score(int category, DataSet::Example *e
|
|
62
63
|
}
|
63
64
|
|
64
65
|
void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
|
65
|
-
int category_count = feature_caches.size();
|
66
|
+
int category_count = feature_caches.size() - 1;
|
66
67
|
file->write_int(category_count);
|
67
68
|
file->write_vector<double>(&category_probabilities);
|
68
69
|
|
69
70
|
for(int i = 1; i <= category_count; i++)
|
70
|
-
file->write_vector<NumericFeatureCache>(&feature_caches[i]);
|
71
|
+
file->write_vector<NumericFeatureCache>(&(feature_caches[i]));
|
71
72
|
}
|
72
73
|
|
73
74
|
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
|
@@ -6,6 +6,15 @@ void Model::Model::train(DataSet::Example *example) {
|
|
6
6
|
void Model::Model::train_text(string text) {
|
7
7
|
}
|
8
8
|
|
9
|
+
DataSet::Example *Model::Model::process_text(string text, bool create_features) {
|
10
|
+
return text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), create_features);
|
11
|
+
}
|
12
|
+
|
13
|
+
void Model::Model::add_text_example(string text, string category) {
|
14
|
+
DataSet::Example *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), true);
|
15
|
+
example->set_category_index(data_set, data_set->category_feature()->value_index(category));
|
16
|
+
}
|
17
|
+
|
9
18
|
int Model::Model::classify(DataSet::Example *example) {
|
10
19
|
return classifier->classify(example);
|
11
20
|
}
|
@@ -16,6 +16,8 @@ namespace Model {
|
|
16
16
|
|
17
17
|
void train(DataSet::Example *example);
|
18
18
|
void train_text(string text);
|
19
|
+
DataSet::Example *process_text(string text, bool create_features);
|
20
|
+
void add_text_example(string text, string category);
|
19
21
|
int classify(DataSet::Example *example);
|
20
22
|
int classify_text(string text);
|
21
23
|
vector<Classifier::Score> *rank(DataSet::Example *example);
|
@@ -448,7 +448,6 @@ Model::Model *Storage::Binary::read_model() {
|
|
448
448
|
void Storage::Binary::write_model(Model::Model *model) {
|
449
449
|
open_for_writing();
|
450
450
|
|
451
|
-
// write the 3 model components
|
452
451
|
write_data_set(model->data_set);
|
453
452
|
write_classifier(model->classifier);
|
454
453
|
write_text_pipeline(model->text_pipeline);
|
@@ -15,15 +15,15 @@ module Quarry
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def examples
|
18
|
-
@examples ||= EnumerableHelper.new(self, @data_set, Example, :examples_size, :get_example_by_index)
|
18
|
+
@examples ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Example, :examples_size, :get_example_by_index)
|
19
19
|
end
|
20
20
|
|
21
21
|
def categories
|
22
|
-
@categories ||= EnumerableHelper.new(self, @data_set, Category, :categories_size, :get_category_by_index)
|
22
|
+
@categories ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Category, :categories_size, :get_category_by_index)
|
23
23
|
end
|
24
24
|
|
25
25
|
def features
|
26
|
-
@features ||= EnumerableHelper.new(self, @data_set, Feature, :features_size, :get_feature_by_index)
|
26
|
+
@features ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Feature, :features_size, :get_feature_by_index)
|
27
27
|
end
|
28
28
|
|
29
29
|
def stratify(classifier, folds, skip_fold)
|
@@ -25,8 +25,7 @@ module Quarry
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def category=(new_category)
|
28
|
-
|
29
|
-
@example.set_category(new_category.category)
|
28
|
+
@example.set_category_index(@data_set.data_set, new_category)
|
30
29
|
end
|
31
30
|
end
|
32
31
|
end
|
@@ -6,11 +6,12 @@ module Quarry
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def data_set
|
9
|
-
Quarry::DataSet::DataSet.new(@model.get_data_set)
|
9
|
+
@data_set ||= Quarry::DataSet::DataSet.new(@model.get_data_set)
|
10
10
|
end
|
11
11
|
|
12
12
|
def data_set=(ds)
|
13
13
|
@model.set_data_set(ds.data_set)
|
14
|
+
@data_set = ds
|
14
15
|
end
|
15
16
|
|
16
17
|
# def classifier
|
@@ -22,11 +23,12 @@ module Quarry
|
|
22
23
|
end
|
23
24
|
|
24
25
|
def text_pipeline
|
25
|
-
TextPipeline.new(@model.get_text_pipeline)
|
26
|
+
@text_pipeline ||= TextPipeline.new(@model.get_text_pipeline)
|
26
27
|
end
|
27
28
|
|
28
29
|
def text_pipeline=(t)
|
29
30
|
@model.set_text_pipeline(t.text_pipeline)
|
31
|
+
@text_pipeline = t
|
30
32
|
end
|
31
33
|
|
32
34
|
def train(example)
|
@@ -45,6 +47,14 @@ module Quarry
|
|
45
47
|
@model.classify_text(text)
|
46
48
|
end
|
47
49
|
|
50
|
+
def process_text(text, create_features = true)
|
51
|
+
DataSet::Example.new(@model.process_text(text, create_features), data_set)
|
52
|
+
end
|
53
|
+
|
54
|
+
def add_text_example(text, category_name)
|
55
|
+
@model.add_text_example(text, category_name)
|
56
|
+
end
|
57
|
+
|
48
58
|
def rank(example)
|
49
59
|
@model.rank(example)
|
50
60
|
end
|
@@ -6,7 +6,7 @@ module Quarry
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def process_text(data_set, text, create_features = false)
|
9
|
-
Example.new(@text_pipeline.process_text(data_set, text, create_features))
|
9
|
+
::Quarry::DataSet::Example.new(@text_pipeline.process_text(data_set.data_set, text, create_features))
|
10
10
|
end
|
11
11
|
|
12
12
|
def self.standard_pipeline
|
data/thera.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.description = "C++ Data Mining Library for Ruby"
|
10
10
|
s.email = "me@willcannings.com"
|
11
11
|
s.authors = ["Will Cannings"]
|
12
|
-
s.version = '0.0.2'
|
12
|
+
s.version = '0.0.3'
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rice
|
16
|
-
requirement: &
|
16
|
+
requirement: &70129254365400 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70129254365400
|
25
25
|
description: C++ Data Mining Library for Ruby
|
26
26
|
email: me@willcannings.com
|
27
27
|
executables: []
|