thera 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/quarry/quarry_toolkit.cpp +4 -1
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +3 -2
- data/lib/quarry/src/model/model.cpp +9 -0
- data/lib/quarry/src/model/model.h +2 -0
- data/lib/quarry/src/storage/binary.cpp +0 -1
- data/lib/quarry_rb/data_set/data_set.rb +3 -3
- data/lib/quarry_rb/data_set/example.rb +1 -2
- data/lib/quarry_rb/model/model.rb +12 -2
- data/lib/quarry_rb/text_pipeline.rb +1 -1
- data/thera.gemspec +1 -1
- metadata +3 -3
@@ -60,7 +60,7 @@ extern "C" {
|
|
60
60
|
rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
|
61
61
|
Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
|
62
62
|
.define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
|
63
|
-
|
63
|
+
//.define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
|
64
64
|
|
65
65
|
// storage
|
66
66
|
Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
|
@@ -90,6 +90,8 @@ extern "C" {
|
|
90
90
|
.define_method("train_text", &Model::Model::train_text)
|
91
91
|
.define_method("classify", &Model::Model::classify)
|
92
92
|
.define_method("classify_text", &Model::Model::classify_text)
|
93
|
+
.define_method("process_text", &Model::Model::process_text)
|
94
|
+
.define_method("add_text_example", &Model::Model::add_text_example)
|
93
95
|
.define_method("set_data_set", &Model::Model::set_data_set)
|
94
96
|
.define_method("get_data_set", &Model::Model::get_data_set)
|
95
97
|
.define_method("set_classifier", &Model::Model::set_classifier)
|
@@ -110,6 +112,7 @@ extern "C" {
|
|
110
112
|
|
111
113
|
Data_Type<DataSet::Example> rb_cDataSetExample = define_class_under<DataSet::Example>(rb_mDataSet, "ImplExample")
|
112
114
|
.define_method("category_index", &DataSet::Example::category_index)
|
115
|
+
.define_method("set_category_index", &DataSet::Example::set_category_index)
|
113
116
|
.define_method("get_value", &DataSet::Example::get_value)
|
114
117
|
.define_method("set_value", &DataSet::Example::set_value)
|
115
118
|
.define_constructor(Constructor<DataSet::Example, int>());
|
@@ -21,6 +21,7 @@ void Classifier::NaiveBayesClassifier::prepare() {
|
|
21
21
|
feature_caches[i].resize(features_size);
|
22
22
|
|
23
23
|
for(int j = 0; j < features_size; j++) {
|
24
|
+
// FIXME: need to wipe numeric_features[j] here in case category j was, now isn't numeric on a second call to prepare
|
24
25
|
if(!numeric_features[j])
|
25
26
|
continue;
|
26
27
|
feature = (DataSet::NumericFeature *) data_set->features[j];
|
@@ -62,12 +63,12 @@ double Classifier::NaiveBayesClassifier::score(int category, DataSet::Example *e
|
|
62
63
|
}
|
63
64
|
|
64
65
|
void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
|
65
|
-
int category_count = feature_caches.size();
|
66
|
+
int category_count = feature_caches.size() - 1;
|
66
67
|
file->write_int(category_count);
|
67
68
|
file->write_vector<double>(&category_probabilities);
|
68
69
|
|
69
70
|
for(int i = 1; i <= category_count; i++)
|
70
|
-
file->write_vector<NumericFeatureCache>(&feature_caches[i]);
|
71
|
+
file->write_vector<NumericFeatureCache>(&(feature_caches[i]));
|
71
72
|
}
|
72
73
|
|
73
74
|
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
|
@@ -6,6 +6,15 @@ void Model::Model::train(DataSet::Example *example) {
|
|
6
6
|
void Model::Model::train_text(string text) {
|
7
7
|
}
|
8
8
|
|
9
|
+
DataSet::Example *Model::Model::process_text(string text, bool create_features) {
|
10
|
+
return text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), create_features);
|
11
|
+
}
|
12
|
+
|
13
|
+
void Model::Model::add_text_example(string text, string category) {
|
14
|
+
DataSet::Example *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), true);
|
15
|
+
example->set_category_index(data_set, data_set->category_feature()->value_index(category));
|
16
|
+
}
|
17
|
+
|
9
18
|
int Model::Model::classify(DataSet::Example *example) {
|
10
19
|
return classifier->classify(example);
|
11
20
|
}
|
@@ -16,6 +16,8 @@ namespace Model {
|
|
16
16
|
|
17
17
|
void train(DataSet::Example *example);
|
18
18
|
void train_text(string text);
|
19
|
+
DataSet::Example *process_text(string text, bool create_features);
|
20
|
+
void add_text_example(string text, string category);
|
19
21
|
int classify(DataSet::Example *example);
|
20
22
|
int classify_text(string text);
|
21
23
|
vector<Classifier::Score> *rank(DataSet::Example *example);
|
@@ -448,7 +448,6 @@ Model::Model *Storage::Binary::read_model() {
|
|
448
448
|
void Storage::Binary::write_model(Model::Model *model) {
|
449
449
|
open_for_writing();
|
450
450
|
|
451
|
-
// write the 3 model components
|
452
451
|
write_data_set(model->data_set);
|
453
452
|
write_classifier(model->classifier);
|
454
453
|
write_text_pipeline(model->text_pipeline);
|
@@ -15,15 +15,15 @@ module Quarry
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def examples
|
18
|
-
@examples ||= EnumerableHelper.new(self, @data_set, Example, :examples_size, :get_example_by_index)
|
18
|
+
@examples ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Example, :examples_size, :get_example_by_index)
|
19
19
|
end
|
20
20
|
|
21
21
|
def categories
|
22
|
-
@categories ||= EnumerableHelper.new(self, @data_set, Category, :categories_size, :get_category_by_index)
|
22
|
+
@categories ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Category, :categories_size, :get_category_by_index)
|
23
23
|
end
|
24
24
|
|
25
25
|
def features
|
26
|
-
@features ||= EnumerableHelper.new(self, @data_set, Feature, :features_size, :get_feature_by_index)
|
26
|
+
@features ||= EnumerableHelper.new(self, @data_set, ::Quarry::DataSet::Feature, :features_size, :get_feature_by_index)
|
27
27
|
end
|
28
28
|
|
29
29
|
def stratify(classifier, folds, skip_fold)
|
@@ -25,8 +25,7 @@ module Quarry
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def category=(new_category)
|
28
|
-
|
29
|
-
@example.set_category(new_category.category)
|
28
|
+
@example.set_category_index(@data_set.data_set, new_category)
|
30
29
|
end
|
31
30
|
end
|
32
31
|
end
|
@@ -6,11 +6,12 @@ module Quarry
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def data_set
|
9
|
-
Quarry::DataSet::DataSet.new(@model.get_data_set)
|
9
|
+
@data_set ||= Quarry::DataSet::DataSet.new(@model.get_data_set)
|
10
10
|
end
|
11
11
|
|
12
12
|
def data_set=(ds)
|
13
13
|
@model.set_data_set(ds.data_set)
|
14
|
+
@data_set = ds
|
14
15
|
end
|
15
16
|
|
16
17
|
# def classifier
|
@@ -22,11 +23,12 @@ module Quarry
|
|
22
23
|
end
|
23
24
|
|
24
25
|
def text_pipeline
|
25
|
-
TextPipeline.new(@model.get_text_pipeline)
|
26
|
+
@text_pipeline ||= TextPipeline.new(@model.get_text_pipeline)
|
26
27
|
end
|
27
28
|
|
28
29
|
def text_pipeline=(t)
|
29
30
|
@model.set_text_pipeline(t.text_pipeline)
|
31
|
+
@text_pipeline = t
|
30
32
|
end
|
31
33
|
|
32
34
|
def train(example)
|
@@ -45,6 +47,14 @@ module Quarry
|
|
45
47
|
@model.classify_text(text)
|
46
48
|
end
|
47
49
|
|
50
|
+
def process_text(text, create_features = true)
|
51
|
+
DataSet::Example.new(@model.process_text(text, create_features), data_set)
|
52
|
+
end
|
53
|
+
|
54
|
+
def add_text_example(text, category_name)
|
55
|
+
@model.add_text_example(text, category_name)
|
56
|
+
end
|
57
|
+
|
48
58
|
def rank(example)
|
49
59
|
@model.rank(example)
|
50
60
|
end
|
@@ -6,7 +6,7 @@ module Quarry
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def process_text(data_set, text, create_features = false)
|
9
|
-
Example.new(@text_pipeline.process_text(data_set, text, create_features))
|
9
|
+
::Quarry::DataSet::Example.new(@text_pipeline.process_text(data_set.data_set, text, create_features))
|
10
10
|
end
|
11
11
|
|
12
12
|
def self.standard_pipeline
|
data/thera.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.description = "C++ Data Mining Library for Ruby"
|
10
10
|
s.email = "me@willcannings.com"
|
11
11
|
s.authors = ["Will Cannings"]
|
12
|
-
s.version = '0.0.2'
|
12
|
+
s.version = '0.0.3'
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rice
|
16
|
-
requirement: &
|
16
|
+
requirement: &70129254365400 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70129254365400
|
25
25
|
description: C++ Data Mining Library for Ruby
|
26
26
|
email: me@willcannings.com
|
27
27
|
executables: []
|