thera 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/quarry/quarry_toolkit.cpp +18 -3
- data/lib/quarry/src/model/model.cpp +2 -2
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +1 -1
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +10 -2
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +2 -2
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +1 -1
- data/lib/quarry/src/storage/folders.cpp +1 -1
- data/lib/quarry_rb/model/model.rb +4 -0
- data/lib/quarry_rb/text_pipeline.rb +2 -2
- data/thera.gemspec +1 -1
- metadata +3 -3
@@ -31,6 +31,20 @@ Object model_rank_text(Object self, Object text) {
|
|
31
31
|
return indexes;
|
32
32
|
}
|
33
33
|
|
34
|
+
Object model_rank_text_names(Object self, Object text) {
|
35
|
+
Model::Model *model = from_ruby<Model::Model *>(self);
|
36
|
+
string example_text = from_ruby<string>(text);
|
37
|
+
Array names;
|
38
|
+
|
39
|
+
vector<Classifier::Score> *ranks = model->rank_text(example_text);
|
40
|
+
DataSet::NominalFeature *categories = model->data_set->category_feature();
|
41
|
+
for(unsigned int i = 0; i < ranks->size(); i++)
|
42
|
+
names.push(categories->names[ranks->at(i).category]);
|
43
|
+
|
44
|
+
delete ranks;
|
45
|
+
return names;
|
46
|
+
}
|
47
|
+
|
34
48
|
|
35
49
|
extern "C" {
|
36
50
|
|
@@ -45,8 +59,8 @@ extern "C" {
|
|
45
59
|
// text pipeline
|
46
60
|
rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
|
47
61
|
Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
|
48
|
-
.define_constructor(Constructor<Preprocessing::Text::TextPipeline>())
|
49
|
-
.define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
|
62
|
+
.define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
|
63
|
+
// .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
|
50
64
|
|
51
65
|
// storage
|
52
66
|
Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
|
@@ -83,7 +97,8 @@ extern "C" {
|
|
83
97
|
.define_method("set_text_pipeline", &Model::Model::set_text_pipeline)
|
84
98
|
.define_method("get_text_pipeline", &Model::Model::get_text_pipeline)
|
85
99
|
.define_method("rank", &model_rank)
|
86
|
-
.define_method("rank_text", &model_rank_text);
|
100
|
+
.define_method("rank_text", &model_rank_text)
|
101
|
+
.define_method("rank_text_names", &model_rank_text_names);
|
87
102
|
|
88
103
|
|
89
104
|
|
@@ -11,7 +11,7 @@ int Model::Model::classify(DataSet::Example *example) {
|
|
11
11
|
}
|
12
12
|
|
13
13
|
int Model::Model::classify_text(string text) {
|
14
|
-
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
|
14
|
+
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
|
15
15
|
int category = classifier->classify(example);
|
16
16
|
delete example;
|
17
17
|
return category;
|
@@ -22,7 +22,7 @@ vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
|
|
22
22
|
}
|
23
23
|
|
24
24
|
vector<Classifier::Score> *Model::Model::rank_text(string text) {
|
25
|
-
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
|
25
|
+
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
|
26
26
|
vector<Classifier::Score> *ranks = classifier->rank(example);
|
27
27
|
delete example;
|
28
28
|
return ranks;
|
@@ -9,7 +9,7 @@ namespace Preprocessing {
|
|
9
9
|
class ExampleGenerator {
|
10
10
|
public:
|
11
11
|
ExampleGenerator() {}
|
12
|
-
virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) { return NULL; }
|
12
|
+
virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
|
13
13
|
virtual uint32_t mark() = 0;
|
14
14
|
};
|
15
15
|
|
@@ -22,7 +22,7 @@ namespace Preprocessing {
|
|
22
22
|
|
23
23
|
TokenCounter(TokenCounterWeight weight = Count) : ExampleGenerator(), token_counts(), weight(weight) {}
|
24
24
|
|
25
|
-
DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) {
|
25
|
+
DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) {
|
26
26
|
int max_count = 0, count = 0;
|
27
27
|
double value = 0.0;
|
28
28
|
token_counts.clear();
|
@@ -38,6 +38,8 @@ namespace Preprocessing {
|
|
38
38
|
|
39
39
|
// construct the example
|
40
40
|
DataSet::SparseExample *example = data_set->new_example(token_counts.size());
|
41
|
+
DataSet::Feature *feature = NULL;
|
42
|
+
|
41
43
|
for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
|
42
44
|
value = token_counts_it->second;
|
43
45
|
|
@@ -46,7 +48,13 @@ namespace Preprocessing {
|
|
46
48
|
else if(weight == Binary)
|
47
49
|
value = 1;
|
48
50
|
|
49
|
-
|
51
|
+
if(create_features) {
|
52
|
+
example->set_value(data_set->get_or_create_numeric_feature_by_name(token_counts_it->first)->index, value);
|
53
|
+
} else {
|
54
|
+
feature = data_set->get_feature_by_name(token_counts_it->first);
|
55
|
+
if(feature)
|
56
|
+
example->set_value(feature->index, value);
|
57
|
+
}
|
50
58
|
}
|
51
59
|
|
52
60
|
return example;
|
@@ -1,10 +1,10 @@
|
|
1
1
|
#include "text_pipeline.h"
|
2
2
|
#include <iostream>
|
3
3
|
|
4
|
-
DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text) {
|
4
|
+
DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features) {
|
5
5
|
tokens.clear();
|
6
6
|
tokeniser->tokenise(text);
|
7
|
-
return generator->generate(data_set, &tokens);
|
7
|
+
return generator->generate(data_set, &tokens, create_features);
|
8
8
|
}
|
9
9
|
|
10
10
|
void Preprocessing::Text::TextPipeline::process_token(char *start, char *end) {
|
@@ -26,7 +26,7 @@ namespace Preprocessing {
|
|
26
26
|
vector<char *> tokens;
|
27
27
|
|
28
28
|
TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
|
29
|
-
DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text);
|
29
|
+
DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
|
30
30
|
void process_token(char *start, char *end);
|
31
31
|
};
|
32
32
|
|
@@ -52,7 +52,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
|
|
52
52
|
fclose(file);
|
53
53
|
|
54
54
|
// insert a new example into the dataset
|
55
|
-
example = pipeline->process_text(data_set, file_data);
|
55
|
+
example = pipeline->process_text(data_set, file_data, true);
|
56
56
|
example->set_category_index(data_set, category_index);
|
57
57
|
|
58
58
|
file_count++;
|
@@ -5,8 +5,8 @@ module Quarry
|
|
5
5
|
@text_pipeline = tp || Quarry::ImplTextPipeline.new
|
6
6
|
end
|
7
7
|
|
8
|
-
def process_text(data_set, text)
|
9
|
-
Example.new(@text_pipeline.process_text(data_set, text))
|
8
|
+
def process_text(data_set, text, create_features = false)
|
9
|
+
Example.new(@text_pipeline.process_text(data_set, text, create_features))
|
10
10
|
end
|
11
11
|
|
12
12
|
def self.standard_pipeline
|
data/thera.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.description = "C++ Data Mining Library for Ruby"
|
10
10
|
s.email = "me@willcannings.com"
|
11
11
|
s.authors = ["Will Cannings"]
|
12
|
-
s.version = '0.0.1'
|
12
|
+
s.version = '0.0.2'
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rice
|
16
|
-
requirement: &
|
16
|
+
requirement: &70171050802120 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70171050802120
|
25
25
|
description: C++ Data Mining Library for Ruby
|
26
26
|
email: me@willcannings.com
|
27
27
|
executables: []
|