thera 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/quarry/quarry_toolkit.cpp +18 -3
- data/lib/quarry/src/model/model.cpp +2 -2
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +1 -1
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +10 -2
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +2 -2
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +1 -1
- data/lib/quarry/src/storage/folders.cpp +1 -1
- data/lib/quarry_rb/model/model.rb +4 -0
- data/lib/quarry_rb/text_pipeline.rb +2 -2
- data/thera.gemspec +1 -1
- metadata +3 -3
@@ -31,6 +31,20 @@ Object model_rank_text(Object self, Object text) {
|
|
31
31
|
return indexes;
|
32
32
|
}
|
33
33
|
|
34
|
+
Object model_rank_text_names(Object self, Object text) {
|
35
|
+
Model::Model *model = from_ruby<Model::Model *>(self);
|
36
|
+
string example_text = from_ruby<string>(text);
|
37
|
+
Array names;
|
38
|
+
|
39
|
+
vector<Classifier::Score> *ranks = model->rank_text(example_text);
|
40
|
+
DataSet::NominalFeature *categories = model->data_set->category_feature();
|
41
|
+
for(unsigned int i = 0; i < ranks->size(); i++)
|
42
|
+
names.push(categories->names[ranks->at(i).category]);
|
43
|
+
|
44
|
+
delete ranks;
|
45
|
+
return names;
|
46
|
+
}
|
47
|
+
|
34
48
|
|
35
49
|
extern "C" {
|
36
50
|
|
@@ -45,8 +59,8 @@ extern "C" {
|
|
45
59
|
// text pipeline
|
46
60
|
rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
|
47
61
|
Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
|
48
|
-
.define_constructor(Constructor<Preprocessing::Text::TextPipeline>())
|
49
|
-
.define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
|
62
|
+
.define_constructor(Constructor<Preprocessing::Text::TextPipeline>());
|
63
|
+
// .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
|
50
64
|
|
51
65
|
// storage
|
52
66
|
Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
|
@@ -83,7 +97,8 @@ extern "C" {
|
|
83
97
|
.define_method("set_text_pipeline", &Model::Model::set_text_pipeline)
|
84
98
|
.define_method("get_text_pipeline", &Model::Model::get_text_pipeline)
|
85
99
|
.define_method("rank", &model_rank)
|
86
|
-
.define_method("rank_text", &model_rank_text)
|
100
|
+
.define_method("rank_text", &model_rank_text)
|
101
|
+
.define_method("rank_text_names", &model_rank_text_names);
|
87
102
|
|
88
103
|
|
89
104
|
|
@@ -11,7 +11,7 @@ int Model::Model::classify(DataSet::Example *example) {
|
|
11
11
|
}
|
12
12
|
|
13
13
|
int Model::Model::classify_text(string text) {
|
14
|
-
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
|
14
|
+
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
|
15
15
|
int category = classifier->classify(example);
|
16
16
|
delete example;
|
17
17
|
return category;
|
@@ -22,7 +22,7 @@ vector<Classifier::Score> *Model::Model::rank(DataSet::Example *example) {
|
|
22
22
|
}
|
23
23
|
|
24
24
|
vector<Classifier::Score> *Model::Model::rank_text(string text) {
|
25
|
-
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str());
|
25
|
+
DataSet::SparseExample *example = text_pipeline->process_text((DataSet::SparseDataSet *)data_set, (char *)text.c_str(), false);
|
26
26
|
vector<Classifier::Score> *ranks = classifier->rank(example);
|
27
27
|
delete example;
|
28
28
|
return ranks;
|
@@ -9,7 +9,7 @@ namespace Preprocessing {
|
|
9
9
|
class ExampleGenerator {
|
10
10
|
public:
|
11
11
|
ExampleGenerator() {}
|
12
|
-
virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) { return NULL; }
|
12
|
+
virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
|
13
13
|
virtual uint32_t mark() = 0;
|
14
14
|
};
|
15
15
|
|
@@ -22,7 +22,7 @@ namespace Preprocessing {
|
|
22
22
|
|
23
23
|
TokenCounter(TokenCounterWeight weight = Count) : ExampleGenerator(), token_counts(), weight(weight) {}
|
24
24
|
|
25
|
-
DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) {
|
25
|
+
DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) {
|
26
26
|
int max_count = 0, count = 0;
|
27
27
|
double value = 0.0;
|
28
28
|
token_counts.clear();
|
@@ -38,6 +38,8 @@ namespace Preprocessing {
|
|
38
38
|
|
39
39
|
// construct the example
|
40
40
|
DataSet::SparseExample *example = data_set->new_example(token_counts.size());
|
41
|
+
DataSet::Feature *feature = NULL;
|
42
|
+
|
41
43
|
for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
|
42
44
|
value = token_counts_it->second;
|
43
45
|
|
@@ -46,7 +48,13 @@ namespace Preprocessing {
|
|
46
48
|
else if(weight == Binary)
|
47
49
|
value = 1;
|
48
50
|
|
49
|
-
|
51
|
+
if(create_features) {
|
52
|
+
example->set_value(data_set->get_or_create_numeric_feature_by_name(token_counts_it->first)->index, value);
|
53
|
+
} else {
|
54
|
+
feature = data_set->get_feature_by_name(token_counts_it->first);
|
55
|
+
if(feature)
|
56
|
+
example->set_value(feature->index, value);
|
57
|
+
}
|
50
58
|
}
|
51
59
|
|
52
60
|
return example;
|
@@ -1,10 +1,10 @@
|
|
1
1
|
#include "text_pipeline.h"
|
2
2
|
#include <iostream>
|
3
3
|
|
4
|
-
DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text) {
|
4
|
+
DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features) {
|
5
5
|
tokens.clear();
|
6
6
|
tokeniser->tokenise(text);
|
7
|
-
return generator->generate(data_set, &tokens);
|
7
|
+
return generator->generate(data_set, &tokens, create_features);
|
8
8
|
}
|
9
9
|
|
10
10
|
void Preprocessing::Text::TextPipeline::process_token(char *start, char *end) {
|
@@ -26,7 +26,7 @@ namespace Preprocessing {
|
|
26
26
|
vector<char *> tokens;
|
27
27
|
|
28
28
|
TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
|
29
|
-
DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text);
|
29
|
+
DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
|
30
30
|
void process_token(char *start, char *end);
|
31
31
|
};
|
32
32
|
|
@@ -52,7 +52,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
|
|
52
52
|
fclose(file);
|
53
53
|
|
54
54
|
// insert a new example into the dataset
|
55
|
-
example = pipeline->process_text(data_set, file_data);
|
55
|
+
example = pipeline->process_text(data_set, file_data, true);
|
56
56
|
example->set_category_index(data_set, category_index);
|
57
57
|
|
58
58
|
file_count++;
|
@@ -5,8 +5,8 @@ module Quarry
|
|
5
5
|
@text_pipeline = tp || Quarry::ImplTextPipeline.new
|
6
6
|
end
|
7
7
|
|
8
|
-
def process_text(data_set, text)
|
9
|
-
Example.new(@text_pipeline.process_text(data_set, text))
|
8
|
+
def process_text(data_set, text, create_features = false)
|
9
|
+
Example.new(@text_pipeline.process_text(data_set, text, create_features))
|
10
10
|
end
|
11
11
|
|
12
12
|
def self.standard_pipeline
|
data/thera.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.description = "C++ Data Mining Library for Ruby"
|
10
10
|
s.email = "me@willcannings.com"
|
11
11
|
s.authors = ["Will Cannings"]
|
12
|
-
s.version = '0.0.1'
|
12
|
+
s.version = '0.0.2'
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-12-07 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rice
|
16
|
-
requirement: &
|
16
|
+
requirement: &70171050802120 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70171050802120
|
25
25
|
description: C++ Data Mining Library for Ruby
|
26
26
|
email: me@willcannings.com
|
27
27
|
executables: []
|