thera 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/quarry/quarry_toolkit.cpp +20 -0
- data/lib/quarry/src/classifier/classifier.h +2 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +8 -3
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +1 -0
- data/lib/quarry/src/data_set/data_set.h +9 -1
- data/lib/quarry/src/data_set/dense/dense_data_set.h +3 -2
- data/lib/quarry/src/data_set/feature.h +1 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +2 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +5 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +3 -2
- data/lib/quarry/src/data_set/sparse/sparse_example.h +6 -0
- data/lib/quarry/src/model/model.h +9 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +1 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +1 -1
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +1 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +14 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +3 -7
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +6 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +1 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +1 -0
- data/lib/quarry/src/storage/binary.cpp +16 -6
- data/lib/quarry/src/storage/folders.cpp +1 -0
- data/thera.gemspec +1 -1
- metadata +4 -4
@@ -45,6 +45,24 @@ Object model_rank_text_names(Object self, Object text) {
|
|
45
45
|
return names;
|
46
46
|
}
|
47
47
|
|
48
|
+
Object quarry_rank_text_names_from_binary_model(Object path, Object text) {
|
49
|
+
string model_path = from_ruby<string>(path);
|
50
|
+
string example_text = from_ruby<string>(text);
|
51
|
+
Array names;
|
52
|
+
|
53
|
+
Storage::Binary reader(model_path);
|
54
|
+
Model::Model *model = reader.read_model();
|
55
|
+
|
56
|
+
vector<Classifier::Score> *ranks = model->rank_text(example_text);
|
57
|
+
DataSet::NominalFeature *categories = model->data_set->category_feature();
|
58
|
+
for(unsigned int i = 0; i < ranks->size(); i++)
|
59
|
+
names.push(categories->names[ranks->at(i).category]);
|
60
|
+
|
61
|
+
delete ranks;
|
62
|
+
delete model;
|
63
|
+
return names;
|
64
|
+
}
|
65
|
+
|
48
66
|
|
49
67
|
extern "C" {
|
50
68
|
|
@@ -55,6 +73,8 @@ extern "C" {
|
|
55
73
|
Module rb_mPreprocessing = define_module_under(rb_mQuarry, "Preprocessing");
|
56
74
|
Module rb_mText = define_module_under(rb_mPreprocessing, "Text");
|
57
75
|
|
76
|
+
// quarry helper
|
77
|
+
rb_mQuarry.define_module_function("rank_text_names_from_binary_model", &quarry_rank_text_names_from_binary_model);
|
58
78
|
|
59
79
|
// text pipeline
|
60
80
|
rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
|
@@ -74,8 +74,13 @@ void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
|
|
74
74
|
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
|
75
75
|
int category_count = file->read_int();
|
76
76
|
feature_caches.resize(category_count + 1);
|
77
|
-
|
77
|
+
vector<double> *probabilities = file->read_vector<double>();
|
78
|
+
category_probabilities = *probabilities;
|
79
|
+
delete probabilities;
|
78
80
|
|
79
|
-
for(int i = 1; i <= category_count; i++)
|
80
|
-
|
81
|
+
for(int i = 1; i <= category_count; i++) {
|
82
|
+
vector<NumericFeatureCache> *caches = file->read_vector<NumericFeatureCache>();
|
83
|
+
feature_caches[i] = *caches;
|
84
|
+
delete caches;
|
85
|
+
}
|
81
86
|
}
|
@@ -33,6 +33,7 @@ namespace Classifier {
|
|
33
33
|
static const uint32_t file_mark = 'naiv';
|
34
34
|
NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
|
35
35
|
NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);
|
36
|
+
virtual ~NaiveBayesClassifier() {}
|
36
37
|
|
37
38
|
double score(int category, DataSet::Example *example);
|
38
39
|
void prepare();
|
@@ -26,6 +26,14 @@ namespace DataSet {
|
|
26
26
|
features.push_back(feature);
|
27
27
|
}
|
28
28
|
}
|
29
|
+
|
30
|
+
virtual ~DataSet() {
|
31
|
+
for(unsigned int i = 0; i < features.size(); i++)
|
32
|
+
delete features[i];
|
33
|
+
for(unsigned int i = 0; i < examples.size(); i++)
|
34
|
+
delete examples[i];
|
35
|
+
}
|
36
|
+
|
29
37
|
virtual DataSet *clone_without_examples() { return NULL; }
|
30
38
|
|
31
39
|
tr1::unordered_map<string, Feature *> feature_names;
|
@@ -38,7 +46,7 @@ namespace DataSet {
|
|
38
46
|
|
39
47
|
NumericFeature *new_numeric_feature(string name);
|
40
48
|
NominalFeature *new_nominal_feature(string name);
|
41
|
-
virtual Example *new_example() { return NULL; }
|
49
|
+
virtual Example *new_example(bool add_to_data_set = true) { return NULL; }
|
42
50
|
|
43
51
|
void count();
|
44
52
|
void index();
|
@@ -28,9 +28,10 @@ namespace DataSet {
|
|
28
28
|
return new DenseDataSet(this);
|
29
29
|
}
|
30
30
|
|
31
|
-
DenseExample *new_example() {
|
31
|
+
DenseExample *new_example(bool add_to_data_set = true) {
|
32
32
|
DenseExample *example = new DenseExample(features.size());
|
33
|
-
|
33
|
+
if(add_to_data_set)
|
34
|
+
examples.push_back(example);
|
34
35
|
return example;
|
35
36
|
}
|
36
37
|
};
|
@@ -17,6 +17,7 @@ namespace DataSet {
|
|
17
17
|
void set_index(int new_index) { index = new_index; }
|
18
18
|
|
19
19
|
Feature(string name, int index) : name(name), index(index) {}
|
20
|
+
virtual ~Feature() {}
|
20
21
|
virtual Feature *clone() { return NULL; }
|
21
22
|
virtual void reset() {}
|
22
23
|
virtual void print() {}
|
@@ -14,6 +14,8 @@ namespace DataSet {
|
|
14
14
|
NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
|
15
15
|
NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
|
16
16
|
|
17
|
+
~NominalFeature() {}
|
18
|
+
|
17
19
|
NominalFeature *clone() {
|
18
20
|
return new NominalFeature(this);
|
19
21
|
}
|
@@ -29,9 +29,10 @@ namespace DataSet {
|
|
29
29
|
return new SparseDataSet(this);
|
30
30
|
}
|
31
31
|
|
32
|
-
SparseExample *new_example(int buffer_size = 0) {
|
32
|
+
SparseExample *new_example(int buffer_size = 0, bool add_to_data_set = true) {
|
33
33
|
SparseExample *example = new SparseExample(buffer_size);
|
34
|
-
|
34
|
+
if(add_to_data_set)
|
35
|
+
examples.push_back(example);
|
35
36
|
return example;
|
36
37
|
}
|
37
38
|
};
|
@@ -3,6 +3,7 @@
|
|
3
3
|
#include "data_set/example.h"
|
4
4
|
#include <stdlib.h>
|
5
5
|
#include <string>
|
6
|
+
#include <iostream>
|
6
7
|
using namespace std;
|
7
8
|
|
8
9
|
namespace DataSet {
|
@@ -25,6 +26,11 @@ namespace DataSet {
|
|
25
26
|
values = NULL;
|
26
27
|
}
|
27
28
|
|
29
|
+
~SparseExample() {
|
30
|
+
if(values != NULL)
|
31
|
+
free(values);
|
32
|
+
}
|
33
|
+
|
28
34
|
double get_value(int feature_index);
|
29
35
|
double get_value(string feature_name, SparseDataSet *data_set);
|
30
36
|
void set_value(int feature_index, double new_value);
|
@@ -4,6 +4,7 @@
|
|
4
4
|
#include "data_set/example.h"
|
5
5
|
#include "classifier/classifier.h"
|
6
6
|
#include "preprocessing/text/text_pipeline.h"
|
7
|
+
#include <iostream>
|
7
8
|
|
8
9
|
namespace Model {
|
9
10
|
class Model {
|
@@ -13,6 +14,14 @@ namespace Model {
|
|
13
14
|
Preprocessing::Text::TextPipeline *text_pipeline;
|
14
15
|
|
15
16
|
Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
|
17
|
+
~Model() {
|
18
|
+
if(data_set)
|
19
|
+
delete data_set;
|
20
|
+
if(classifier)
|
21
|
+
delete classifier;
|
22
|
+
if(text_pipeline)
|
23
|
+
delete text_pipeline;
|
24
|
+
}
|
16
25
|
|
17
26
|
void train(DataSet::Example *example);
|
18
27
|
void train_text(string text);
|
@@ -9,6 +9,7 @@ namespace Preprocessing {
|
|
9
9
|
class ExampleGenerator {
|
10
10
|
public:
|
11
11
|
ExampleGenerator() {}
|
12
|
+
virtual ~ExampleGenerator() {}
|
12
13
|
virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
|
13
14
|
virtual uint32_t mark() = 0;
|
14
15
|
};
|
@@ -37,7 +37,7 @@ namespace Preprocessing {
|
|
37
37
|
}
|
38
38
|
|
39
39
|
// construct the example
|
40
|
-
DataSet::SparseExample *example = data_set->new_example(token_counts.size());
|
40
|
+
DataSet::SparseExample *example = data_set->new_example(token_counts.size(), false);
|
41
41
|
DataSet::Feature *feature = NULL;
|
42
42
|
|
43
43
|
for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
|
@@ -28,6 +28,20 @@ namespace Preprocessing {
|
|
28
28
|
TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
|
29
29
|
DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
|
30
30
|
void process_token(char *start, char *end);
|
31
|
+
|
32
|
+
~TextPipeline() {
|
33
|
+
// tokens stores ptrs to offsets of a string which is handled externally,
|
34
|
+
// so doesn't need to be released here
|
35
|
+
if(tokeniser)
|
36
|
+
delete tokeniser;
|
37
|
+
if(generator)
|
38
|
+
delete generator;
|
39
|
+
|
40
|
+
for(unsigned int i = 0; i < processors.size(); i++)
|
41
|
+
delete processors[i];
|
42
|
+
for(unsigned int i = 0; i < selectors.size(); i++)
|
43
|
+
delete selectors[i];
|
44
|
+
}
|
31
45
|
};
|
32
46
|
|
33
47
|
TextPipeline *StandardPipeline();
|
@@ -4,7 +4,6 @@
|
|
4
4
|
using namespace std;
|
5
5
|
using namespace tr1;
|
6
6
|
|
7
|
-
static unordered_set<string> *stop_words = NULL;
|
8
7
|
static int stop_word_count = 586;
|
9
8
|
static string stop_word_list[] = {
|
10
9
|
"a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
|
@@ -69,14 +68,11 @@ static string stop_word_list[] = {
|
|
69
68
|
};
|
70
69
|
|
71
70
|
Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
|
72
|
-
|
73
|
-
stop_words = new unordered_set<string>();
|
74
|
-
for(int i = 0; i < stop_word_count; i++)
|
75
|
-
stop_words->insert(stop_word_list[i]);
|
76
|
-
}
|
71
|
+
for(int i = 0; i < stop_word_count; i++)
|
72
|
+
stop_words.insert(stop_word_list[i]);
|
77
73
|
}
|
78
74
|
|
79
75
|
bool Preprocessing::Text::StopWords::select(char *start, char *end) {
|
80
76
|
string token = string(start, (end - start) + 1);
|
81
|
-
return stop_words->count(token) == 0;
|
77
|
+
return stop_words.count(token) == 0;
|
82
78
|
}
|
@@ -1,6 +1,10 @@
|
|
1
1
|
#ifndef __stop_words_h__
|
2
2
|
#define __stop_words_h__
|
3
3
|
#include "token_selector.h"
|
4
|
+
#include <tr1/unordered_set>
|
5
|
+
using namespace std;
|
6
|
+
using namespace tr1;
|
7
|
+
|
4
8
|
|
5
9
|
namespace Preprocessing {
|
6
10
|
namespace Text {
|
@@ -9,8 +13,10 @@ namespace Preprocessing {
|
|
9
13
|
public:
|
10
14
|
static const uint32_t file_mark = 'stop';
|
11
15
|
uint32_t mark() { return file_mark; }
|
16
|
+
unordered_set<string> stop_words;
|
12
17
|
|
13
18
|
StopWords();
|
19
|
+
~StopWords() {}
|
14
20
|
bool select(char *start, char *end);
|
15
21
|
};
|
16
22
|
|
@@ -104,16 +104,26 @@ DataSet::DataSet *Storage::Binary::read_data_set() {
|
|
104
104
|
|
105
105
|
// read cached frequencies and probabilities if present
|
106
106
|
if(data_set->counted) {
|
107
|
-
|
108
|
-
|
107
|
+
vector<int> *frequencies = read_vector<int>();
|
108
|
+
vector<double> *probabilities = read_vector<double>();
|
109
|
+
nominal_feature->frequencies = *frequencies;
|
110
|
+
nominal_feature->probabilities = *probabilities;
|
109
111
|
nominal_feature->category_frequencies.resize(num_categories + 1);
|
110
112
|
nominal_feature->category_probabilities.resize(num_categories + 1);
|
113
|
+
delete frequencies;
|
114
|
+
delete probabilities;
|
111
115
|
|
112
|
-
for(int i = 1; i <= num_categories; i++)
|
113
|
-
|
116
|
+
for(int i = 1; i <= num_categories; i++) {
|
117
|
+
frequencies = read_vector<int>();
|
118
|
+
nominal_feature->category_frequencies[i] = *frequencies;
|
119
|
+
delete frequencies;
|
120
|
+
}
|
114
121
|
|
115
|
-
for(int i = 1; i <= num_categories; i++)
|
116
|
-
|
122
|
+
for(int i = 1; i <= num_categories; i++) {
|
123
|
+
probabilities = read_vector<double>();
|
124
|
+
nominal_feature->category_probabilities[i] = *probabilities;
|
125
|
+
delete probabilities;
|
126
|
+
}
|
117
127
|
}
|
118
128
|
|
119
129
|
// TODO: read cached indexes
|
@@ -55,6 +55,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
|
|
55
55
|
// insert a new example into the dataset
|
56
56
|
example = pipeline->process_text(data_set, file_data, true);
|
57
57
|
example->set_category_index(data_set, category_index);
|
58
|
+
data_set->examples.push_back(example);
|
58
59
|
|
59
60
|
file_count++;
|
60
61
|
if((file_count % 10000) == 0)
|
data/thera.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.description = "C++ Data Mining Library for Ruby"
|
10
10
|
s.email = "me@willcannings.com"
|
11
11
|
s.authors = ["Will Cannings"]
|
12
|
-
s.version = '0.0.7'
|
12
|
+
s.version = '0.0.8'
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-01-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rice
|
16
|
-
requirement: &
|
16
|
+
requirement: &70151526602040 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70151526602040
|
25
25
|
description: C++ Data Mining Library for Ruby
|
26
26
|
email: me@willcannings.com
|
27
27
|
executables: []
|