thera 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/quarry/quarry_toolkit.cpp +20 -0
- data/lib/quarry/src/classifier/classifier.h +2 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +8 -3
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +1 -0
- data/lib/quarry/src/data_set/data_set.h +9 -1
- data/lib/quarry/src/data_set/dense/dense_data_set.h +3 -2
- data/lib/quarry/src/data_set/feature.h +1 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +2 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +5 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +3 -2
- data/lib/quarry/src/data_set/sparse/sparse_example.h +6 -0
- data/lib/quarry/src/model/model.h +9 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +1 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +1 -1
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +1 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +14 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +3 -7
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +6 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +1 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +1 -0
- data/lib/quarry/src/storage/binary.cpp +16 -6
- data/lib/quarry/src/storage/folders.cpp +1 -0
- data/thera.gemspec +1 -1
- metadata +4 -4
@@ -45,6 +45,24 @@ Object model_rank_text_names(Object self, Object text) {
|
|
45
45
|
return names;
|
46
46
|
}
|
47
47
|
|
48
|
+
Object quarry_rank_text_names_from_binary_model(Object path, Object text) {
|
49
|
+
string model_path = from_ruby<string>(path);
|
50
|
+
string example_text = from_ruby<string>(text);
|
51
|
+
Array names;
|
52
|
+
|
53
|
+
Storage::Binary reader(model_path);
|
54
|
+
Model::Model *model = reader.read_model();
|
55
|
+
|
56
|
+
vector<Classifier::Score> *ranks = model->rank_text(example_text);
|
57
|
+
DataSet::NominalFeature *categories = model->data_set->category_feature();
|
58
|
+
for(unsigned int i = 0; i < ranks->size(); i++)
|
59
|
+
names.push(categories->names[ranks->at(i).category]);
|
60
|
+
|
61
|
+
delete ranks;
|
62
|
+
delete model;
|
63
|
+
return names;
|
64
|
+
}
|
65
|
+
|
48
66
|
|
49
67
|
extern "C" {
|
50
68
|
|
@@ -55,6 +73,8 @@ extern "C" {
|
|
55
73
|
Module rb_mPreprocessing = define_module_under(rb_mQuarry, "Preprocessing");
|
56
74
|
Module rb_mText = define_module_under(rb_mPreprocessing, "Text");
|
57
75
|
|
76
|
+
// quarry helper
|
77
|
+
rb_mQuarry.define_module_function("rank_text_names_from_binary_model", &quarry_rank_text_names_from_binary_model);
|
58
78
|
|
59
79
|
// text pipeline
|
60
80
|
rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
|
@@ -74,8 +74,13 @@ void Classifier::NaiveBayesClassifier::write_binary(Storage::Binary *file) {
|
|
74
74
|
void Classifier::NaiveBayesClassifier::read_binary(Storage::Binary *file) {
|
75
75
|
int category_count = file->read_int();
|
76
76
|
feature_caches.resize(category_count + 1);
|
77
|
-
|
77
|
+
vector<double> *probabilities = file->read_vector<double>();
|
78
|
+
category_probabilities = *probabilities;
|
79
|
+
delete probabilities;
|
78
80
|
|
79
|
-
for(int i = 1; i <= category_count; i++)
|
80
|
-
|
81
|
+
for(int i = 1; i <= category_count; i++) {
|
82
|
+
vector<NumericFeatureCache> *caches = file->read_vector<NumericFeatureCache>();
|
83
|
+
feature_caches[i] = *caches;
|
84
|
+
delete caches;
|
85
|
+
}
|
81
86
|
}
|
@@ -33,6 +33,7 @@ namespace Classifier {
|
|
33
33
|
static const uint32_t file_mark = 'naiv';
|
34
34
|
NaiveBayesClassifier(DataSet::DataSet *data_set) : Classifier(data_set) {}
|
35
35
|
NaiveBayesClassifier *clone(DataSet::DataSet *new_data_set);
|
36
|
+
virtual ~NaiveBayesClassifier() {}
|
36
37
|
|
37
38
|
double score(int category, DataSet::Example *example);
|
38
39
|
void prepare();
|
@@ -26,6 +26,14 @@ namespace DataSet {
|
|
26
26
|
features.push_back(feature);
|
27
27
|
}
|
28
28
|
}
|
29
|
+
|
30
|
+
virtual ~DataSet() {
|
31
|
+
for(unsigned int i = 0; i < features.size(); i++)
|
32
|
+
delete features[i];
|
33
|
+
for(unsigned int i = 0; i < examples.size(); i++)
|
34
|
+
delete examples[i];
|
35
|
+
}
|
36
|
+
|
29
37
|
virtual DataSet *clone_without_examples() { return NULL; }
|
30
38
|
|
31
39
|
tr1::unordered_map<string, Feature *> feature_names;
|
@@ -38,7 +46,7 @@ namespace DataSet {
|
|
38
46
|
|
39
47
|
NumericFeature *new_numeric_feature(string name);
|
40
48
|
NominalFeature *new_nominal_feature(string name);
|
41
|
-
virtual Example *new_example() { return NULL; }
|
49
|
+
virtual Example *new_example(bool add_to_data_set = true) { return NULL; }
|
42
50
|
|
43
51
|
void count();
|
44
52
|
void index();
|
@@ -28,9 +28,10 @@ namespace DataSet {
|
|
28
28
|
return new DenseDataSet(this);
|
29
29
|
}
|
30
30
|
|
31
|
-
DenseExample *new_example() {
|
31
|
+
DenseExample *new_example(bool add_to_data_set = true) {
|
32
32
|
DenseExample *example = new DenseExample(features.size());
|
33
|
-
|
33
|
+
if(add_to_data_set)
|
34
|
+
examples.push_back(example);
|
34
35
|
return example;
|
35
36
|
}
|
36
37
|
};
|
@@ -17,6 +17,7 @@ namespace DataSet {
|
|
17
17
|
void set_index(int new_index) { index = new_index; }
|
18
18
|
|
19
19
|
Feature(string name, int index) : name(name), index(index) {}
|
20
|
+
virtual ~Feature() {}
|
20
21
|
virtual Feature *clone() { return NULL; }
|
21
22
|
virtual void reset() {}
|
22
23
|
virtual void print() {}
|
@@ -14,6 +14,8 @@ namespace DataSet {
|
|
14
14
|
NominalFeature(string name, int index) : Feature(name, index), names(1, "") {}
|
15
15
|
NominalFeature(NominalFeature *other) : Feature(other->name, other->index), indexes(other->indexes), names(other->names) {}
|
16
16
|
|
17
|
+
~NominalFeature() {}
|
18
|
+
|
17
19
|
NominalFeature *clone() {
|
18
20
|
return new NominalFeature(this);
|
19
21
|
}
|
@@ -29,9 +29,10 @@ namespace DataSet {
|
|
29
29
|
return new SparseDataSet(this);
|
30
30
|
}
|
31
31
|
|
32
|
-
SparseExample *new_example(int buffer_size = 0) {
|
32
|
+
SparseExample *new_example(int buffer_size = 0, bool add_to_data_set = true) {
|
33
33
|
SparseExample *example = new SparseExample(buffer_size);
|
34
|
-
|
34
|
+
if(add_to_data_set)
|
35
|
+
examples.push_back(example);
|
35
36
|
return example;
|
36
37
|
}
|
37
38
|
};
|
@@ -3,6 +3,7 @@
|
|
3
3
|
#include "data_set/example.h"
|
4
4
|
#include <stdlib.h>
|
5
5
|
#include <string>
|
6
|
+
#include <iostream>
|
6
7
|
using namespace std;
|
7
8
|
|
8
9
|
namespace DataSet {
|
@@ -25,6 +26,11 @@ namespace DataSet {
|
|
25
26
|
values = NULL;
|
26
27
|
}
|
27
28
|
|
29
|
+
~SparseExample() {
|
30
|
+
if(values != NULL)
|
31
|
+
free(values);
|
32
|
+
}
|
33
|
+
|
28
34
|
double get_value(int feature_index);
|
29
35
|
double get_value(string feature_name, SparseDataSet *data_set);
|
30
36
|
void set_value(int feature_index, double new_value);
|
@@ -4,6 +4,7 @@
|
|
4
4
|
#include "data_set/example.h"
|
5
5
|
#include "classifier/classifier.h"
|
6
6
|
#include "preprocessing/text/text_pipeline.h"
|
7
|
+
#include <iostream>
|
7
8
|
|
8
9
|
namespace Model {
|
9
10
|
class Model {
|
@@ -13,6 +14,14 @@ namespace Model {
|
|
13
14
|
Preprocessing::Text::TextPipeline *text_pipeline;
|
14
15
|
|
15
16
|
Model() : data_set(NULL), classifier(NULL), text_pipeline(NULL) {}
|
17
|
+
~Model() {
|
18
|
+
if(data_set)
|
19
|
+
delete data_set;
|
20
|
+
if(classifier)
|
21
|
+
delete classifier;
|
22
|
+
if(text_pipeline)
|
23
|
+
delete text_pipeline;
|
24
|
+
}
|
16
25
|
|
17
26
|
void train(DataSet::Example *example);
|
18
27
|
void train_text(string text);
|
@@ -9,6 +9,7 @@ namespace Preprocessing {
|
|
9
9
|
class ExampleGenerator {
|
10
10
|
public:
|
11
11
|
ExampleGenerator() {}
|
12
|
+
virtual ~ExampleGenerator() {}
|
12
13
|
virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens, bool create_features) { return NULL; }
|
13
14
|
virtual uint32_t mark() = 0;
|
14
15
|
};
|
@@ -37,7 +37,7 @@ namespace Preprocessing {
|
|
37
37
|
}
|
38
38
|
|
39
39
|
// construct the example
|
40
|
-
DataSet::SparseExample *example = data_set->new_example(token_counts.size());
|
40
|
+
DataSet::SparseExample *example = data_set->new_example(token_counts.size(), false);
|
41
41
|
DataSet::Feature *feature = NULL;
|
42
42
|
|
43
43
|
for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
|
@@ -28,6 +28,20 @@ namespace Preprocessing {
|
|
28
28
|
TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
|
29
29
|
DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text, bool create_features);
|
30
30
|
void process_token(char *start, char *end);
|
31
|
+
|
32
|
+
~TextPipeline() {
|
33
|
+
// tokens stores ptrs to offsets of a string which is handled externally,
|
34
|
+
// so doesn't need to be released here
|
35
|
+
if(tokeniser)
|
36
|
+
delete tokeniser;
|
37
|
+
if(generator)
|
38
|
+
delete generator;
|
39
|
+
|
40
|
+
for(unsigned int i = 0; i < processors.size(); i++)
|
41
|
+
delete processors[i];
|
42
|
+
for(unsigned int i = 0; i < selectors.size(); i++)
|
43
|
+
delete selectors[i];
|
44
|
+
}
|
31
45
|
};
|
32
46
|
|
33
47
|
TextPipeline *StandardPipeline();
|
@@ -4,7 +4,6 @@
|
|
4
4
|
using namespace std;
|
5
5
|
using namespace tr1;
|
6
6
|
|
7
|
-
static unordered_set<string> *stop_words = NULL;
|
8
7
|
static int stop_word_count = 586;
|
9
8
|
static string stop_word_list[] = {
|
10
9
|
"a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
|
@@ -69,14 +68,11 @@ static string stop_word_list[] = {
|
|
69
68
|
};
|
70
69
|
|
71
70
|
Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
|
72
|
-
|
73
|
-
stop_words
|
74
|
-
for(int i = 0; i < stop_word_count; i++)
|
75
|
-
stop_words->insert(stop_word_list[i]);
|
76
|
-
}
|
71
|
+
for(int i = 0; i < stop_word_count; i++)
|
72
|
+
stop_words.insert(stop_word_list[i]);
|
77
73
|
}
|
78
74
|
|
79
75
|
bool Preprocessing::Text::StopWords::select(char *start, char *end) {
|
80
76
|
string token = string(start, (end - start) + 1);
|
81
|
-
return stop_words
|
77
|
+
return stop_words.count(token) == 0;
|
82
78
|
}
|
@@ -1,6 +1,10 @@
|
|
1
1
|
#ifndef __stop_words_h__
|
2
2
|
#define __stop_words_h__
|
3
3
|
#include "token_selector.h"
|
4
|
+
#include <tr1/unordered_set>
|
5
|
+
using namespace std;
|
6
|
+
using namespace tr1;
|
7
|
+
|
4
8
|
|
5
9
|
namespace Preprocessing {
|
6
10
|
namespace Text {
|
@@ -9,8 +13,10 @@ namespace Preprocessing {
|
|
9
13
|
public:
|
10
14
|
static const uint32_t file_mark = 'stop';
|
11
15
|
uint32_t mark() { return file_mark; }
|
16
|
+
unordered_set<string> stop_words;
|
12
17
|
|
13
18
|
StopWords();
|
19
|
+
~StopWords() {}
|
14
20
|
bool select(char *start, char *end);
|
15
21
|
};
|
16
22
|
|
@@ -104,16 +104,26 @@ DataSet::DataSet *Storage::Binary::read_data_set() {
|
|
104
104
|
|
105
105
|
// read cached frequencies and probabilities if present
|
106
106
|
if(data_set->counted) {
|
107
|
-
|
108
|
-
|
107
|
+
vector<int> *frequencies = read_vector<int>();
|
108
|
+
vector<double> *probabilities = read_vector<double>();
|
109
|
+
nominal_feature->frequencies = *frequencies;
|
110
|
+
nominal_feature->probabilities = *probabilities;
|
109
111
|
nominal_feature->category_frequencies.resize(num_categories + 1);
|
110
112
|
nominal_feature->category_probabilities.resize(num_categories + 1);
|
113
|
+
delete frequencies;
|
114
|
+
delete probabilities;
|
111
115
|
|
112
|
-
for(int i = 1; i <= num_categories; i++)
|
113
|
-
|
116
|
+
for(int i = 1; i <= num_categories; i++) {
|
117
|
+
frequencies = read_vector<int>();
|
118
|
+
nominal_feature->category_frequencies[i] = *frequencies;
|
119
|
+
delete frequencies;
|
120
|
+
}
|
114
121
|
|
115
|
-
for(int i = 1; i <= num_categories; i++)
|
116
|
-
|
122
|
+
for(int i = 1; i <= num_categories; i++) {
|
123
|
+
probabilities = read_vector<double>();
|
124
|
+
nominal_feature->category_probabilities[i] = *probabilities;
|
125
|
+
delete probabilities;
|
126
|
+
}
|
117
127
|
}
|
118
128
|
|
119
129
|
// TODO: read cached indexes
|
@@ -55,6 +55,7 @@ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_
|
|
55
55
|
// insert a new example into the dataset
|
56
56
|
example = pipeline->process_text(data_set, file_data, true);
|
57
57
|
example->set_category_index(data_set, category_index);
|
58
|
+
data_set->examples.push_back(example);
|
58
59
|
|
59
60
|
file_count++;
|
60
61
|
if((file_count % 10000) == 0)
|
data/thera.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |s|
|
|
9
9
|
s.description = "C++ Data Mining Library for Ruby"
|
10
10
|
s.email = "me@willcannings.com"
|
11
11
|
s.authors = ["Will Cannings"]
|
12
|
-
s.version = '0.0.7'
|
12
|
+
s.version = '0.0.8'
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
14
|
|
15
15
|
s.files = `git ls-files`.split("\n")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: thera
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-01-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rice
|
16
|
-
requirement: &
|
16
|
+
requirement: &70151526602040 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70151526602040
|
25
25
|
description: C++ Data Mining Library for Ruby
|
26
26
|
email: me@willcannings.com
|
27
27
|
executables: []
|