thera 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
#include "binary.h"
|
|
2
|
+
#include <stdexcept>
|
|
3
|
+
#include <vector>
|
|
4
|
+
|
|
5
|
+
// ------------------------------------------
|
|
6
|
+
// integer 'magic marks' are used to identify
|
|
7
|
+
// binary files, delineate sections of the
|
|
8
|
+
// file, and can be used to test endianess
|
|
9
|
+
// ------------------------------------------
|
|
10
|
+
static const uint32_t file_mark = 'quar';
|
|
11
|
+
static const uint32_t none_mark = 'none';
|
|
12
|
+
static const uint32_t classifier_mark = 'clas';
|
|
13
|
+
static const uint32_t text_pipeline_mark = 'texp';
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
// ------------------------------------------
|
|
17
|
+
// low level read and write operations
|
|
18
|
+
// ------------------------------------------
|
|
19
|
+
// Writes a native-endian int to the stream as raw bytes (matched by read_int).
void Storage::Binary::write_int(int number) {
    const char *bytes = reinterpret_cast<const char *>(&number);
    file.write(bytes, sizeof(number));
}
|
|
22
|
+
|
|
23
|
+
// Reads a raw native-endian int previously written by write_int.
// Returns 0 if the stream has no more data.
int Storage::Binary::read_int() {
    int result = 0;
    file.read(reinterpret_cast<char *>(&result), sizeof(result));
    return result;
}
|
|
28
|
+
|
|
29
|
+
// Writes a 4-byte section marker used to delineate and sanity-check the file.
void Storage::Binary::write_mark(uint32_t mark) {
    const char *bytes = reinterpret_cast<const char *>(&mark);
    file.write(bytes, sizeof(mark));
}
|
|
32
|
+
|
|
33
|
+
// Reads a 4-byte section marker written by write_mark.
// Returns 0 if the stream has no more data.
uint32_t Storage::Binary::read_mark() {
    uint32_t result = 0;
    file.read(reinterpret_cast<char *>(&result), sizeof(result));
    return result;
}
|
|
38
|
+
|
|
39
|
+
// Serialises a bool as a single byte: 1 for true, 0 for false.
void Storage::Binary::write_bool(bool value) {
    const char byte = value ? 1 : 0;
    file.write(&byte, sizeof(byte));
}
|
|
43
|
+
|
|
44
|
+
// Reads a single-byte bool written by write_bool; any non-zero byte is true.
bool Storage::Binary::read_bool() {
    char byte = 0;
    file.read(&byte, sizeof(byte));
    return byte != 0;
}
|
|
49
|
+
|
|
50
|
+
// Writes the string followed by its NUL terminator; the terminator is the
// delimiter read_string stops at. Embedded NULs would truncate on read.
void Storage::Binary::write_string(string str) {
    file.write(str.c_str(), static_cast<std::streamsize>(str.size() + 1));
}
|
|
53
|
+
|
|
54
|
+
// Reads characters up to (and consuming) the next NUL terminator,
// reversing write_string.
string Storage::Binary::read_string() {
    string result;
    std::getline(file, result, '\0');
    return result;
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
// ------------------------------------------
|
|
62
|
+
// data set
|
|
63
|
+
// ------------------------------------------
|
|
64
|
+
// Reads a complete data set from the binary stream: header, features
// (with optional cached counts), then optional examples. The layout must
// mirror write_data_set exactly. Returns a newly-allocated data set owned
// by the caller.
// NOTE(review): raw structs and vectors are read as native-endian bytes, so
// files are only portable between machines with the same layout/endianness.
DataSet::DataSet *Storage::Binary::read_data_set() {
    DataSet::DataSet *data_set = NULL;
    bool sparse = read_bool();

    // determine the type of data set to create
    if(sparse)
        data_set = new DataSet::SparseDataSet();
    else
        data_set = new DataSet::DenseDataSet();

    // initialise the data set
    data_set->name = read_string();
    data_set->category_index = read_int();
    data_set->counted = read_bool();
    data_set->indexed = read_bool();

    // initialise the data set's features
    DataSet::NominalFeature *nominal_feature;
    DataSet::NumericFeature *numeric_feature;
    int index = 0, count = 0;
    bool nominal = false;
    string name;

    // determine the number of features to read; count caches need to know
    // the number of categories up front
    int num_features = read_int();
    int num_categories = read_int();

    for(int i = 0; i < num_features; i++) {
        nominal = read_bool();
        index = read_int();
        name = read_string();

        if(nominal) {
            nominal_feature = data_set->new_nominal_feature(name);
            nominal_feature->index = index;

            // read the nominal category names (distinct loop variable: the
            // original shadowed the outer feature counter here)
            count = read_int();
            for(int j = 0; j < count; j++)
                nominal_feature->add_value(read_string());

            // read cached frequencies and probabilities if present.
            // read_vector returns a heap-allocated vector: copy it into the
            // feature then delete it (the original leaked one vector per call)
            if(data_set->counted) {
                vector<int> *frequencies = read_vector<int>();
                nominal_feature->frequencies = *frequencies;
                delete frequencies;

                vector<double> *probabilities = read_vector<double>();
                nominal_feature->probabilities = *probabilities;
                delete probabilities;

                // slot 0 is unused; categories are indexed from 1
                nominal_feature->category_frequencies.resize(num_categories + 1);
                nominal_feature->category_probabilities.resize(num_categories + 1);

                for(int j = 1; j <= num_categories; j++) {
                    vector<int> *category_frequencies = read_vector<int>();
                    nominal_feature->category_frequencies[j] = *category_frequencies;
                    delete category_frequencies;
                }

                for(int j = 1; j <= num_categories; j++) {
                    vector<double> *category_probabilities = read_vector<double>();
                    nominal_feature->category_probabilities[j] = *category_probabilities;
                    delete category_probabilities;
                }
            }

            // TODO: read cached indexes
            if(data_set->indexed) {
            }

        } else {
            numeric_feature = data_set->new_numeric_feature(name);
            numeric_feature->index = index;

            // cached counts: raw struct images, matching write_data_set
            if(data_set->counted) {
                file.read((char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
                numeric_feature->category_counts = (DataSet::NumericFeature::Counts *) malloc(sizeof(DataSet::NumericFeature::Counts) * (num_categories + 1));
                for(int j = 1; j <= num_categories; j++)
                    file.read((char *)&(numeric_feature->category_counts[j]), sizeof(DataSet::NumericFeature::Counts));
            }

            // TODO: cached indexes
            if(data_set->indexed) {
            }
        }
    }

    // read examples if present
    should_write_examples = read_bool();
    if(should_write_examples) {
        int num_examples = read_int();

        if(sparse) {
            DataSet::SparseExample *sparse_example;

            for(int i = 0; i < num_examples; i++) {
                // number of non-zero values
                count = read_int();

                // construct & read the example
                sparse_example = ((DataSet::SparseDataSet *) data_set)->new_example(count);
                file.read((char *)sparse_example->values, count * sizeof(DataSet::SparseExample::Value));
                sparse_example->size = count;
            }

        } else {
            // each dense example stores the same number of values
            count = read_int();

            // read each example
            DataSet::DenseExample *dense_example;
            for(int i = 0; i < num_examples; i++) {
                dense_example = ((DataSet::DenseDataSet *) data_set)->new_example();
                file.read((char *)dense_example->values, count * sizeof(double));
            }
        }
    }

    return data_set;
}
|
|
174
|
+
|
|
175
|
+
// Writes a complete data set to the binary stream: header, features
// (with optional cached counts), then optionally all examples (controlled
// by should_write_examples). Layout must mirror read_data_set exactly.
void Storage::Binary::write_data_set(DataSet::DataSet *data_set) {
    bool sparse = (typeid(*data_set) == typeid(DataSet::SparseDataSet));
    int num_categories = data_set->categories_size();
    int num_features = data_set->features_size();
    int num_examples = data_set->examples_size();

    // data set header
    write_bool(sparse);
    write_string(data_set->name);
    write_int(data_set->category_index);
    write_bool(data_set->counted);
    write_bool(data_set->indexed);
    write_int(num_features);
    write_int(num_categories);

    // features
    DataSet::NominalFeature *nominal_feature;
    DataSet::NumericFeature *numeric_feature;
    DataSet::Feature *feature;
    uint32_t count = 0;
    bool nominal;

    for(int i = 0; i < num_features; i++) {
        feature = data_set->features[i];
        nominal = (typeid(*feature) == typeid(DataSet::NominalFeature));
        write_bool(nominal);
        write_int(feature->index);
        write_string(feature->name);

        if(nominal) {
            nominal_feature = (DataSet::NominalFeature *)feature;

            // category names: slot 0 is an unused sentinel so the stored
            // count excludes it (unsigned loop variable; the original
            // shadowed the outer counter and mixed signedness here)
            count = nominal_feature->names.size();
            write_int(count - 1);
            for(uint32_t j = 1; j < count; j++)
                write_string(nominal_feature->names.at(j));

            // cached counts
            if(data_set->counted) {
                write_vector<int>(&(nominal_feature->frequencies));
                write_vector<double>(&(nominal_feature->probabilities));

                for(int j = 1; j <= num_categories; j++)
                    write_vector<int>(&(nominal_feature->category_frequencies.at(j)));

                for(int j = 1; j <= num_categories; j++)
                    write_vector<double>(&(nominal_feature->category_probabilities.at(j)));
            }

            // TODO: cached indexes
            if(data_set->indexed) {
            }

        } else {
            numeric_feature = (DataSet::NumericFeature *)feature;

            // cached counts: raw struct images, matching read_data_set
            if(data_set->counted) {
                file.write((const char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
                for(int j = 1; j <= num_categories; j++)
                    file.write((const char *)&(numeric_feature->category_counts[j]), sizeof(DataSet::NumericFeature::Counts));
            }

            // TODO: cached indexes
            if(data_set->indexed) {
            }
        }
    }

    // examples
    write_bool(should_write_examples);
    if(should_write_examples) {
        write_int(num_examples);

        if(sparse) {
            DataSet::SparseExample *example;
            for(int i = 0; i < num_examples; i++) {
                example = (DataSet::SparseExample *) data_set->examples[i];
                count = example->size;
                write_int(count);
                file.write((char *)(example->values), count * sizeof(DataSet::SparseExample::Value));
            }

        } else {
            // each dense example stores the same number of values; guard the
            // empty case (the original indexed examples[0] unconditionally,
            // which is out-of-bounds when there are no examples)
            count = (num_examples > 0) ? data_set->examples[0]->size : 0;
            write_int(count);

            // write each example
            for(int i = 0; i < num_examples; i++)
                file.write((char *)((DataSet::DenseExample *)data_set->examples[i])->values, count * sizeof(double));
        }
    }
}
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
// ------------------------------------------
|
|
273
|
+
// classifiers
|
|
274
|
+
// ------------------------------------------
|
|
275
|
+
// Reads the classifier section. Returns NULL if the file recorded no
// classifier; throws if the section marker is not recognised. The returned
// classifier is newly allocated and owned by the caller.
Classifier::Classifier *Storage::Binary::read_classifier(DataSet::DataSet *data_set) {
    uint32_t section = read_mark();
    if(section == none_mark)
        return NULL;
    if(section != classifier_mark)
        throw runtime_error("Expected classifier section");

    // instantiate the concrete classifier identified by its type mark;
    // unknown types yield NULL
    Classifier::Classifier *classifier = NULL;
    uint32_t type = read_mark();
    if(type == Classifier::NaiveBayesClassifier::file_mark)
        classifier = new Classifier::NaiveBayesClassifier(data_set);

    // let the classifier deserialise its own state
    if(classifier)
        classifier->read_binary(this);
    return classifier;
}
|
|
295
|
+
|
|
296
|
+
// Writes the classifier section: a 'none' marker when there is no
// classifier, otherwise the section marker, the classifier's type mark, and
// its serialised state.
void Storage::Binary::write_classifier(Classifier::Classifier *classifier) {
    if(classifier == NULL) {
        write_mark(none_mark);
        return;
    }

    write_mark(classifier_mark);
    write_mark(classifier->mark());
    classifier->write_binary(this);
}
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
// ------------------------------------------
|
|
308
|
+
// text pipeline
|
|
309
|
+
// ------------------------------------------
|
|
310
|
+
// Reads the text pipeline section: tokeniser, inplace processors, token
// selectors, and example generator, each identified by a 4-byte type mark.
// Returns NULL if the file recorded no pipeline; throws if the section
// marker is not recognised. The returned pipeline is owned by the caller.
// NOTE(review): unrecognised type marks are silently skipped, leaving the
// corresponding pipeline slot NULL/empty — confirm this is intended.
Preprocessing::Text::TextPipeline *Storage::Binary::read_text_pipeline() {
    uint32_t mark = read_mark();
    if(mark == none_mark)
        return NULL;
    else if(mark != text_pipeline_mark)
        throw runtime_error("Expected text pipeline section");

    Preprocessing::Text::TextPipeline *pipeline = new Preprocessing::Text::TextPipeline();

    // tokeniser: a single type mark
    switch(read_mark()) {
        case Preprocessing::Text::SimpleTokeniser::file_mark:
            pipeline->tokeniser = new Preprocessing::Text::SimpleTokeniser(pipeline);
            break;
    }

    // inplace processors: a count followed by one type mark each
    int count = read_int();
    for(int i = 0; i < count; i++) {
        switch(read_mark()) {
            case Preprocessing::Text::Downcase::file_mark:
                pipeline->processors.push_back(new Preprocessing::Text::Downcase());
                break;
            case Preprocessing::Text::PorterStemmer::file_mark:
                pipeline->processors.push_back(new Preprocessing::Text::PorterStemmer());
                break;
        }
    }

    // token selectors: same layout as processors
    count = read_int();
    for(int i = 0; i < count; i++) {
        switch(read_mark()) {
            case Preprocessing::Text::StopWords::file_mark:
                pipeline->selectors.push_back(new Preprocessing::Text::StopWords());
                break;
            case Preprocessing::Text::POSTagSelector::file_mark:
                pipeline->selectors.push_back(new Preprocessing::Text::POSTagSelector());
                break;
        }
    }

    // example generator: a single type mark
    switch(read_mark()) {
        case Preprocessing::Text::TokenCounter::file_mark:
            pipeline->generator = new Preprocessing::Text::TokenCounter();
            break;
    }

    return pipeline;
}
|
|
361
|
+
|
|
362
|
+
// Writes the text pipeline section: a 'none' marker when there is no
// pipeline, otherwise the section marker followed by the type marks of the
// tokeniser, each processor, each selector, and the example generator
// (mirrored by read_text_pipeline).
void Storage::Binary::write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
    if(pipeline == NULL) {
        write_mark(none_mark);
        return;
    }
    write_mark(text_pipeline_mark);

    // tokeniser
    write_mark(pipeline->tokeniser->mark());

    // inplace processors: count, then one mark per processor
    int processor_count = pipeline->processors.size();
    write_int(processor_count);
    for(int i = 0; i < processor_count; i++)
        write_mark(pipeline->processors[i]->mark());

    // token selectors: same layout
    int selector_count = pipeline->selectors.size();
    write_int(selector_count);
    for(int i = 0; i < selector_count; i++)
        write_mark(pipeline->selectors[i]->mark());

    // example generator
    write_mark(pipeline->generator->mark());
}
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
// ------------------------------------------
|
|
391
|
+
// helpers
|
|
392
|
+
// ------------------------------------------
|
|
393
|
+
void Storage::Binary::open_for_reading() {
|
|
394
|
+
// open file
|
|
395
|
+
file.open(path.c_str(), fstream::in | fstream::binary);
|
|
396
|
+
|
|
397
|
+
// ensure file is ok for reading
|
|
398
|
+
if(!file.good())
|
|
399
|
+
throw runtime_error("Error opening binary file for reading");
|
|
400
|
+
|
|
401
|
+
// quick sanity check
|
|
402
|
+
if(read_mark() != file_mark)
|
|
403
|
+
throw runtime_error("Binary file mark is invalid");
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// Opens (creating or truncating) the file at `path` for binary writing and
// emits the leading file mark that open_for_reading later validates.
// Throws runtime_error if the file cannot be opened.
void Storage::Binary::open_for_writing() {
    file.open(path.c_str(), fstream::out | fstream::binary);
    if(!file.good())
        throw runtime_error("Error opening binary file for writing");

    // write the file marker so reads can test the file format
    write_mark(file_mark);
}
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
// ------------------------------------------
|
|
420
|
+
// public read & write methods
|
|
421
|
+
// ------------------------------------------
|
|
422
|
+
// Public entry point: opens the file, reads a single data set, and closes
// the file. The returned data set is owned by the caller.
DataSet::DataSet *Storage::Binary::read() {
    open_for_reading();
    DataSet::DataSet *result = read_data_set();
    file.close();
    return result;
}
|
|
428
|
+
|
|
429
|
+
// Public entry point: opens the file, writes a single data set, and closes
// the file.
void Storage::Binary::write(DataSet::DataSet *data_set) {
    open_for_writing();
    write_data_set(data_set);
    file.close();
}
|
|
434
|
+
|
|
435
|
+
// Public entry point: reads a full model (data set, classifier, text
// pipeline — in that order, matching write_model). The returned model and
// its components are owned by the caller.
Model::Model *Storage::Binary::read_model() {
    open_for_reading();

    Model::Model *model = new Model::Model();
    model->data_set = read_data_set();
    // the classifier needs the data set it was trained on
    model->classifier = read_classifier(model->data_set);
    model->text_pipeline = read_text_pipeline();

    file.close();
    return model;
}
|
|
447
|
+
|
|
448
|
+
// Public entry point: writes a full model (data set, classifier, text
// pipeline — in that order, matching read_model).
void Storage::Binary::write_model(Model::Model *model) {
    open_for_writing();

    write_data_set(model->data_set);
    write_classifier(model->classifier);
    write_text_pipeline(model->text_pipeline);

    file.close();
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#ifndef __binary_h__
|
|
2
|
+
#define __binary_h__
|
|
3
|
+
#include "storage/storage.h"
|
|
4
|
+
#include "data_set/dense/dense_data_set.h"
|
|
5
|
+
#include "data_set/sparse/sparse_data_set.h"
|
|
6
|
+
#include "classifier/naive_bayes/naive_bayes_classifier.h"
|
|
7
|
+
#include <fstream>
|
|
8
|
+
using namespace std;
|
|
9
|
+
|
|
10
|
+
namespace Storage {
|
|
11
|
+
// Binary file storage: serialises data sets and trained models to a compact
// native-endian binary format. Sections are delimited by 4-byte marks so
// readers can validate the file. Files are not portable across machines
// with different endianness or struct layout.
class Binary : public Storage {
  string path;
  fstream file;

  // helpers: open `path` and write/validate the leading file mark
  void open_for_reading();
  void open_for_writing();

  // low level IO: each write_* has a matching read_* with the same layout
  void write_string(string str);
  string read_string();
  void write_int(int number);
  int read_int();
  void write_mark(uint32_t mark);
  uint32_t read_mark();
  void write_bool(bool value);
  bool read_bool();

  // these templated functions are used outside this class,
  // so their definition needs to be in this header file for
  // each version of the function to be generated

  // Reads a length-prefixed vector of raw T values. Returns a heap-allocated
  // vector: the CALLER OWNS the result and must delete it.
  template<class T> vector<T> *read_vector() {
    vector<T> *values = new vector<T>();
    int size = read_int();
    values->reserve(size);
    T value;

    for(int i = 0; i < size; i++) {
      file.read((char *)(&value), sizeof(T));
      values->push_back(value);
    }

    return values;
  }

  // Writes a vector as a length prefix followed by raw T values.
  template<class T> void write_vector(vector<T> *values) {
    uint32_t size = values->size();
    write_int(size);

    for(int i = 0; i < size; i++)
      file.write((char *)(&values->at(i)), sizeof(T));
  }

  // serialisation: one read/write pair per model component
  DataSet::DataSet *read_data_set();
  void write_data_set(DataSet::DataSet *data_set);
  Classifier::Classifier *read_classifier(DataSet::DataSet *data_set);
  void write_classifier(Classifier::Classifier *classifier);
  Preprocessing::Text::TextPipeline *read_text_pipeline();
  void write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline);

public:
  // when true, write() / write_model() also serialise every example
  bool should_write_examples;
  Binary(string path) : path(path), should_write_examples(false) {}
  bool get_write_examples() { return should_write_examples; }
  void set_write_examples(bool write) { should_write_examples = write;}

  DataSet::DataSet *read();
  Model::Model *read_model();
  void write(DataSet::DataSet *data_set);
  void write_model(Model::Model *model);

  // friends use the private low-level IO and templated vector helpers
  friend class Preprocessing::Text::TextPipeline;
  friend class Classifier::Classifier;
  friend class Classifier::NaiveBayesClassifier;
};
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
#endif
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#include "folders.h"
|
|
2
|
+
#include <fstream>
|
|
3
|
+
#include <iostream>
|
|
4
|
+
#include <stdlib.h>
|
|
5
|
+
#include <dirent.h>
|
|
6
|
+
#include <sys/stat.h>
|
|
7
|
+
using namespace std;
|
|
8
|
+
|
|
9
|
+
static char *file_data = NULL;
|
|
10
|
+
static int file_data_size = 0;
|
|
11
|
+
static int file_count = 0;
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_set, int category_index) {
|
|
15
|
+
DataSet::SparseExample *example;
|
|
16
|
+
DIR *dir = opendir(path.c_str());
|
|
17
|
+
struct dirent *dp;
|
|
18
|
+
char *name;
|
|
19
|
+
string newpath;
|
|
20
|
+
struct stat info;
|
|
21
|
+
FILE *file;
|
|
22
|
+
int file_length;
|
|
23
|
+
|
|
24
|
+
while((dp = readdir(dir))) {
|
|
25
|
+
// ignore files starting with a dot
|
|
26
|
+
name = dp->d_name;
|
|
27
|
+
if(*name == '.')
|
|
28
|
+
continue;
|
|
29
|
+
|
|
30
|
+
// ensure this is a file, not a folder
|
|
31
|
+
newpath = path + "/" + name;
|
|
32
|
+
stat(newpath.c_str(), &info);
|
|
33
|
+
if(info.st_mode & S_IFDIR)
|
|
34
|
+
continue;
|
|
35
|
+
|
|
36
|
+
// determine if the file_data buffer is large enough to hold this file
|
|
37
|
+
file = fopen(newpath.c_str(), "rb");
|
|
38
|
+
fseek(file, 0, SEEK_END);
|
|
39
|
+
file_length = ftell(file) + 1;
|
|
40
|
+
rewind(file);
|
|
41
|
+
|
|
42
|
+
if(file_data_size < file_length) {
|
|
43
|
+
if(file_data != NULL)
|
|
44
|
+
free(file_data);
|
|
45
|
+
file_data = (char *)malloc(file_length);
|
|
46
|
+
file_data_size = file_length;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// read into the buffer
|
|
50
|
+
fread(file_data, 1, file_length - 1, file);
|
|
51
|
+
file_data[file_length - 1] = 0;
|
|
52
|
+
fclose(file);
|
|
53
|
+
|
|
54
|
+
// insert a new example into the dataset
|
|
55
|
+
example = pipeline->process_text(data_set, file_data);
|
|
56
|
+
example->set_category_index(data_set, category_index);
|
|
57
|
+
|
|
58
|
+
file_count++;
|
|
59
|
+
if((file_count % 10000) == 0)
|
|
60
|
+
cout << "Read " << file_count << endl;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
closedir(dir);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// Builds a sparse data set from a directory tree: each immediate
// subdirectory of `path` is a category, and every file inside it becomes
// one example. Returns a newly-allocated (possibly empty) data set owned by
// the caller.
DataSet::DataSet *Storage::Folders::read() {
    DataSet::SparseDataSet *data_set = new DataSet::SparseDataSet();
    DIR *dir = opendir(path.c_str());
    struct dirent *dp;
    char *name;
    string newpath;
    struct stat info;

    // create an initial feature "Category" used as the class label
    DataSet::NominalFeature *categories = data_set->new_nominal_feature("Category");
    data_set->set_category_index(0);
    int category_index = 0;

    // the original passed a NULL dir straight to readdir (undefined
    // behaviour) when the directory could not be opened
    if(dir == NULL)
        return data_set;

    while((dp = readdir(dir))) {
        // ignore files starting with a dot
        name = dp->d_name;
        if(*name == '.')
            continue;

        // ensure this is a folder; its name becomes the category label
        newpath = path + "/" + name;
        if(stat(newpath.c_str(), &info) != 0)
            continue;
        if(info.st_mode & S_IFDIR) {
            category_index = categories->value_index(string(name));
            load_directory(newpath, data_set, category_index);
        }
    }

    // the original leaked the directory handle here
    closedir(dir);
    return data_set;
}
|
|
96
|
+
|
|
97
|
+
// Writing a data set back out as a folder tree is not supported; this
// override exists only to satisfy the Storage interface and is a no-op.
void Storage::Folders::write(DataSet::DataSet *data_set) {
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#ifndef __folders_h__
|
|
2
|
+
#define __folders_h__
|
|
3
|
+
#include "preprocessing/text/text_pipeline.h"
|
|
4
|
+
#include "data_set/data_set.h"
|
|
5
|
+
#include "storage/storage.h"
|
|
6
|
+
#include <algorithm>
|
|
7
|
+
#include <cctype>
|
|
8
|
+
#include <string>
|
|
9
|
+
using namespace std;
|
|
10
|
+
|
|
11
|
+
namespace Storage {
|
|
12
|
+
// Folder-tree storage: reads a corpus laid out as one subdirectory per
// category, with one text file per example. Files are converted to sparse
// examples via the supplied text pipeline. Writing is not supported.
class Folders : public Storage {
  // reads every file in `path` into `data_set` under `category_index`
  void load_directory(string path, DataSet::SparseDataSet *data_set, int category_index);

public:
  string path;                                 // root directory of the corpus
  Preprocessing::Text::TextPipeline *pipeline; // converts raw text to examples (not owned)

  Folders(string path, Preprocessing::Text::TextPipeline *pipeline) : path(path), pipeline(pipeline) {}
  DataSet::DataSet *read();
  void write(DataSet::DataSet *data_set);
};
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
#endif
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#ifndef __storage_h__
|
|
2
|
+
#define __storage_h__
|
|
3
|
+
#include "data_set/data_set.h"
|
|
4
|
+
#include "model/model.h"
|
|
5
|
+
|
|
6
|
+
namespace Storage {
|
|
7
|
+
class Storage {
|
|
8
|
+
public:
|
|
9
|
+
// all storage implementations must be able to read and write data sets
|
|
10
|
+
virtual DataSet::DataSet *read() = 0;
|
|
11
|
+
virtual void write(DataSet::DataSet *data_set) = 0;
|
|
12
|
+
|
|
13
|
+
// some implementations can read and write trained models
|
|
14
|
+
virtual Model::Model *read_model() { return NULL; }
|
|
15
|
+
virtual void write_model(Model::Model *model) {}
|
|
16
|
+
};
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
#endif
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module Quarry
  module Classifier
    # Thin Ruby wrapper around a native classifier object (stored in
    # @classifier). Subclasses are expected to assign @classifier; this base
    # class only holds the data set and delegates calls to the native object.
    class Classifier
      # the underlying native classifier instance
      attr_reader :classifier

      def initialize(data_set)
        @data_set = data_set
      end

      # Delegates training/preparation to the native classifier.
      def prepare
        @classifier.prepare
      end

      # Classifies a wrapped example and maps the native category index back
      # to the data set's category label.
      def classify(example)
        @data_set.categories[@classifier.classify_to_index(example.example)]
      end

      # Returns the native classifier's ranking for a wrapped example.
      def rank(example)
        @classifier.rank(example.example)
      end
    end
  end
end
|