thera 0.0.1
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
data/lib/quarry/src/storage/binary.cpp
@@ -0,0 +1,457 @@
#include "binary.h"
#include <stdexcept>
#include <vector>

// ------------------------------------------
// integer 'magic marks' are used to identify
// binary files, delineate sections of the
// file, and can be used to test endianness
// ------------------------------------------
static const uint32_t file_mark = 'quar';
static const uint32_t none_mark = 'none';
static const uint32_t classifier_mark = 'clas';
static const uint32_t text_pipeline_mark = 'texp';


// ------------------------------------------
// low level read and write operations
// ------------------------------------------
void Storage::Binary::write_int(int number) {
  file.write((char *)(&number), sizeof(int));
}

int Storage::Binary::read_int() {
  int value = 0;
  file.read((char *)(&value), sizeof(int));
  return value;
}

void Storage::Binary::write_mark(uint32_t mark) {
  file.write((char *)(&mark), sizeof(uint32_t));
}

uint32_t Storage::Binary::read_mark() {
  uint32_t value = 0;
  file.read((char *)(&value), sizeof(uint32_t));
  return value;
}

void Storage::Binary::write_bool(bool value) {
  char file_value = (value ? 1 : 0);
  file.write(&file_value, 1);
}

bool Storage::Binary::read_bool() {
  char value = 0;
  file.read(&value, 1);
  return value != 0;
}

void Storage::Binary::write_string(string str) {
  file.write(str.c_str(), str.length() + 1);
}

string Storage::Binary::read_string() {
  string str;
  std::getline(file, str, '\0');
  return str;
}


// ------------------------------------------
// data set
// ------------------------------------------
DataSet::DataSet *Storage::Binary::read_data_set() {
  DataSet::DataSet *data_set = NULL;
  bool sparse = read_bool();

  // determine the type of data set to create
  if(sparse)
    data_set = new DataSet::SparseDataSet();
  else
    data_set = new DataSet::DenseDataSet();

  // initialise the data set
  data_set->name = read_string();
  data_set->category_index = read_int();
  data_set->counted = read_bool();
  data_set->indexed = read_bool();

  // initialise the data set's features
  DataSet::NominalFeature *nominal_feature;
  DataSet::NumericFeature *numeric_feature;
  int index = 0, count = 0;
  bool nominal = false;
  string name;

  // determine the number of features to read; count caches need to know the number of categories up front
  int num_features = read_int();
  int num_categories = read_int();

  for(int i = 0; i < num_features; i++) {
    nominal = read_bool();
    index = read_int();
    name = read_string();

    if(nominal) {
      nominal_feature = data_set->new_nominal_feature(name);
      nominal_feature->index = index;

      // read the nominal category names
      count = read_int();
      for(int i = 0; i < count; i++)
        nominal_feature->add_value(read_string());

      // read cached frequencies and probabilities if present
      if(data_set->counted) {
        nominal_feature->frequencies = *read_vector<int>();
        nominal_feature->probabilities = *read_vector<double>();
        nominal_feature->category_frequencies.resize(num_categories + 1);
        nominal_feature->category_probabilities.resize(num_categories + 1);

        for(int i = 1; i <= num_categories; i++)
          nominal_feature->category_frequencies[i] = *read_vector<int>();

        for(int i = 1; i <= num_categories; i++)
          nominal_feature->category_probabilities[i] = *read_vector<double>();
      }

      // TODO: read cached indexes
      if(data_set->indexed) {
      }

    } else {
      numeric_feature = data_set->new_numeric_feature(name);
      numeric_feature->index = index;

      // cached counts
      if(data_set->counted) {
        file.read((char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
        numeric_feature->category_counts = (DataSet::NumericFeature::Counts *) malloc(sizeof(DataSet::NumericFeature::Counts) * (num_categories + 1));
        for(int i = 1; i <= num_categories; i++)
          file.read((char *)&(numeric_feature->category_counts[i]), sizeof(DataSet::NumericFeature::Counts));
      }

      // TODO: cached indexes
      if(data_set->indexed) {
      }
    }
  }

  // read examples if present
  should_write_examples = read_bool();
  if(should_write_examples) {
    int num_examples = read_int();

    if(sparse) {
      DataSet::SparseExample *sparse_example;

      for(int i = 0; i < num_examples; i++) {
        // number of non-zero values
        count = read_int();

        // construct & read the example
        sparse_example = ((DataSet::SparseDataSet *) data_set)->new_example(count);
        file.read((char *)sparse_example->values, count * sizeof(DataSet::SparseExample::Value));
        sparse_example->size = count;
      }

    } else {
      // each dense example stores the same number of values
      count = read_int();

      // read each example
      DataSet::DenseExample *dense_example;
      for(int i = 0; i < num_examples; i++) {
        dense_example = ((DataSet::DenseDataSet *) data_set)->new_example();
        file.read((char *)dense_example->values, count * sizeof(double));
      }
    }
  }

  return data_set;
}

void Storage::Binary::write_data_set(DataSet::DataSet *data_set) {
  bool sparse = (typeid(*data_set) == typeid(DataSet::SparseDataSet));
  int num_categories = data_set->categories_size();
  int num_features = data_set->features_size();
  int num_examples = data_set->examples_size();

  // data set header
  write_bool(sparse);
  write_string(data_set->name);
  write_int(data_set->category_index);
  write_bool(data_set->counted);
  write_bool(data_set->indexed);
  write_int(num_features);
  write_int(num_categories);

  // features
  DataSet::NominalFeature *nominal_feature;
  DataSet::NumericFeature *numeric_feature;
  DataSet::Feature *feature;
  uint32_t count = 0;
  bool nominal;

  for(int i = 0; i < num_features; i++) {
    feature = data_set->features[i];
    nominal = (typeid(*feature) == typeid(DataSet::NominalFeature));
    write_bool(nominal);
    write_int(feature->index);
    write_string(feature->name);

    if(nominal) {
      nominal_feature = (DataSet::NominalFeature *)feature;

      // category names
      count = nominal_feature->names.size();
      write_int(count - 1);
      for(int i = 1; i < count; i++)
        write_string(nominal_feature->names.at(i));

      // cached counts
      if(data_set->counted) {
        write_vector<int>(&(nominal_feature->frequencies));
        write_vector<double>(&(nominal_feature->probabilities));

        for(int i = 1; i <= num_categories; i++)
          write_vector<int>(&(nominal_feature->category_frequencies.at(i)));

        for(int i = 1; i <= num_categories; i++)
          write_vector<double>(&(nominal_feature->category_probabilities.at(i)));
      }

      // TODO: cached indexes
      if(data_set->indexed) {
      }

    } else {
      numeric_feature = (DataSet::NumericFeature *)feature;

      // cached counts
      if(data_set->counted) {
        file.write((const char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
        for(int i = 1; i <= num_categories; i++)
          file.write((const char *)&(numeric_feature->category_counts[i]), sizeof(DataSet::NumericFeature::Counts));
      }

      // TODO: cached indexes
      if(data_set->indexed) {
      }
    }
  }

  // examples
  write_bool(should_write_examples);
  if(should_write_examples) {
    write_int(num_examples);

    if(sparse) {
      DataSet::SparseExample *example;
      for(int i = 0; i < num_examples; i++) {
        example = (DataSet::SparseExample *) data_set->examples[i];
        count = example->size;
        write_int(count);
        file.write((char *)(example->values), count * sizeof(DataSet::SparseExample::Value));
      }

    } else {
      // each dense example stores the same number of values
      count = data_set->examples[0]->size;
      write_int(count);

      // write each example
      for(int i = 0; i < num_examples; i++)
        file.write((char *)((DataSet::DenseExample *)data_set->examples[i])->values, count * sizeof(double));
    }
  }
}


// ------------------------------------------
// classifiers
// ------------------------------------------
Classifier::Classifier *Storage::Binary::read_classifier(DataSet::DataSet *data_set) {
  uint32_t mark = read_mark();
  if(mark == none_mark)
    return NULL;
  else if(mark != classifier_mark)
    throw runtime_error("Expected classifier section");

  Classifier::Classifier *classifier = NULL;
  uint32_t type = read_mark();

  switch(type) {
    case Classifier::NaiveBayesClassifier::file_mark:
      classifier = new Classifier::NaiveBayesClassifier(data_set);
      break;
  }

  if(classifier)
    classifier->read_binary(this);
  return classifier;
}

void Storage::Binary::write_classifier(Classifier::Classifier *classifier) {
  if(!classifier) {
    write_mark(none_mark);
  } else {
    write_mark(classifier_mark);
    write_mark(classifier->mark());
    classifier->write_binary(this);
  }
}


// ------------------------------------------
// text pipeline
// ------------------------------------------
Preprocessing::Text::TextPipeline *Storage::Binary::read_text_pipeline() {
  uint32_t mark = read_mark();
  if(mark == none_mark)
    return NULL;
  else if(mark != text_pipeline_mark)
    throw runtime_error("Expected text pipeline section");

  Preprocessing::Text::TextPipeline *pipeline = new Preprocessing::Text::TextPipeline();

  // tokeniser
  switch(read_mark()) {
    case Preprocessing::Text::SimpleTokeniser::file_mark:
      pipeline->tokeniser = new Preprocessing::Text::SimpleTokeniser(pipeline);
      break;
  }

  // inplace processors
  int count = read_int();
  for(int i = 0; i < count; i++) {
    switch(read_mark()) {
      case Preprocessing::Text::Downcase::file_mark:
        pipeline->processors.push_back(new Preprocessing::Text::Downcase());
        break;
      case Preprocessing::Text::PorterStemmer::file_mark:
        pipeline->processors.push_back(new Preprocessing::Text::PorterStemmer());
        break;
    }
  }

  // token selectors
  count = read_int();
  for(int i = 0; i < count; i++) {
    switch(read_mark()) {
      case Preprocessing::Text::StopWords::file_mark:
        pipeline->selectors.push_back(new Preprocessing::Text::StopWords());
        break;
      case Preprocessing::Text::POSTagSelector::file_mark:
        pipeline->selectors.push_back(new Preprocessing::Text::POSTagSelector());
        break;
    }
  }

  // example generator
  switch(read_mark()) {
    case Preprocessing::Text::TokenCounter::file_mark:
      pipeline->generator = new Preprocessing::Text::TokenCounter();
      break;
  }

  return pipeline;
}

void Storage::Binary::write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
  if(!pipeline) {
    write_mark(none_mark);
    return;
  } else {
    write_mark(text_pipeline_mark);
  }

  // tokeniser
  write_mark(pipeline->tokeniser->mark());

  // inplace processors
  int count = pipeline->processors.size();
  write_int(count);
  for(int i = 0; i < count; i++)
    write_mark(pipeline->processors[i]->mark());

  // token selectors
  count = pipeline->selectors.size();
  write_int(count);
  for(int i = 0; i < count; i++)
    write_mark(pipeline->selectors[i]->mark());

  // example generator
  write_mark(pipeline->generator->mark());
}


// ------------------------------------------
// helpers
// ------------------------------------------
void Storage::Binary::open_for_reading() {
  // open file
  file.open(path.c_str(), fstream::in | fstream::binary);

  // ensure file is ok for reading
  if(!file.good())
    throw runtime_error("Error opening binary file for reading");

  // quick sanity check
  if(read_mark() != file_mark)
    throw runtime_error("Binary file mark is invalid");
}

void Storage::Binary::open_for_writing() {
  // open/create file
  file.open(path.c_str(), fstream::out | fstream::binary);

  // ensure file is ok for writing
  if(!file.good())
    throw runtime_error("Error opening binary file for writing");

  // write the file marker so reads can test the file format
  write_mark(file_mark);
}


// ------------------------------------------
// public read & write methods
// ------------------------------------------
DataSet::DataSet *Storage::Binary::read() {
  open_for_reading();
  DataSet::DataSet *data_set = read_data_set();
  file.close();
  return data_set;
}

void Storage::Binary::write(DataSet::DataSet *data_set) {
  open_for_writing();
  write_data_set(data_set);
  file.close();
}

Model::Model *Storage::Binary::read_model() {
  open_for_reading();

  // read the 3 model components
  Model::Model *model = new Model::Model();
  model->data_set = read_data_set();
  model->classifier = read_classifier(model->data_set);
  model->text_pipeline = read_text_pipeline();

  file.close();
  return model;
}

void Storage::Binary::write_model(Model::Model *model) {
  open_for_writing();

  // write the 3 model components
  write_data_set(model->data_set);
  write_classifier(model->classifier);
  write_text_pipeline(model->text_pipeline);

  file.close();
}
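The 'magic marks' comment above bears a concrete illustration. Below is a standalone sketch (not part of the gem) showing why re-reading the 'quar' mark doubles as a byte-order check; the exact integer value of a multi-character literal is implementation-defined, so the 0x71756172 value shown is only what GCC/Clang typically produce.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint32_t file_mark = 'quar';              // e.g. 0x71756172 with GCC/Clang
  unsigned char bytes[sizeof(file_mark)];
  std::memcpy(bytes, &file_mark, sizeof(file_mark));

  // On a little-endian machine the file begins with the bytes 'r','a','u','q';
  // on a big-endian machine with 'q','u','a','r'. A reader using the other byte
  // order reconstructs a different integer, so open_for_reading() rejects the file.
  std::printf("mark = 0x%08x, on-disk bytes = %c%c%c%c\n",
              file_mark, bytes[0], bytes[1], bytes[2], bytes[3]);
  return 0;
}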
data/lib/quarry/src/storage/binary.h
@@ -0,0 +1,79 @@
#ifndef __binary_h__
#define __binary_h__
#include "storage/storage.h"
#include "data_set/dense/dense_data_set.h"
#include "data_set/sparse/sparse_data_set.h"
#include "classifier/naive_bayes/naive_bayes_classifier.h"
#include <fstream>
using namespace std;

namespace Storage {
  class Binary : public Storage {
    string path;
    fstream file;

    // helpers
    void open_for_reading();
    void open_for_writing();

    // low level IO
    void write_string(string str);
    string read_string();
    void write_int(int number);
    int read_int();
    void write_mark(uint32_t mark);
    uint32_t read_mark();
    void write_bool(bool value);
    bool read_bool();

    // these templated functions are used outside this class,
    // so their definition needs to be in this header file for
    // each version of the function to be generated
    template<class T> vector<T> *read_vector() {
      vector<T> *values = new vector<T>();
      int size = read_int();
      values->reserve(size);
      T value;

      for(int i = 0; i < size; i++) {
        file.read((char *)(&value), sizeof(T));
        values->push_back(value);
      }

      return values;
    }

    template<class T> void write_vector(vector<T> *values) {
      uint32_t size = values->size();
      write_int(size);

      for(int i = 0; i < size; i++)
        file.write((char *)(&values->at(i)), sizeof(T));
    }

    // serialisation
    DataSet::DataSet *read_data_set();
    void write_data_set(DataSet::DataSet *data_set);
    Classifier::Classifier *read_classifier(DataSet::DataSet *data_set);
    void write_classifier(Classifier::Classifier *classifier);
    Preprocessing::Text::TextPipeline *read_text_pipeline();
    void write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline);

    public:
      bool should_write_examples;
      Binary(string path) : path(path), should_write_examples(false) {}
      bool get_write_examples() { return should_write_examples; }
      void set_write_examples(bool write) { should_write_examples = write; }

      DataSet::DataSet *read();
      Model::Model *read_model();
      void write(DataSet::DataSet *data_set);
      void write_model(Model::Model *model);

      friend class Preprocessing::Text::TextPipeline;
      friend class Classifier::Classifier;
      friend class Classifier::NaiveBayesClassifier;
  };
}

#endif
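For context, here is a minimal usage sketch of the public interface declared above. It assumes an already-trained Model::Model pointer named model and a writable path "model.bin"; neither comes from the gem's documentation, and memory management of the restored model is left out.

#include "storage/binary.h"
#include "model/model.h"

void save_and_reload(Model::Model *model) {
  // persist the data set, classifier and text pipeline into one binary file
  Storage::Binary writer("model.bin");   // hypothetical path
  writer.set_write_examples(false);      // omit raw examples to keep the file small
  writer.write_model(model);

  // read the same three sections back; open_for_reading() verifies the 'quar' mark
  Storage::Binary reader("model.bin");
  Model::Model *restored = reader.read_model();
  // ... use restored->classifier and restored->text_pipeline ...
}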
data/lib/quarry/src/storage/folders.cpp
@@ -0,0 +1,98 @@
#include "folders.h"
#include <fstream>
#include <iostream>
#include <stdlib.h>
#include <dirent.h>
#include <sys/stat.h>
using namespace std;

static char *file_data = NULL;
static int file_data_size = 0;
static int file_count = 0;


void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_set, int category_index) {
  DataSet::SparseExample *example;
  DIR *dir = opendir(path.c_str());
  struct dirent *dp;
  char *name;
  string newpath;
  struct stat info;
  FILE *file;
  int file_length;

  while((dp = readdir(dir))) {
    // ignore files starting with a dot
    name = dp->d_name;
    if(*name == '.')
      continue;

    // ensure this is a file, not a folder
    newpath = path + "/" + name;
    stat(newpath.c_str(), &info);
    if(info.st_mode & S_IFDIR)
      continue;

    // determine if the file_data buffer is large enough to hold this file
    file = fopen(newpath.c_str(), "rb");
    fseek(file, 0, SEEK_END);
    file_length = ftell(file) + 1;
    rewind(file);

    if(file_data_size < file_length) {
      if(file_data != NULL)
        free(file_data);
      file_data = (char *)malloc(file_length);
      file_data_size = file_length;
    }

    // read into the buffer
    fread(file_data, 1, file_length - 1, file);
    file_data[file_length - 1] = 0;
    fclose(file);

    // insert a new example into the dataset
    example = pipeline->process_text(data_set, file_data);
    example->set_category_index(data_set, category_index);

    file_count++;
    if((file_count % 10000) == 0)
      cout << "Read " << file_count << endl;
  }

  closedir(dir);
}

DataSet::DataSet *Storage::Folders::read() {
  DataSet::SparseDataSet *data_set = new DataSet::SparseDataSet();
  DIR *dir = opendir(path.c_str());
  struct dirent *dp;
  char *name;
  string newpath;
  struct stat info;

  // create an initial feature "Category"
  DataSet::NominalFeature *categories = data_set->new_nominal_feature("Category");
  data_set->set_category_index(0);
  int category_index = 0;

  while((dp = readdir(dir))) {
    // ignore files starting with a dot
    name = dp->d_name;
    if(*name == '.')
      continue;

    // ensure this is a folder
    newpath = path + "/" + name;
    stat(newpath.c_str(), &info);
    if(info.st_mode & S_IFDIR) {
      category_index = categories->value_index(string(name));
      load_directory(newpath, data_set, category_index);
    }
  }

  return data_set;
}

void Storage::Folders::write(DataSet::DataSet *data_set) {
}
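A hedged sketch of how Storage::Folders appears intended to be used: a root directory whose subfolders name the categories, each holding one document per file. The "reviews/pos" and "reviews/neg" layout is hypothetical, and the pipeline construction assumes the TextPipeline members seen in binary.cpp (tokeniser, processors, generator) are directly assignable.

#include "storage/folders.h"
#include "preprocessing/text/text_pipeline.h"
#include "preprocessing/text/tokeniser/simple_tokeniser.h"
#include "preprocessing/text/inplace_processor/downcase.h"
#include "preprocessing/text/example_generator/token_counter.h"

DataSet::DataSet *load_reviews() {
  // tokenise, downcase and count tokens, mirroring the components that
  // Storage::Binary::read_text_pipeline knows how to reconstruct
  Preprocessing::Text::TextPipeline *pipeline = new Preprocessing::Text::TextPipeline();
  pipeline->tokeniser = new Preprocessing::Text::SimpleTokeniser(pipeline);
  pipeline->processors.push_back(new Preprocessing::Text::Downcase());
  pipeline->generator = new Preprocessing::Text::TokenCounter();

  // expects e.g. reviews/pos/*.txt and reviews/neg/*.txt (hypothetical layout);
  // each subfolder name becomes a value of the "Category" feature
  Storage::Folders folders("reviews", pipeline);
  return folders.read();
}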
data/lib/quarry/src/storage/folders.h
@@ -0,0 +1,25 @@
#ifndef __folders_h__
#define __folders_h__
#include "preprocessing/text/text_pipeline.h"
#include "data_set/data_set.h"
#include "storage/storage.h"
#include <algorithm>
#include <cctype>
#include <string>
using namespace std;

namespace Storage {
  class Folders : public Storage {
    void load_directory(string path, DataSet::SparseDataSet *data_set, int category_index);

    public:
      string path;
      Preprocessing::Text::TextPipeline *pipeline;

      Folders(string path, Preprocessing::Text::TextPipeline *pipeline) : path(path), pipeline(pipeline) {}
      DataSet::DataSet *read();
      void write(DataSet::DataSet *data_set);
  };
}

#endif
data/lib/quarry/src/storage/storage.h
@@ -0,0 +1,19 @@
#ifndef __storage_h__
#define __storage_h__
#include "data_set/data_set.h"
#include "model/model.h"

namespace Storage {
  class Storage {
    public:
      // all storage implementations must be able to read and write data sets
      virtual DataSet::DataSet *read() = 0;
      virtual void write(DataSet::DataSet *data_set) = 0;

      // some implementations can read and write trained models
      virtual Model::Model *read_model() { return NULL; }
      virtual void write_model(Model::Model *model) {}
  };
}

#endif
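As the comments above note, read() and write() are the only pure virtual members, so a subclass needs just those two; read_model()/write_model() are optional overrides. A hypothetical no-op backend, for illustration only (not part of the gem):

#include "storage/storage.h"

namespace Storage {
  // hypothetical example: a storage backend that discards everything it is given
  class NullStorage : public Storage {
    public:
      DataSet::DataSet *read() { return NULL; }       // nothing to load
      void write(DataSet::DataSet *data_set) {}       // silently drop the data set
      // read_model()/write_model() keep their default do-nothing behaviour
  };
}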
data/lib/quarry_rb/classifier/classifier.rb
@@ -0,0 +1,22 @@
module Quarry
  module Classifier
    class Classifier
      attr_reader :classifier
      def initialize(data_set)
        @data_set = data_set
      end

      def prepare
        @classifier.prepare
      end

      def classify(example)
        @data_set.categories[@classifier.classify_to_index(example.example)]
      end

      def rank(example)
        @classifier.rank(example.example)
      end
    end
  end
end