thera 0.0.1

Files changed (89)
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
data/lib/quarry/src/storage/binary.cpp
@@ -0,0 +1,457 @@
+ #include "binary.h"
+ #include <stdexcept>
+ #include <vector>
+
+ // ------------------------------------------
+ // integer 'magic marks' are used to identify
+ // binary files, delineate sections of the
+ // file, and can be used to test endianness
+ // ------------------------------------------
+ static const uint32_t file_mark = 'quar';
+ static const uint32_t none_mark = 'none';
+ static const uint32_t classifier_mark = 'clas';
+ static const uint32_t text_pipeline_mark = 'texp';
+
+
+ // ------------------------------------------
+ // low level read and write operations
+ // ------------------------------------------
+ void Storage::Binary::write_int(int number) {
+   file.write((char *)(&number), sizeof(int));
+ }
+
+ int Storage::Binary::read_int() {
+   int value = 0;
+   file.read((char *)(&value), sizeof(int));
+   return value;
+ }
+
+ void Storage::Binary::write_mark(uint32_t mark) {
+   file.write((char *)(&mark), sizeof(uint32_t));
+ }
+
+ uint32_t Storage::Binary::read_mark() {
+   uint32_t value = 0;
+   file.read((char *)(&value), sizeof(uint32_t));
+   return value;
+ }
+
+ void Storage::Binary::write_bool(bool value) {
+   char file_value = (value ? 1 : 0);
+   file.write(&file_value, 1);
+ }
+
+ bool Storage::Binary::read_bool() {
+   char value = 0;
+   file.read(&value, 1);
+   return value != 0;
+ }
+
+ void Storage::Binary::write_string(string str) {
+   file.write(str.c_str(), str.length() + 1);
+ }
+
+ string Storage::Binary::read_string() {
+   string str;
+   std::getline(file, str, '\0');
+   return str;
+ }
+
+
+ // ------------------------------------------
+ // data set
+ // ------------------------------------------
+ DataSet::DataSet *Storage::Binary::read_data_set() {
+   DataSet::DataSet *data_set = NULL;
+   bool sparse = read_bool();
+
+   // determine the type of data set to create
+   if(sparse)
+     data_set = new DataSet::SparseDataSet();
+   else
+     data_set = new DataSet::DenseDataSet();
+
+   // initialise the data set
+   data_set->name = read_string();
+   data_set->category_index = read_int();
+   data_set->counted = read_bool();
+   data_set->indexed = read_bool();
+
+   // initialise the data set's features
+   DataSet::NominalFeature *nominal_feature;
+   DataSet::NumericFeature *numeric_feature;
+   int index = 0, count = 0;
+   bool nominal = false;
+   string name;
+
+   // determine the number of features to read; count caches need to know the number of categories up front
+   int num_features = read_int();
+   int num_categories = read_int();
+
+   for(int i = 0; i < num_features; i++) {
+     nominal = read_bool();
+     index = read_int();
+     name = read_string();
+
+     if(nominal) {
+       nominal_feature = data_set->new_nominal_feature(name);
+       nominal_feature->index = index;
+
+       // read the nominal category names
+       count = read_int();
+       for(int i = 0; i < count; i++)
+         nominal_feature->add_value(read_string());
+
+       // read cached frequencies and probabilities if present
+       if(data_set->counted) {
+         nominal_feature->frequencies = *read_vector<int>();
+         nominal_feature->probabilities = *read_vector<double>();
+         nominal_feature->category_frequencies.resize(num_categories + 1);
+         nominal_feature->category_probabilities.resize(num_categories + 1);
+
+         for(int i = 1; i <= num_categories; i++)
+           nominal_feature->category_frequencies[i] = *read_vector<int>();
+
+         for(int i = 1; i <= num_categories; i++)
+           nominal_feature->category_probabilities[i] = *read_vector<double>();
+       }
+
+       // TODO: read cached indexes
+       if(data_set->indexed) {
+       }
+
+     } else {
+       numeric_feature = data_set->new_numeric_feature(name);
+       numeric_feature->index = index;
+
+       // cached counts
+       if(data_set->counted) {
+         file.read((char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
+         numeric_feature->category_counts = (DataSet::NumericFeature::Counts *) malloc(sizeof(DataSet::NumericFeature::Counts) * (num_categories + 1));
+         for(int i = 1; i <= num_categories; i++)
+           file.read((char *)&(numeric_feature->category_counts[i]), sizeof(DataSet::NumericFeature::Counts));
+       }
+
+       // TODO: cached indexes
+       if(data_set->indexed) {
+       }
+     }
+   }
+
+   // read examples if present
+   should_write_examples = read_bool();
+   if(should_write_examples) {
+     int num_examples = read_int();
+
+     if(sparse) {
+       DataSet::SparseExample *sparse_example;
+
+       for(int i = 0; i < num_examples; i++) {
+         // number of non-zero values
+         count = read_int();
+
+         // construct & read the example
+         sparse_example = ((DataSet::SparseDataSet *) data_set)->new_example(count);
+         file.read((char *)sparse_example->values, count * sizeof(DataSet::SparseExample::Value));
+         sparse_example->size = count;
+       }
+
+     } else {
+       // each dense example stores the same number of values
+       count = read_int();
+
+       // read each example
+       DataSet::DenseExample *dense_example;
+       for(int i = 0; i < num_examples; i++) {
+         dense_example = ((DataSet::DenseDataSet *) data_set)->new_example();
+         file.read((char *)dense_example->values, count * sizeof(double));
+       }
+     }
+   }
+
+   return data_set;
+ }
+
+ void Storage::Binary::write_data_set(DataSet::DataSet *data_set) {
+   bool sparse = (typeid(*data_set) == typeid(DataSet::SparseDataSet));
+   int num_categories = data_set->categories_size();
+   int num_features = data_set->features_size();
+   int num_examples = data_set->examples_size();
+
+   // data set header
+   write_bool(sparse);
+   write_string(data_set->name);
+   write_int(data_set->category_index);
+   write_bool(data_set->counted);
+   write_bool(data_set->indexed);
+   write_int(num_features);
+   write_int(num_categories);
+
+   // features
+   DataSet::NominalFeature *nominal_feature;
+   DataSet::NumericFeature *numeric_feature;
+   DataSet::Feature *feature;
+   uint32_t count = 0;
+   bool nominal;
+
+   for(int i = 0; i < num_features; i++) {
+     feature = data_set->features[i];
+     nominal = (typeid(*feature) == typeid(DataSet::NominalFeature));
+     write_bool(nominal);
+     write_int(feature->index);
+     write_string(feature->name);
+
+     if(nominal) {
+       nominal_feature = (DataSet::NominalFeature *)feature;
+
+       // category names
+       count = nominal_feature->names.size();
+       write_int(count - 1);
+       for(int i = 1; i < count; i++)
+         write_string(nominal_feature->names.at(i));
+
+       // cached counts
+       if(data_set->counted) {
+         write_vector<int>(&(nominal_feature->frequencies));
+         write_vector<double>(&(nominal_feature->probabilities));
+
+         for(int i = 1; i <= num_categories; i++)
+           write_vector<int>(&(nominal_feature->category_frequencies.at(i)));
+
+         for(int i = 1; i <= num_categories; i++)
+           write_vector<double>(&(nominal_feature->category_probabilities.at(i)));
+       }
+
+       // TODO: cached indexes
+       if(data_set->indexed) {
+       }
+
+     } else {
+       numeric_feature = (DataSet::NumericFeature *)feature;
+
+       // cached counts
+       if(data_set->counted) {
+         file.write((const char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
+         for(int i = 1; i <= num_categories; i++)
+           file.write((const char *)&(numeric_feature->category_counts[i]), sizeof(DataSet::NumericFeature::Counts));
+       }
+
+       // TODO: cached indexes
+       if(data_set->indexed) {
+       }
+     }
+   }
+
+   // examples
+   write_bool(should_write_examples);
+   if(should_write_examples) {
+     write_int(num_examples);
+
+     if(sparse) {
+       DataSet::SparseExample *example;
+       for(int i = 0; i < num_examples; i++) {
+         example = (DataSet::SparseExample *) data_set->examples[i];
+         count = example->size;
+         write_int(count);
+         file.write((char *)(example->values), count * sizeof(DataSet::SparseExample::Value));
+       }
+
+     } else {
+       // each dense example stores the same number of values
+       count = data_set->examples[0]->size;
+       write_int(count);
+
+       // write each example
+       for(int i = 0; i < num_examples; i++)
+         file.write((char *)((DataSet::DenseExample *)data_set->examples[i])->values, count * sizeof(double));
+     }
+   }
+ }
+
+
+ // ------------------------------------------
+ // classifiers
+ // ------------------------------------------
+ Classifier::Classifier *Storage::Binary::read_classifier(DataSet::DataSet *data_set) {
+   uint32_t mark = read_mark();
+   if(mark == none_mark)
+     return NULL;
+   else if(mark != classifier_mark)
+     throw runtime_error("Expected classifier section");
+
+   Classifier::Classifier *classifier = NULL;
+   uint32_t type = read_mark();
+
+   switch(type) {
+     case Classifier::NaiveBayesClassifier::file_mark:
+       classifier = new Classifier::NaiveBayesClassifier(data_set);
+       break;
+   }
+
+   if(classifier)
+     classifier->read_binary(this);
+   return classifier;
+ }
+
+ void Storage::Binary::write_classifier(Classifier::Classifier *classifier) {
+   if(!classifier) {
+     write_mark(none_mark);
+   } else {
+     write_mark(classifier_mark);
+     write_mark(classifier->mark());
+     classifier->write_binary(this);
+   }
+ }
+
+
+ // ------------------------------------------
+ // text pipeline
+ // ------------------------------------------
+ Preprocessing::Text::TextPipeline *Storage::Binary::read_text_pipeline() {
+   uint32_t mark = read_mark();
+   if(mark == none_mark)
+     return NULL;
+   else if(mark != text_pipeline_mark)
+     throw runtime_error("Expected text pipeline section");
+
+   Preprocessing::Text::TextPipeline *pipeline = new Preprocessing::Text::TextPipeline();
+
+   // tokeniser
+   switch(read_mark()) {
+     case Preprocessing::Text::SimpleTokeniser::file_mark:
+       pipeline->tokeniser = new Preprocessing::Text::SimpleTokeniser(pipeline);
+       break;
+   }
+
+   // inplace processors
+   int count = read_int();
+   for(int i = 0; i < count; i++) {
+     switch(read_mark()) {
+       case Preprocessing::Text::Downcase::file_mark:
+         pipeline->processors.push_back(new Preprocessing::Text::Downcase());
+         break;
+       case Preprocessing::Text::PorterStemmer::file_mark:
+         pipeline->processors.push_back(new Preprocessing::Text::PorterStemmer());
+         break;
+     }
+   }
+
+   // token selectors
+   count = read_int();
+   for(int i = 0; i < count; i++) {
+     switch(read_mark()) {
+       case Preprocessing::Text::StopWords::file_mark:
+         pipeline->selectors.push_back(new Preprocessing::Text::StopWords());
+         break;
+       case Preprocessing::Text::POSTagSelector::file_mark:
+         pipeline->selectors.push_back(new Preprocessing::Text::POSTagSelector());
+         break;
+     }
+   }
+
+   // example generator
+   switch(read_mark()) {
+     case Preprocessing::Text::TokenCounter::file_mark:
+       pipeline->generator = new Preprocessing::Text::TokenCounter();
+       break;
+   }
+
+   return pipeline;
+ }
+
+ void Storage::Binary::write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
+   if(!pipeline) {
+     write_mark(none_mark);
+     return;
+   } else {
+     write_mark(text_pipeline_mark);
+   }
+
+   // tokeniser
+   write_mark(pipeline->tokeniser->mark());
+
+   // inplace processors
+   int count = pipeline->processors.size();
+   write_int(count);
+   for(int i = 0; i < count; i++)
+     write_mark(pipeline->processors[i]->mark());
+
+   // token selectors
+   count = pipeline->selectors.size();
+   write_int(count);
+   for(int i = 0; i < count; i++)
+     write_mark(pipeline->selectors[i]->mark());
+
+   // example generator
+   write_mark(pipeline->generator->mark());
+ }
+
+
+ // ------------------------------------------
+ // helpers
+ // ------------------------------------------
+ void Storage::Binary::open_for_reading() {
+   // open file
+   file.open(path.c_str(), fstream::in | fstream::binary);
+
+   // ensure file is ok for reading
+   if(!file.good())
+     throw runtime_error("Error opening binary file for reading");
+
+   // quick sanity check
+   if(read_mark() != file_mark)
+     throw runtime_error("Binary file mark is invalid");
+ }
+
+ void Storage::Binary::open_for_writing() {
+   // open/create file
+   file.open(path.c_str(), fstream::out | fstream::binary);
+
+   // ensure file is ok for writing
+   if(!file.good())
+     throw runtime_error("Error opening binary file for writing");
+
+   // write the file marker so reads can test the file format
+   write_mark(file_mark);
+ }
+
+
+ // ------------------------------------------
+ // public read & write methods
+ // ------------------------------------------
+ DataSet::DataSet *Storage::Binary::read() {
+   open_for_reading();
+   DataSet::DataSet *data_set = read_data_set();
+   file.close();
+   return data_set;
+ }
+
+ void Storage::Binary::write(DataSet::DataSet *data_set) {
+   open_for_writing();
+   write_data_set(data_set);
+   file.close();
+ }
+
+ Model::Model *Storage::Binary::read_model() {
+   open_for_reading();
+
+   // read the 3 model components
+   Model::Model *model = new Model::Model();
+   model->data_set = read_data_set();
+   model->classifier = read_classifier(model->data_set);
+   model->text_pipeline = read_text_pipeline();
+
+   file.close();
+   return model;
+ }
+
+ void Storage::Binary::write_model(Model::Model *model) {
+   open_for_writing();
+
+   // write the 3 model components
+   write_data_set(model->data_set);
+   write_classifier(model->classifier);
+   write_text_pipeline(model->text_pipeline);
+
+   file.close();
+ }
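
The 'magic marks' above double as a byte-order check: a reader on a machine with the opposite endianness sees each 32-bit mark byte-swapped. A minimal sketch of that idea (illustrative only, not part of the gem; note that multi-character literals such as 'quar' are implementation-defined in C++):

#include <cstdint>

// reverse the byte order of a 32-bit value
static uint32_t byte_swap(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) | (v << 24);
}

// compare a mark read from disk against the expected constant:
// an exact match means the writer had the same byte order, a
// swapped match means the file came from an opposite-endian machine
enum MarkCheck { mark_same, mark_swapped, mark_invalid };

static MarkCheck check_mark(uint32_t from_file, uint32_t expected) {
  if(from_file == expected) return mark_same;
  if(from_file == byte_swap(expected)) return mark_swapped;
  return mark_invalid;
}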
data/lib/quarry/src/storage/binary.h
@@ -0,0 +1,79 @@
+ #ifndef __binary_h__
+ #define __binary_h__
+ #include "storage/storage.h"
+ #include "data_set/dense/dense_data_set.h"
+ #include "data_set/sparse/sparse_data_set.h"
+ #include "classifier/naive_bayes/naive_bayes_classifier.h"
+ #include <fstream>
+ using namespace std;
+
+ namespace Storage {
+   class Binary : public Storage {
+     string path;
+     fstream file;
+
+     // helpers
+     void open_for_reading();
+     void open_for_writing();
+
+     // low level IO
+     void write_string(string str);
+     string read_string();
+     void write_int(int number);
+     int read_int();
+     void write_mark(uint32_t mark);
+     uint32_t read_mark();
+     void write_bool(bool value);
+     bool read_bool();
+
+     // these templated functions are used outside this class,
+     // so their definition needs to be in this header file for
+     // each version of the function to be generated
+     template<class T> vector<T> *read_vector() {
+       vector<T> *values = new vector<T>();
+       int size = read_int();
+       values->reserve(size);
+       T value;
+
+       for(int i = 0; i < size; i++) {
+         file.read((char *)(&value), sizeof(T));
+         values->push_back(value);
+       }
+
+       return values;
+     }
+
+     template<class T> void write_vector(vector<T> *values) {
+       uint32_t size = values->size();
+       write_int(size);
+
+       for(int i = 0; i < size; i++)
+         file.write((char *)(&values->at(i)), sizeof(T));
+     }
+
+     // serialisation
+     DataSet::DataSet *read_data_set();
+     void write_data_set(DataSet::DataSet *data_set);
+     Classifier::Classifier *read_classifier(DataSet::DataSet *data_set);
+     void write_classifier(Classifier::Classifier *classifier);
+     Preprocessing::Text::TextPipeline *read_text_pipeline();
+     void write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline);
+
+   public:
+     bool should_write_examples;
+     Binary(string path) : path(path), should_write_examples(false) {}
+     bool get_write_examples() { return should_write_examples; }
+     void set_write_examples(bool write) { should_write_examples = write; }
+
+     DataSet::DataSet *read();
+     Model::Model *read_model();
+     void write(DataSet::DataSet *data_set);
+     void write_model(Model::Model *model);
+
+     friend class Preprocessing::Text::TextPipeline;
+     friend class Classifier::Classifier;
+     friend class Classifier::NaiveBayesClassifier;
+   };
+ }
+
+ #endif
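
For orientation, a minimal usage sketch of the public Binary interface declared above (the file path and the free function are illustrative, not taken from the gem):

#include "storage/binary.h"

// round-trip a data set through the binary format: write it together with
// its examples, then read it back; read() reconstructs a sparse or dense
// data set based on the flag stored in the file header
void save_and_reload(DataSet::DataSet *data_set) {
  Storage::Binary writer("example.quarry");
  writer.set_write_examples(true);
  writer.write(data_set);

  Storage::Binary reader("example.quarry");
  DataSet::DataSet *copy = reader.read();
  delete copy;  // assumes DataSet cleans up its own features and examples
}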
data/lib/quarry/src/storage/folders.cpp
@@ -0,0 +1,98 @@
+ #include "folders.h"
+ #include <fstream>
+ #include <iostream>
+ #include <stdlib.h>
+ #include <dirent.h>
+ #include <sys/stat.h>
+ using namespace std;
+
+ static char *file_data = NULL;
+ static int file_data_size = 0;
+ static int file_count = 0;
+
+
+ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_set, int category_index) {
+   DataSet::SparseExample *example;
+   DIR *dir = opendir(path.c_str());
+   struct dirent *dp;
+   char *name;
+   string newpath;
+   struct stat info;
+   FILE *file;
+   int file_length;
+
+   while((dp = readdir(dir))) {
+     // ignore files starting with a dot
+     name = dp->d_name;
+     if(*name == '.')
+       continue;
+
+     // ensure this is a file, not a folder
+     newpath = path + "/" + name;
+     stat(newpath.c_str(), &info);
+     if(info.st_mode & S_IFDIR)
+       continue;
+
+     // determine if the file_data buffer is large enough to hold this file
+     file = fopen(newpath.c_str(), "rb");
+     fseek(file, 0, SEEK_END);
+     file_length = ftell(file) + 1;
+     rewind(file);
+
+     if(file_data_size < file_length) {
+       if(file_data != NULL)
+         free(file_data);
+       file_data = (char *)malloc(file_length);
+       file_data_size = file_length;
+     }
+
+     // read into the buffer
+     fread(file_data, 1, file_length - 1, file);
+     file_data[file_length - 1] = 0;
+     fclose(file);
+
+     // insert a new example into the dataset
+     example = pipeline->process_text(data_set, file_data);
+     example->set_category_index(data_set, category_index);
+
+     file_count++;
+     if((file_count % 10000) == 0)
+       cout << "Read " << file_count << endl;
+   }
+
+   closedir(dir);
+ }
+
+ DataSet::DataSet *Storage::Folders::read() {
+   DataSet::SparseDataSet *data_set = new DataSet::SparseDataSet();
+   DIR *dir = opendir(path.c_str());
+   struct dirent *dp;
+   char *name;
+   string newpath;
+   struct stat info;
+
+   // create an initial feature "Category"
+   DataSet::NominalFeature *categories = data_set->new_nominal_feature("Category");
+   data_set->set_category_index(0);
+   int category_index = 0;
+
+   while((dp = readdir(dir))) {
+     // ignore files starting with a dot
+     name = dp->d_name;
+     if(*name == '.')
+       continue;
+
+     // ensure this is a folder
+     newpath = path + "/" + name;
+     stat(newpath.c_str(), &info);
+     if(info.st_mode & S_IFDIR) {
+       category_index = categories->value_index(string(name));
+       load_directory(newpath, data_set, category_index);
+     }
+   }
+
+   return data_set;
+ }
+
+ void Storage::Folders::write(DataSet::DataSet *data_set) {
+ }
data/lib/quarry/src/storage/folders.h
@@ -0,0 +1,25 @@
+ #ifndef __folders_h__
+ #define __folders_h__
+ #include "preprocessing/text/text_pipeline.h"
+ #include "data_set/data_set.h"
+ #include "storage/storage.h"
+ #include <algorithm>
+ #include <cctype>
+ #include <string>
+ using namespace std;
+
+ namespace Storage {
+   class Folders : public Storage {
+     void load_directory(string path, DataSet::SparseDataSet *data_set, int category_index);
+
+   public:
+     string path;
+     Preprocessing::Text::TextPipeline *pipeline;
+
+     Folders(string path, Preprocessing::Text::TextPipeline *pipeline) : path(path), pipeline(pipeline) {}
+     DataSet::DataSet *read();
+     void write(DataSet::DataSet *data_set);
+   };
+ }
+
+ #endif
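
A sketch of how Folders and the text pipeline fit together (the corpus path is hypothetical; the component types and header paths mirror those in the file list above and in read_text_pipeline in binary.cpp): each sub-directory of the corpus becomes one category, every file in it becomes one sparse example, and the loaded data set can then be persisted with the Binary storage class.

#include "storage/folders.h"
#include "storage/binary.h"
#include "preprocessing/text/text_pipeline.h"
#include "preprocessing/text/tokeniser/simple_tokeniser.h"
#include "preprocessing/text/inplace_processor/downcase.h"
#include "preprocessing/text/inplace_processor/porter_stemmer.h"
#include "preprocessing/text/token_selector/stop_words.h"
#include "preprocessing/text/example_generator/token_counter.h"

int main() {
  // assemble a pipeline from the component types used elsewhere in the gem
  Preprocessing::Text::TextPipeline *pipeline = new Preprocessing::Text::TextPipeline();
  pipeline->tokeniser = new Preprocessing::Text::SimpleTokeniser(pipeline);
  pipeline->processors.push_back(new Preprocessing::Text::Downcase());
  pipeline->processors.push_back(new Preprocessing::Text::PorterStemmer());
  pipeline->selectors.push_back(new Preprocessing::Text::StopWords());
  pipeline->generator = new Preprocessing::Text::TokenCounter();

  // folder-per-category corpus, e.g. corpus/spam/*, corpus/ham/*
  Storage::Folders folders("corpus", pipeline);
  DataSet::DataSet *data_set = folders.read();

  // persist the loaded corpus, including its examples
  Storage::Binary binary("corpus.quarry");
  binary.set_write_examples(true);
  binary.write(data_set);
  return 0;
}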
data/lib/quarry/src/storage/storage.h
@@ -0,0 +1,19 @@
+ #ifndef __storage_h__
+ #define __storage_h__
+ #include "data_set/data_set.h"
+ #include "model/model.h"
+
+ namespace Storage {
+   class Storage {
+   public:
+     // all storage implementations must be able to read and write data sets
+     virtual DataSet::DataSet *read() = 0;
+     virtual void write(DataSet::DataSet *data_set) = 0;
+
+     // some implementations can read and write trained models
+     virtual Model::Model *read_model() { return NULL; }
+     virtual void write_model(Model::Model *model) {}
+   };
+ }
+
+ #endif
data/lib/quarry/src/test.cpp
@@ -0,0 +1,6 @@
+ #include "quarry.h"
+ #include <stdexcept>
+ #include <cstring>
+
+ int main() {
+ }
data/lib/quarry_rb/classifier/classifier.rb
@@ -0,0 +1,22 @@
+ module Quarry
+   module Classifier
+     class Classifier
+       attr_reader :classifier
+       def initialize(data_set)
+         @data_set = data_set
+       end
+
+       def prepare
+         @classifier.prepare
+       end
+
+       def classify(example)
+         @data_set.categories[@classifier.classify_to_index(example.example)]
+       end
+
+       def rank(example)
+         @classifier.rank(example.example)
+       end
+     end
+   end
+ end