thera 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
@@ -0,0 +1,457 @@
1
+ #include "binary.h"
2
+ #include <stdexcept>
3
+ #include <vector>
4
+
5
+ // ------------------------------------------
6
+ // integer 'magic marks' are used to identify
7
+ // binary files, delineate sections of the
8
+ // file, and can be used to test endianess
9
+ // ------------------------------------------
10
+ static const uint32_t file_mark = 'quar';
11
+ static const uint32_t none_mark = 'none';
12
+ static const uint32_t classifier_mark = 'clas';
13
+ static const uint32_t text_pipeline_mark = 'texp';
14
+
15
+
16
+ // ------------------------------------------
17
+ // low level read and write operations
18
+ // ------------------------------------------
19
+ void Storage::Binary::write_int(int number) {
20
+ file.write((char *)(&number), sizeof(int));
21
+ }
22
+
23
+ int Storage::Binary::read_int() {
24
+ int value = 0;
25
+ file.read((char *)(&value), sizeof(int));
26
+ return value;
27
+ }
28
+
29
+ void Storage::Binary::write_mark(uint32_t mark) {
30
+ file.write((char *)(&mark), sizeof(uint32_t));
31
+ }
32
+
33
+ uint32_t Storage::Binary::read_mark() {
34
+ uint32_t value = 0;
35
+ file.read((char *)(&value), sizeof(uint32_t));
36
+ return value;
37
+ }
38
+
39
+ void Storage::Binary::write_bool(bool value) {
40
+ char file_value = (value ? 1 : 0);
41
+ file.write(&file_value, 1);
42
+ }
43
+
44
+ bool Storage::Binary::read_bool() {
45
+ char value = 0;
46
+ file.read(&value, 1);
47
+ return value != 0;
48
+ }
49
+
50
+ void Storage::Binary::write_string(string str) {
51
+ file.write(str.c_str(), str.length() + 1);
52
+ }
53
+
54
+ string Storage::Binary::read_string() {
55
+ string str;
56
+ std::getline(file, str, '\0');
57
+ return str;
58
+ }
59
+
60
+
61
+ // ------------------------------------------
62
+ // data set
63
+ // ------------------------------------------
64
+ DataSet::DataSet *Storage::Binary::read_data_set() {
65
+ DataSet::DataSet *data_set = NULL;
66
+ bool sparse = read_bool();
67
+
68
+ // determine the type of data set to create
69
+ if(sparse)
70
+ data_set = new DataSet::SparseDataSet();
71
+ else
72
+ data_set = new DataSet::DenseDataSet();
73
+
74
+ // initialise the data set
75
+ data_set->name = read_string();
76
+ data_set->category_index = read_int();
77
+ data_set->counted = read_bool();
78
+ data_set->indexed = read_bool();
79
+
80
+ // initialise the data set's features
81
+ DataSet::NominalFeature *nominal_feature;
82
+ DataSet::NumericFeature *numeric_feature;
83
+ int index = 0, count = 0;
84
+ bool nominal = false;
85
+ string name;
86
+
87
+ // determine the number of features to read; count caches need to know the number of categories up front
88
+ int num_features = read_int();
89
+ int num_categories = read_int();
90
+
91
+ for(int i = 0; i < num_features; i++) {
92
+ nominal = read_bool();
93
+ index = read_int();
94
+ name = read_string();
95
+
96
+ if(nominal) {
97
+ nominal_feature = data_set->new_nominal_feature(name);
98
+ nominal_feature->index = index;
99
+
100
+ // read the nominal category names
101
+ count = read_int();
102
+ for(int i = 0; i < count; i++)
103
+ nominal_feature->add_value(read_string());
104
+
105
+ // read cached frequencies and probabilities if present
106
+ if(data_set->counted) {
107
+ nominal_feature->frequencies = *read_vector<int>();
108
+ nominal_feature->probabilities = *read_vector<double>();
109
+ nominal_feature->category_frequencies.resize(num_categories + 1);
110
+ nominal_feature->category_probabilities.resize(num_categories + 1);
111
+
112
+ for(int i = 1; i <= num_categories; i++)
113
+ nominal_feature->category_frequencies[i] = *read_vector<int>();
114
+
115
+ for(int i = 1; i <= num_categories; i++)
116
+ nominal_feature->category_probabilities[i] = *read_vector<double>();
117
+ }
118
+
119
+ // TODO: read cached indexes
120
+ if(data_set->indexed) {
121
+ }
122
+
123
+ } else {
124
+ numeric_feature = data_set->new_numeric_feature(name);
125
+ numeric_feature->index = index;
126
+
127
+ // cached counts
128
+ if(data_set->counted) {
129
+ file.read((char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
130
+ numeric_feature->category_counts = (DataSet::NumericFeature::Counts *) malloc(sizeof(DataSet::NumericFeature::Counts) * (num_categories + 1));
131
+ for(int i = 1; i <= num_categories; i++)
132
+ file.read((char *)&(numeric_feature->category_counts[i]), sizeof(DataSet::NumericFeature::Counts));
133
+ }
134
+
135
+ // TODO: cached indexes
136
+ if(data_set->indexed) {
137
+ }
138
+ }
139
+ }
140
+
141
+ // read examples if present
142
+ should_write_examples = read_bool();
143
+ if(should_write_examples) {
144
+ int num_examples = read_int();
145
+
146
+ if(sparse) {
147
+ DataSet::SparseExample *sparse_example;
148
+
149
+ for(int i = 0; i < num_examples; i++) {
150
+ // number of non-zero values
151
+ count = read_int();
152
+
153
+ // construct & read the example
154
+ sparse_example = ((DataSet::SparseDataSet *) data_set)->new_example(count);
155
+ file.read((char *)sparse_example->values, count * sizeof(DataSet::SparseExample::Value));
156
+ sparse_example->size = count;
157
+ }
158
+
159
+ } else {
160
+ // each dense example stores the same number of values
161
+ count = read_int();
162
+
163
+ // read each example
164
+ DataSet::DenseExample *dense_example;
165
+ for(int i = 0; i < num_examples; i++) {
166
+ dense_example = ((DataSet::DenseDataSet *) data_set)->new_example();
167
+ file.read((char *)dense_example->values, count * sizeof(double));
168
+ }
169
+ }
170
+ }
171
+
172
+ return data_set;
173
+ }
174
+
175
+ void Storage::Binary::write_data_set(DataSet::DataSet *data_set) {
176
+ bool sparse = (typeid(*data_set) == typeid(DataSet::SparseDataSet));
177
+ int num_categories = data_set->categories_size();
178
+ int num_features = data_set->features_size();
179
+ int num_examples = data_set->examples_size();
180
+
181
+ // data set header
182
+ write_bool(sparse);
183
+ write_string(data_set->name);
184
+ write_int(data_set->category_index);
185
+ write_bool(data_set->counted);
186
+ write_bool(data_set->indexed);
187
+ write_int(num_features);
188
+ write_int(num_categories);
189
+
190
+ // features
191
+ DataSet::NominalFeature *nominal_feature;
192
+ DataSet::NumericFeature *numeric_feature;
193
+ DataSet::Feature *feature;
194
+ uint32_t count = 0;
195
+ bool nominal;
196
+
197
+ for(int i = 0; i < num_features; i++) {
198
+ feature = data_set->features[i];
199
+ nominal = (typeid(*feature) == typeid(DataSet::NominalFeature));
200
+ write_bool(nominal);
201
+ write_int(feature->index);
202
+ write_string(feature->name);
203
+
204
+ if(nominal) {
205
+ nominal_feature = (DataSet::NominalFeature *)feature;
206
+
207
+ // category names
208
+ count = nominal_feature->names.size();
209
+ write_int(count - 1);
210
+ for(int i = 1; i < count; i++)
211
+ write_string(nominal_feature->names.at(i));
212
+
213
+ // cached counts
214
+ if(data_set->counted) {
215
+ write_vector<int>(&(nominal_feature->frequencies));
216
+ write_vector<double>(&(nominal_feature->probabilities));
217
+
218
+ for(int i = 1; i <= num_categories; i++)
219
+ write_vector<int>(&(nominal_feature->category_frequencies.at(i)));
220
+
221
+ for(int i = 1; i <= num_categories; i++)
222
+ write_vector<double>(&(nominal_feature->category_probabilities.at(i)));
223
+ }
224
+
225
+ // TODO: cached indexes
226
+ if(data_set->indexed) {
227
+ }
228
+
229
+ } else {
230
+ numeric_feature = (DataSet::NumericFeature *)feature;
231
+
232
+ // cached counts
233
+ if(data_set->counted) {
234
+ file.write((const char *)&(numeric_feature->counts), sizeof(DataSet::NumericFeature::Counts));
235
+ for(int i = 1; i <= num_categories; i++)
236
+ file.write((const char *)&(numeric_feature->category_counts[i]), sizeof(DataSet::NumericFeature::Counts));
237
+ }
238
+
239
+ // TODO: cached indexes
240
+ if(data_set->indexed) {
241
+ }
242
+ }
243
+ }
244
+
245
+ // examples
246
+ write_bool(should_write_examples);
247
+ if(should_write_examples) {
248
+ write_int(num_examples);
249
+
250
+ if(sparse) {
251
+ DataSet::SparseExample *example;
252
+ for(int i = 0; i < num_examples; i++) {
253
+ example = (DataSet::SparseExample *) data_set->examples[i];
254
+ count = example->size;
255
+ write_int(count);
256
+ file.write((char *)(example->values), count * sizeof(DataSet::SparseExample::Value));
257
+ }
258
+
259
+ } else {
260
+ // each dense example stores the same number of values
261
+ count = data_set->examples[0]->size;
262
+ write_int(count);
263
+
264
+ // write each example
265
+ for(int i = 0; i < num_examples; i++)
266
+ file.write((char *)((DataSet::DenseExample *)data_set->examples[i])->values, count * sizeof(double));
267
+ }
268
+ }
269
+ }
270
+
271
+
272
+ // ------------------------------------------
273
+ // classifiers
274
+ // ------------------------------------------
275
+ Classifier::Classifier *Storage::Binary::read_classifier(DataSet::DataSet *data_set) {
276
+ uint32_t mark = read_mark();
277
+ if(mark == none_mark)
278
+ return NULL;
279
+ else if(mark != classifier_mark)
280
+ throw runtime_error("Expected classifier section");
281
+
282
+ Classifier::Classifier *classifier = NULL;
283
+ uint32_t type = read_mark();
284
+
285
+ switch(type) {
286
+ case Classifier::NaiveBayesClassifier::file_mark:
287
+ classifier = new Classifier::NaiveBayesClassifier(data_set);
288
+ break;
289
+ }
290
+
291
+ if(classifier)
292
+ classifier->read_binary(this);
293
+ return classifier;
294
+ }
295
+
296
+ void Storage::Binary::write_classifier(Classifier::Classifier *classifier) {
297
+ if(!classifier) {
298
+ write_mark(none_mark);
299
+ } else {
300
+ write_mark(classifier_mark);
301
+ write_mark(classifier->mark());
302
+ classifier->write_binary(this);
303
+ }
304
+ }
305
+
306
+
307
+ // ------------------------------------------
308
+ // text pipeline
309
+ // ------------------------------------------
310
+ Preprocessing::Text::TextPipeline *Storage::Binary::read_text_pipeline() {
311
+ uint32_t mark = read_mark();
312
+ if(mark == none_mark)
313
+ return NULL;
314
+ else if(mark != text_pipeline_mark)
315
+ throw runtime_error("Expected text pipeline section");
316
+
317
+ Preprocessing::Text::TextPipeline *pipeline = new Preprocessing::Text::TextPipeline();
318
+
319
+ // tokeniser
320
+ switch(read_mark()) {
321
+ case Preprocessing::Text::SimpleTokeniser::file_mark:
322
+ pipeline->tokeniser = new Preprocessing::Text::SimpleTokeniser(pipeline);
323
+ break;
324
+ }
325
+
326
+ // inplace processors
327
+ int count = read_int();
328
+ for(int i = 0; i < count; i++) {
329
+ switch(read_mark()) {
330
+ case Preprocessing::Text::Downcase::file_mark:
331
+ pipeline->processors.push_back(new Preprocessing::Text::Downcase());
332
+ break;
333
+ case Preprocessing::Text::PorterStemmer::file_mark:
334
+ pipeline->processors.push_back(new Preprocessing::Text::PorterStemmer());
335
+ break;
336
+ }
337
+ }
338
+
339
+ // token selectors
340
+ count = read_int();
341
+ for(int i = 0; i < count; i++) {
342
+ switch(read_mark()) {
343
+ case Preprocessing::Text::StopWords::file_mark:
344
+ pipeline->selectors.push_back(new Preprocessing::Text::StopWords());
345
+ break;
346
+ case Preprocessing::Text::POSTagSelector::file_mark:
347
+ pipeline->selectors.push_back(new Preprocessing::Text::POSTagSelector());
348
+ break;
349
+ }
350
+ }
351
+
352
+ // example generator
353
+ switch(read_mark()) {
354
+ case Preprocessing::Text::TokenCounter::file_mark:
355
+ pipeline->generator = new Preprocessing::Text::TokenCounter();
356
+ break;
357
+ }
358
+
359
+ return pipeline;
360
+ }
361
+
362
+ void Storage::Binary::write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline) {
363
+ if(!pipeline) {
364
+ write_mark(none_mark);
365
+ return;
366
+ } else {
367
+ write_mark(text_pipeline_mark);
368
+ }
369
+
370
+ // tokeniser
371
+ write_mark(pipeline->tokeniser->mark());
372
+
373
+ // inplace processors
374
+ int count = pipeline->processors.size();
375
+ write_int(count);
376
+ for(int i = 0; i < count; i++)
377
+ write_mark(pipeline->processors[i]->mark());
378
+
379
+ // token selectors
380
+ count = pipeline->selectors.size();
381
+ write_int(count);
382
+ for(int i = 0; i < count; i++)
383
+ write_mark(pipeline->selectors[i]->mark());
384
+
385
+ // example generator
386
+ write_mark(pipeline->generator->mark());
387
+ }
388
+
389
+
390
+ // ------------------------------------------
391
+ // helpers
392
+ // ------------------------------------------
393
+ void Storage::Binary::open_for_reading() {
394
+ // open file
395
+ file.open(path.c_str(), fstream::in | fstream::binary);
396
+
397
+ // ensure file is ok for reading
398
+ if(!file.good())
399
+ throw runtime_error("Error opening binary file for reading");
400
+
401
+ // quick sanity check
402
+ if(read_mark() != file_mark)
403
+ throw runtime_error("Binary file mark is invalid");
404
+ }
405
+
406
+ void Storage::Binary::open_for_writing() {
407
+ // open/create file
408
+ file.open(path.c_str(), fstream::out | fstream::binary);
409
+
410
+ // ensure file is ok for writing
411
+ if(!file.good())
412
+ throw runtime_error("Error opening binary file for writing");
413
+
414
+ // write the file marker so reads can test the file format
415
+ write_mark(file_mark);
416
+ }
417
+
418
+
419
+ // ------------------------------------------
420
+ // public read & write methods
421
+ // ------------------------------------------
422
+ DataSet::DataSet *Storage::Binary::read() {
423
+ open_for_reading();
424
+ DataSet::DataSet *data_set = read_data_set();
425
+ file.close();
426
+ return data_set;
427
+ }
428
+
429
+ void Storage::Binary::write(DataSet::DataSet *data_set) {
430
+ open_for_writing();
431
+ write_data_set(data_set);
432
+ file.close();
433
+ }
434
+
435
+ Model::Model *Storage::Binary::read_model() {
436
+ open_for_reading();
437
+
438
+ // read the 3 model components
439
+ Model::Model *model = new Model::Model();
440
+ model->data_set = read_data_set();
441
+ model->classifier = read_classifier(model->data_set);
442
+ model->text_pipeline = read_text_pipeline();
443
+
444
+ file.close();
445
+ return model;
446
+ }
447
+
448
+ void Storage::Binary::write_model(Model::Model *model) {
449
+ open_for_writing();
450
+
451
+ // write the 3 model components
452
+ write_data_set(model->data_set);
453
+ write_classifier(model->classifier);
454
+ write_text_pipeline(model->text_pipeline);
455
+
456
+ file.close();
457
+ }
@@ -0,0 +1,79 @@
1
+ #ifndef __binary_h__
2
+ #define __binary_h__
3
+ #include "storage/storage.h"
4
+ #include "data_set/dense/dense_data_set.h"
5
+ #include "data_set/sparse/sparse_data_set.h"
6
+ #include "classifier/naive_bayes/naive_bayes_classifier.h"
7
+ #include <fstream>
8
+ using namespace std;
9
+
10
+ namespace Storage {
11
+ class Binary : public Storage {
12
+ string path;
13
+ fstream file;
14
+
15
+ // helpers
16
+ void open_for_reading();
17
+ void open_for_writing();
18
+
19
+ // low level IO
20
+ void write_string(string str);
21
+ string read_string();
22
+ void write_int(int number);
23
+ int read_int();
24
+ void write_mark(uint32_t mark);
25
+ uint32_t read_mark();
26
+ void write_bool(bool value);
27
+ bool read_bool();
28
+
29
+ // these templated functions are used outside this class,
30
+ // so their definition needs to be in this header file for
31
+ // each version of the function to be generated
32
+ template<class T> vector<T> *read_vector() {
33
+ vector<T> *values = new vector<T>();
34
+ int size = read_int();
35
+ values->reserve(size);
36
+ T value;
37
+
38
+ for(int i = 0; i < size; i++) {
39
+ file.read((char *)(&value), sizeof(T));
40
+ values->push_back(value);
41
+ }
42
+
43
+ return values;
44
+ }
45
+
46
+ template<class T> void write_vector(vector<T> *values) {
47
+ uint32_t size = values->size();
48
+ write_int(size);
49
+
50
+ for(int i = 0; i < size; i++)
51
+ file.write((char *)(&values->at(i)), sizeof(T));
52
+ }
53
+
54
+ // serialisation
55
+ DataSet::DataSet *read_data_set();
56
+ void write_data_set(DataSet::DataSet *data_set);
57
+ Classifier::Classifier *read_classifier(DataSet::DataSet *data_set);
58
+ void write_classifier(Classifier::Classifier *classifier);
59
+ Preprocessing::Text::TextPipeline *read_text_pipeline();
60
+ void write_text_pipeline(Preprocessing::Text::TextPipeline *pipeline);
61
+
62
+ public:
63
+ bool should_write_examples;
64
+ Binary(string path) : path(path), should_write_examples(false) {}
65
+ bool get_write_examples() { return should_write_examples; }
66
+ void set_write_examples(bool write) { should_write_examples = write;}
67
+
68
+ DataSet::DataSet *read();
69
+ Model::Model *read_model();
70
+ void write(DataSet::DataSet *data_set);
71
+ void write_model(Model::Model *model);
72
+
73
+ friend class Preprocessing::Text::TextPipeline;
74
+ friend class Classifier::Classifier;
75
+ friend class Classifier::NaiveBayesClassifier;
76
+ };
77
+ }
78
+
79
+ #endif
@@ -0,0 +1,98 @@
1
+ #include "folders.h"
2
+ #include <fstream>
3
+ #include <iostream>
4
+ #include <stdlib.h>
5
+ #include <dirent.h>
6
+ #include <sys/stat.h>
7
+ using namespace std;
8
+
9
+ static char *file_data = NULL;
10
+ static int file_data_size = 0;
11
+ static int file_count = 0;
12
+
13
+
14
+ void Storage::Folders::load_directory(string path, DataSet::SparseDataSet *data_set, int category_index) {
15
+ DataSet::SparseExample *example;
16
+ DIR *dir = opendir(path.c_str());
17
+ struct dirent *dp;
18
+ char *name;
19
+ string newpath;
20
+ struct stat info;
21
+ FILE *file;
22
+ int file_length;
23
+
24
+ while((dp = readdir(dir))) {
25
+ // ignore files starting with a dot
26
+ name = dp->d_name;
27
+ if(*name == '.')
28
+ continue;
29
+
30
+ // ensure this is a file, not a folder
31
+ newpath = path + "/" + name;
32
+ stat(newpath.c_str(), &info);
33
+ if(info.st_mode & S_IFDIR)
34
+ continue;
35
+
36
+ // determine if the file_data buffer is large enough to hold this file
37
+ file = fopen(newpath.c_str(), "rb");
38
+ fseek(file, 0, SEEK_END);
39
+ file_length = ftell(file) + 1;
40
+ rewind(file);
41
+
42
+ if(file_data_size < file_length) {
43
+ if(file_data != NULL)
44
+ free(file_data);
45
+ file_data = (char *)malloc(file_length);
46
+ file_data_size = file_length;
47
+ }
48
+
49
+ // read into the buffer
50
+ fread(file_data, 1, file_length - 1, file);
51
+ file_data[file_length - 1] = 0;
52
+ fclose(file);
53
+
54
+ // insert a new example into the dataset
55
+ example = pipeline->process_text(data_set, file_data);
56
+ example->set_category_index(data_set, category_index);
57
+
58
+ file_count++;
59
+ if((file_count % 10000) == 0)
60
+ cout << "Read " << file_count << endl;
61
+ }
62
+
63
+ closedir(dir);
64
+ }
65
+
66
+ DataSet::DataSet *Storage::Folders::read() {
67
+ DataSet::SparseDataSet *data_set = new DataSet::SparseDataSet();
68
+ DIR *dir = opendir(path.c_str());
69
+ struct dirent *dp;
70
+ char *name;
71
+ string newpath;
72
+ struct stat info;
73
+
74
+ // create an initial feature "Category"
75
+ DataSet::NominalFeature *categories = data_set->new_nominal_feature("Category");
76
+ data_set->set_category_index(0);
77
+ int category_index = 0;
78
+
79
+ while((dp = readdir(dir))) {
80
+ // ignore files starting with a dot
81
+ name = dp->d_name;
82
+ if(*name == '.')
83
+ continue;
84
+
85
+ // ensure this is a folder
86
+ newpath = path + "/" + name;
87
+ stat(newpath.c_str(), &info);
88
+ if(info.st_mode & S_IFDIR) {
89
+ category_index = categories->value_index(string(name));
90
+ load_directory(newpath, data_set, category_index);
91
+ }
92
+ }
93
+
94
+ return data_set;
95
+ }
96
+
97
+ void Storage::Folders::write(DataSet::DataSet *data_set) {
98
+ }
@@ -0,0 +1,25 @@
1
+ #ifndef __folders_h__
2
+ #define __folders_h__
3
+ #include "preprocessing/text/text_pipeline.h"
4
+ #include "data_set/data_set.h"
5
+ #include "storage/storage.h"
6
+ #include <algorithm>
7
+ #include <cctype>
8
+ #include <string>
9
+ using namespace std;
10
+
11
+ namespace Storage {
12
+ class Folders : public Storage {
13
+ void load_directory(string path, DataSet::SparseDataSet *data_set, int category_index);
14
+
15
+ public:
16
+ string path;
17
+ Preprocessing::Text::TextPipeline *pipeline;
18
+
19
+ Folders(string path, Preprocessing::Text::TextPipeline *pipeline) : path(path), pipeline(pipeline) {}
20
+ DataSet::DataSet *read();
21
+ void write(DataSet::DataSet *data_set);
22
+ };
23
+ }
24
+
25
+ #endif
@@ -0,0 +1,19 @@
1
+ #ifndef __storage_h__
2
+ #define __storage_h__
3
+ #include "data_set/data_set.h"
4
+ #include "model/model.h"
5
+
6
+ namespace Storage {
7
+ class Storage {
8
+ public:
9
+ // all storage implementations must be able to read and write data sets
10
+ virtual DataSet::DataSet *read() = 0;
11
+ virtual void write(DataSet::DataSet *data_set) = 0;
12
+
13
+ // some implementations can read and write trained models
14
+ virtual Model::Model *read_model() { return NULL; }
15
+ virtual void write_model(Model::Model *model) {}
16
+ };
17
+ }
18
+
19
+ #endif
@@ -0,0 +1,6 @@
1
+ #include "quarry.h"
2
+ #include <stdexcept>
3
+ #include <cstring>
4
+
5
+ int main() {
6
+ }
@@ -0,0 +1,22 @@
module Quarry
  module Classifier
    # Ruby-side wrapper around a native classifier object.  Subclasses
    # are expected to assign @classifier; this base class forwards the
    # common operations and translates between Ruby and native examples.
    class Classifier
      attr_reader :classifier

      def initialize(data_set)
        @data_set = data_set
      end

      # Prepare (train) the underlying native classifier.
      def prepare
        @classifier.prepare
      end

      # Classify an example and return its category name from the data set.
      def classify(example)
        index = @classifier.classify_to_index(example.example)
        @data_set.categories[index]
      end

      # Return the native ranking for an example's categories.
      def rank(example)
        @classifier.rank(example.example)
      end
    end
  end
end