thera 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
@@ -0,0 +1,82 @@
1
+ #include "stop_words.h"
2
+ #include <tr1/unordered_set>
3
+ #include <iostream>
4
+ using namespace std;
5
+ using namespace tr1;
6
+
7
+ static unordered_set<string> *stop_words = NULL;
8
+ static int stop_word_count = 586;
9
+ static string stop_word_list[] = {
10
+ "a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
11
+ "after", "afterwards", "again", "against", "ago", "ahead", "all", "allow", "allows", "almost",
12
+ "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst",
13
+ "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything",
14
+ "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "around", "as",
15
+ "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "back",
16
+ "backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before",
17
+ "beforehand", "begin", "behind", "being", "believe", "below", "beside", "besides", "best", "better",
18
+ "between", "beyond", "both", "brief", "but", "by", "c", "came", "can", "cannot",
19
+ "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com",
20
+ "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding",
21
+ "could", "course", "currently", "d", "dare", "definitely", "described", "despite", "did", "different",
22
+ "directly", "do", "does", "doing", "done", "down", "downwards", "during", "e", "each",
23
+ "edu", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough",
24
+ "entirely", "especially", "et", "etc", "even", "ever", "evermore", "every", "everybody", "everyone",
25
+ "everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther",
26
+ "few", "fewer", "fifth", "first", "five", "followed", "following", "follows", "for", "forever",
27
+ "former", "formerly", "forth", "forward", "found", "four", "from", "further", "furthermore", "g",
28
+ "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got",
29
+ "gotten", "greetings", "h", "had", "half", "happens", "hardly", "has", "have", "having",
30
+ "he", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
31
+ "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit",
32
+ "however", "hundred", "i", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc",
33
+ "indeed", "indicate", "indicated", "indicates", "inner", "inside", "insofar", "instead", "into", "inward",
34
+ "is", "it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept",
35
+ "know", "known", "knows", "l", "last", "lately", "later", "latter", "latterly", "least",
36
+ "less", "lest", "let", "like", "liked", "likely", "likewise", "little", "look", "looking",
37
+ "looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many",
38
+ "may", "maybe", "me", "mean", "meantime", "meanwhile", "merely", "might", "mine", "minus",
39
+ "miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my",
40
+ "myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs",
41
+ "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no",
42
+ "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "not", "nothing", "notwithstanding",
43
+ "novel", "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", "ok",
44
+ "okay", "old", "on", "once", "one", "ones", "only", "onto", "opposite", "or",
45
+ "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over",
46
+ "overall", "own", "p", "particular", "particularly", "past", "per", "perhaps", "placed", "please",
47
+ "plus", "possible", "presumably", "probably", "provided", "provides", "q", "que", "quite", "qv",
48
+ "r", "rather", "rd", "re", "really", "reasonably", "recent", "recently", "regarding", "regardless",
49
+ "regards", "relatively", "respectively", "right", "round", "s", "said", "same", "saw", "say",
50
+ "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
51
+ "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall",
52
+ "she", "should", "since", "six", "so", "some", "somebody", "someday", "somehow", "someone",
53
+ "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying",
54
+ "still", "sub", "such", "sup", "sure", "t", "take", "taken", "taking", "tell",
55
+ "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their",
56
+ "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
57
+ "theres", "thereupon", "these", "they", "thing", "things", "think", "third", "thirty", "this",
58
+ "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "till",
59
+ "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try",
60
+ "trying", "twice", "two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless",
61
+ "unlike", "unlikely", "until", "unto", "up", "upon", "upwards", "us", "use", "used",
62
+ "useful", "uses", "using", "usually", "v", "value", "various", "versus", "very", "via",
63
+ "viz", "vs", "w", "want", "wants", "was", "way", "we", "welcome", "well",
64
+ "went", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas",
65
+ "whereby", "wherein", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whither",
66
+ "who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "willing", "wish",
67
+ "with", "within", "without", "wonder", "would", "x", "y", "yes", "yet", "you",
68
+ "your", "yours", "yourself", "yourselves", "z", "zero"
69
+ };
70
+
71
+ Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
72
+ if(stop_words == NULL) {
73
+ stop_words = new unordered_set<string>();
74
+ for(int i = 0; i < stop_word_count; i++)
75
+ stop_words->insert(stop_word_list[i]);
76
+ }
77
+ }
78
+
79
+ bool Preprocessing::Text::StopWords::select(char *start, char *end) {
80
+ string token = string(start, (end - start) + 1);
81
+ return stop_words->count(token) == 0;
82
+ }
@@ -0,0 +1,20 @@
1
+ #ifndef __stop_words_h__
2
+ #define __stop_words_h__
3
+ #include "token_selector.h"
4
+
5
+ namespace Preprocessing {
6
+ namespace Text {
7
+
8
+ class StopWords : public TokenSelector {
9
+ public:
10
+ static const uint32_t file_mark = 'stop';
11
+ uint32_t mark() { return file_mark; }
12
+
13
+ StopWords();
14
+ bool select(char *start, char *end);
15
+ };
16
+
17
+ }
18
+ }
19
+
20
+ #endif
@@ -0,0 +1,17 @@
1
+ #ifndef __token_selector_h__
2
+ #define __token_selector_h__
3
+ #include <stdint.h>
4
+
5
+ namespace Preprocessing {
6
+ namespace Text {
7
+
8
// Base class for token filters. Subclasses override select() to decide whether
// a token is kept, and must implement mark() to return their identifying
// constant.
class TokenSelector {
public:
  // Virtual so that selectors deleted through a TokenSelector pointer run the
  // derived class destructor.
  virtual ~TokenSelector() {}

  // Default selector keeps every token.
  virtual bool select(char *start, char *end) { return true; }
  virtual uint32_t mark() = 0;
};
13
+
14
+ }
15
+ }
16
+
17
+ #endif
@@ -0,0 +1,29 @@
1
+ #include "preprocessing/text/text_pipeline.h"
2
+ #include "simple_tokeniser.h"
3
+
4
+ void Preprocessing::Text::SimpleTokeniser::tokenise(char *text) {
5
+ char *start, *end;
6
+ bool intoken = false;
7
+ bool active = true;
8
+
9
+ // to simplify the code, the while condition is not while(*text),
10
+ // because the if(intoken) block needs to run when *text == 0 at
11
+ // the end of the string.
12
+ while(active) {
13
+ active = *text;
14
+ if(isalnum(*text)) {
15
+ if(!intoken) {
16
+ intoken = true;
17
+ start = text;
18
+ }
19
+ } else {
20
+ if(intoken) {
21
+ intoken = false;
22
+ *text = 0;
23
+ end = text;
24
+ pipeline->process_token(start, end);
25
+ }
26
+ }
27
+ text++;
28
+ }
29
+ }
@@ -0,0 +1,20 @@
1
+ #ifndef __simple_tokeniser_h__
2
+ #define __simple_tokeniser_h__
3
+ #include "tokeniser.h"
4
+
5
+ namespace Preprocessing {
6
+ namespace Text {
7
+
8
+ class SimpleTokeniser : public Tokeniser {
9
+ public:
10
+ static const uint32_t file_mark = 'simt';
11
+ uint32_t mark() { return file_mark; }
12
+
13
+ SimpleTokeniser(TextPipeline *pipeline) : Tokeniser(pipeline) {}
14
+ void tokenise(char *text);
15
+ };
16
+
17
+ }
18
+ }
19
+
20
+ #endif
@@ -0,0 +1,19 @@
1
+ #ifndef __tokeniser_h__
2
+ #define __tokeniser_h__
3
+
4
+ namespace Preprocessing {
5
+ namespace Text {
6
class TextPipeline;

// Base class for tokenisers. Holds the pipeline pointer passed to the
// constructor; subclasses override tokenise() and must implement mark() to
// return their identifying constant.
class Tokeniser {
public:
  TextPipeline *pipeline;
  Tokeniser(TextPipeline *pipeline) : pipeline(pipeline) {}

  // Virtual so that tokenisers deleted through a Tokeniser pointer run the
  // derived class destructor. The pipeline pointer is not deleted here.
  virtual ~Tokeniser() {}

  // Default implementation does nothing.
  virtual void tokenise(char *text) {}
  virtual uint32_t mark() = 0;
};
15
+
16
+ }
17
+ }
18
+
19
+ #endif
@@ -0,0 +1 @@
1
// Intentionally empty; the function only needs to exist as a symbol.
void mkmf_marker() {
}
@@ -0,0 +1,29 @@
1
+ #include "data_set/data_set.h"
2
+ #include "data_set/dense/dense_data_set.h"
3
+ #include "data_set/sparse/sparse_data_set.h"
4
+ #include "data_set/example.h"
5
+ #include "data_set/dense/dense_example.h"
6
+ #include "data_set/sparse/sparse_example.h"
7
+ #include "data_set/feature.h"
8
+ #include "data_set/features/nominal_feature.h"
9
+ #include "data_set/features/numeric_feature.h"
10
+
11
+ #include "classifier/classifier.h"
12
+ #include "classifier/naive_bayes/naive_bayes_classifier.h"
13
+
14
+ #include "metrics/confusion_matrix.h"
15
+
16
+ #include "preprocessing/text/text_pipeline.h"
17
+ #include "preprocessing/examples/example_preprocessor.h"
18
+ #include "preprocessing/examples/weights/binary_weight.h"
19
+ #include "preprocessing/examples/weights/local_weight.h"
20
+
21
+ #include "model/model.h"
22
+
23
+ #include "storage/storage.h"
24
+ #include "storage/arff.h"
25
+ #include "storage/folders.h"
26
+ #include "storage/binary.h"
27
+
28
+ // marker function that lets mkmf detect that this library exists
29
+ void mkmf_marker();
@@ -0,0 +1,198 @@
1
+ #include "arff.h"
2
+ #include <stdexcept>
3
+ #include <fstream>
4
+ #include <iostream>
5
+ #include <stdlib.h>
6
+ #include <vector>
7
+ using namespace std;
8
+
9
+ #define BUFFER_SIZE (10 * 1024)
10
+ #define RELATION_PREFIX "@relation "
11
+ #define RELATION_PREFIX_LENGTH 10
12
+ #define ATTRIBUTE_PREFIX "@attribute "
13
+ #define ATTRIBUTE_PREFIX_LENGTH 11
14
+ #define DATA_PREFIX "@data"
15
+ #define DATA_PREFIX_LENGTH 5
16
+ #define NUMERIC_TYPE "numeric"
17
+ #define NUMERIC_TYPE_LENGTH 7
18
+
19
+
20
// All of the macros below operate on a variable named `str` of type char**
// that must be in scope where they are expanded; they advance *str and may
// overwrite characters of the underlying buffer with NUL terminators.
//
// The <cctype> classification functions are undefined for negative values,
// so characters are converted to unsigned char before being tested.

// skip whitespace and other delimiters
#define skip_delimiters(conditions) \
  while(conditions)\
    (*str)++;

#define skip_only_whitespace() skip_delimiters((isspace(static_cast<unsigned char>(**str))))
#define skip_whitespace_and_nominal_list_markers() skip_delimiters((isspace(static_cast<unsigned char>(**str)) || (**str == ',') || (**str == '}')))
#define skip_whitespace_and_example_list_markers() skip_delimiters((isspace(static_cast<unsigned char>(**str)) || (**str == ',') || (**str == '}')))


// move the character position until the end of a token, then terminate the
// token in place and step past the terminator (unless already at the end of
// the string)
#define tokenise_while(conditions) \
  while(**str && (conditions))\
    (*str)++;\
  if(**str) {\
    **str = 0;\
    (*str)++;\
  }

#define tokenise_space()   tokenise_while(!isspace(static_cast<unsigned char>(**str)))
// a quoted token ends at the first double quote that is not preceded by a
// backslash. Reading *(*str - 1) is safe because the scan always starts one
// character after the opening quote.
#define tokenise_quote()   tokenise_while((**str != '"' || (*(*str - 1) == '\\')))
#define tokenise_value()   tokenise_while(!isspace(static_cast<unsigned char>(**str)) && (**str != ','))
#define tokenise_nominal() tokenise_while(!isspace(static_cast<unsigned char>(**str)) && (**str != ',') && (**str != '}'))


// determine whether the token is quote delimited or otherwise,
// and cleanup whitespace etc. at the end. Expands to a complete function
// body: declares `start`, tokenises, skips trailing delimiters and returns
// the start of the NUL-terminated token.
#define tokenise(tokeniser, skipper) \
  char *start;\
  if(**str == '"') {\
    start = ++*str;\
    tokenise_quote();\
  } else {\
    start = *str;\
    tokeniser();\
  }\
  skipper();\
  return start;

// Extracts an attribute name (optionally quoted) and leaves *str at the next
// non-whitespace character.
inline char *tokenise_attribute_name(char **str) {
  tokenise(tokenise_space, skip_only_whitespace);
}

// Extracts one value from a nominal value list and leaves *str past any
// following whitespace, commas and closing braces.
inline char *tokenise_nominal_value(char **str) {
  tokenise(tokenise_nominal, skip_whitespace_and_nominal_list_markers);
}

// Extracts one value from a data row and leaves *str past any following
// whitespace, commas and closing braces.
inline char *tokenise_example_value(char **str) {
  tokenise(tokenise_value, skip_whitespace_and_example_list_markers);
}
70
+
71
// Downcases `buffer` and compares its first `length` characters against
// `compare_to`. Only `buffer` is downcased, so `compare_to` must already be
// lower case. Returns true when all `length` characters match; a `length` of
// zero or less matches trivially.
//
// The comparison stops at the first mismatch, so a NUL inside `buffer` ends
// it as long as `compare_to` has no NUL within its first `length` characters.
inline bool matches(const char *buffer, const char *compare_to, int length) {
  for(int i = 0; i < length; i++) {
    // tolower() is undefined for negative values, so go through unsigned char
    const int actual = tolower(static_cast<unsigned char>(buffer[i]));
    const int expected = static_cast<unsigned char>(compare_to[i]);
    if(actual != expected)
      return false;
  }
  return true;
}
78
+
79
// Advances *str past any leading whitespace; stops at the first
// non-whitespace character or at the terminating NUL.
inline void skip_whitespace(char **str) {
  // isspace() is undefined for negative values, so go through unsigned char
  while(isspace(static_cast<unsigned char>(**str)))
    ++*str;
}
82
+
83
+
84
+
85
+ DataSet::DataSet *Storage::ARFF::read() {
86
+ DataSet::DenseDataSet *data_set = new DataSet::DenseDataSet();
87
+ DataSet::NominalFeature *feature = NULL;
88
+ DataSet::DenseExample *example = NULL;
89
+ State state = relation;
90
+ vector<bool> numeric_feature;
91
+ char buffer[BUFFER_SIZE];
92
+ char *line, *name, *value;
93
+ string value_str;
94
+ int value_index;
95
+ fstream file;
96
+
97
+ file.open(path.c_str(), fstream::in);
98
+
99
+ while(file.good()) {
100
+ file.getline(buffer, BUFFER_SIZE);
101
+
102
+ switch(*buffer) {
103
+ // blank line
104
+ case '\0':
105
+ break;
106
+
107
+ // comments start with percent
108
+ case '%':
109
+ break;
110
+
111
+ // transitioning states, or adding a new feature
112
+ case '@':
113
+ switch(state) {
114
+ case relation:
115
+ if(!matches(buffer, RELATION_PREFIX, RELATION_PREFIX_LENGTH))
116
+ throw runtime_error("Expected relation declaration");
117
+ line = buffer + RELATION_PREFIX_LENGTH;
118
+ skip_whitespace(&line);
119
+ data_set->name = line;
120
+ state = attributes;
121
+ break;
122
+
123
+ case attributes:
124
+ // check if this is an attribute declaration
125
+ if(matches(buffer, ATTRIBUTE_PREFIX, ATTRIBUTE_PREFIX_LENGTH)) {
126
+ line = buffer + ATTRIBUTE_PREFIX_LENGTH;
127
+ skip_whitespace(&line);
128
+
129
+ // extract the attribute's name
130
+ name = tokenise_attribute_name(&line);
131
+ if(!*line)
132
+ throw runtime_error("Unexpected end of line");
133
+
134
+ // add a numeric attribute
135
+ if(matches(line, NUMERIC_TYPE, NUMERIC_TYPE_LENGTH)) {
136
+ data_set->new_numeric_feature(name);
137
+ numeric_feature.push_back(true);
138
+
139
+ // add a nominal attribute
140
+ } else if(*line == '{') {
141
+ feature = data_set->new_nominal_feature(name);
142
+ numeric_feature.push_back(false);
143
+ line++;
144
+
145
+ while(*line) {
146
+ value_str = tokenise_nominal_value(&line);
147
+ feature->add_value(value_str);
148
+ }
149
+
150
+ // other attribute types aren't supported
151
+ } else {
152
+ throw runtime_error("Unknown attribute type - only numeric and nominal attributes are supported");
153
+ }
154
+
155
+ // otherwise could be the start of the data section
156
+ } else {
157
+ if(matches(buffer, DATA_PREFIX, DATA_PREFIX_LENGTH))
158
+ state = data;
159
+ else
160
+ throw runtime_error("Expected attribute or data declaration");
161
+ }
162
+ break;
163
+
164
+ case data:
165
+ throw runtime_error("Unexpected declaration line, currently in data section");
166
+ }
167
+ break;
168
+
169
+ // adding data
170
+ default:
171
+ if(state != data)
172
+ throw runtime_error("Expected data section");
173
+
174
+ example = data_set->new_example();
175
+ value_index = 0;
176
+ line = buffer;
177
+
178
+ while(*line) {
179
+ value = tokenise_example_value(&line);
180
+ if(numeric_feature[value_index]) {
181
+ example->set_value(value_index, atof(value));
182
+ } else {
183
+ value_str = value;
184
+ feature = (DataSet::NominalFeature *)data_set->features[value_index];
185
+ example->set_value(value_index, feature->indexes[value_str]);
186
+ }
187
+ value_index++;
188
+ }
189
+ }
190
+ }
191
+
192
+ data_set->set_category_index(data_set->features.size() - 1);
193
+ file.close();
194
+ return data_set;
195
+ }
196
+
197
+ void Storage::ARFF::write(DataSet::DataSet *data_set) {
198
+ }
@@ -0,0 +1,26 @@
1
+ #ifndef __arff_h__
2
+ #define __arff_h__
3
+ #include "storage/storage.h"
4
+ #include "data_set/dense/dense_data_set.h"
5
+ #include <algorithm>
6
+ #include <cctype>
7
+ #include <string>
8
+ using namespace std;
9
+
10
+ namespace Storage {
11
+ class ARFF : public Storage {
12
+ typedef enum {
13
+ relation,
14
+ attributes,
15
+ data
16
+ } State;
17
+
18
+ public:
19
+ string path;
20
+ ARFF(string path) : path(path) {}
21
+ DataSet::DataSet *read();
22
+ void write(DataSet::DataSet *data_set);
23
+ };
24
+ }
25
+
26
+ #endif