thera 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
#include "stop_words.h"
|
2
|
+
#include <tr1/unordered_set>
|
3
|
+
#include <iostream>
|
4
|
+
using namespace std;
|
5
|
+
using namespace tr1;
|
6
|
+
|
7
|
+
static unordered_set<string> *stop_words = NULL;
|
8
|
+
static int stop_word_count = 586;
|
9
|
+
static string stop_word_list[] = {
|
10
|
+
"a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
|
11
|
+
"after", "afterwards", "again", "against", "ago", "ahead", "all", "allow", "allows", "almost",
|
12
|
+
"alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst",
|
13
|
+
"among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything",
|
14
|
+
"anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "around", "as",
|
15
|
+
"aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "back",
|
16
|
+
"backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before",
|
17
|
+
"beforehand", "begin", "behind", "being", "believe", "below", "beside", "besides", "best", "better",
|
18
|
+
"between", "beyond", "both", "brief", "but", "by", "c", "came", "can", "cannot",
|
19
|
+
"cant", "caption", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com",
|
20
|
+
"come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding",
|
21
|
+
"could", "course", "currently", "d", "dare", "definitely", "described", "despite", "did", "different",
|
22
|
+
"directly", "do", "does", "doing", "done", "down", "downwards", "during", "e", "each",
|
23
|
+
"edu", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough",
|
24
|
+
"entirely", "especially", "et", "etc", "even", "ever", "evermore", "every", "everybody", "everyone",
|
25
|
+
"everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther",
|
26
|
+
"few", "fewer", "fifth", "first", "five", "followed", "following", "follows", "for", "forever",
|
27
|
+
"former", "formerly", "forth", "forward", "found", "four", "from", "further", "furthermore", "g",
|
28
|
+
"get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got",
|
29
|
+
"gotten", "greetings", "h", "had", "half", "happens", "hardly", "has", "have", "having",
|
30
|
+
"he", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
|
31
|
+
"hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit",
|
32
|
+
"however", "hundred", "i", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc",
|
33
|
+
"indeed", "indicate", "indicated", "indicates", "inner", "inside", "insofar", "instead", "into", "inward",
|
34
|
+
"is", "it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept",
|
35
|
+
"know", "known", "knows", "l", "last", "lately", "later", "latter", "latterly", "least",
|
36
|
+
"less", "lest", "let", "like", "liked", "likely", "likewise", "little", "look", "looking",
|
37
|
+
"looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many",
|
38
|
+
"may", "maybe", "me", "mean", "meantime", "meanwhile", "merely", "might", "mine", "minus",
|
39
|
+
"miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my",
|
40
|
+
"myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs",
|
41
|
+
"neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no",
|
42
|
+
"nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "not", "nothing", "notwithstanding",
|
43
|
+
"novel", "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", "ok",
|
44
|
+
"okay", "old", "on", "once", "one", "ones", "only", "onto", "opposite", "or",
|
45
|
+
"other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over",
|
46
|
+
"overall", "own", "p", "particular", "particularly", "past", "per", "perhaps", "placed", "please",
|
47
|
+
"plus", "possible", "presumably", "probably", "provided", "provides", "q", "que", "quite", "qv",
|
48
|
+
"r", "rather", "rd", "re", "really", "reasonably", "recent", "recently", "regarding", "regardless",
|
49
|
+
"regards", "relatively", "respectively", "right", "round", "s", "said", "same", "saw", "say",
|
50
|
+
"saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
|
51
|
+
"seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall",
|
52
|
+
"she", "should", "since", "six", "so", "some", "somebody", "someday", "somehow", "someone",
|
53
|
+
"something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying",
|
54
|
+
"still", "sub", "such", "sup", "sure", "t", "take", "taken", "taking", "tell",
|
55
|
+
"tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their",
|
56
|
+
"theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
|
57
|
+
"theres", "thereupon", "these", "they", "thing", "things", "think", "third", "thirty", "this",
|
58
|
+
"thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "till",
|
59
|
+
"to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try",
|
60
|
+
"trying", "twice", "two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless",
|
61
|
+
"unlike", "unlikely", "until", "unto", "up", "upon", "upwards", "us", "use", "used",
|
62
|
+
"useful", "uses", "using", "usually", "v", "value", "various", "versus", "very", "via",
|
63
|
+
"viz", "vs", "w", "want", "wants", "was", "way", "we", "welcome", "well",
|
64
|
+
"went", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas",
|
65
|
+
"whereby", "wherein", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whither",
|
66
|
+
"who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "willing", "wish",
|
67
|
+
"with", "within", "without", "wonder", "would", "x", "y", "yes", "yet", "you",
|
68
|
+
"your", "yours", "yourself", "yourselves", "z", "zero"
|
69
|
+
};
|
70
|
+
|
71
|
+
// Constructs a stop-word selector. The first construction allocates the
// file-level lookup set and fills it from stop_word_list; every later
// construction finds the set already present and does nothing. Nothing in
// this file deletes the set again.
Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
  if(stop_words != NULL)
    return;

  stop_words = new unordered_set<string>();
  stop_words->insert(stop_word_list, stop_word_list + stop_word_count);
}
|
78
|
+
|
79
|
+
bool Preprocessing::Text::StopWords::select(char *start, char *end) {
|
80
|
+
string token = string(start, (end - start) + 1);
|
81
|
+
return stop_words->count(token) == 0;
|
82
|
+
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#ifndef __stop_words_h__
|
2
|
+
#define __stop_words_h__
|
3
|
+
#include "token_selector.h"
|
4
|
+
|
5
|
+
namespace Preprocessing {
|
6
|
+
namespace Text {
|
7
|
+
|
8
|
+
class StopWords : public TokenSelector {
|
9
|
+
public:
|
10
|
+
static const uint32_t file_mark = 'stop';
|
11
|
+
uint32_t mark() { return file_mark; }
|
12
|
+
|
13
|
+
StopWords();
|
14
|
+
bool select(char *start, char *end);
|
15
|
+
};
|
16
|
+
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
20
|
+
#endif
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef __token_selector_h__
|
2
|
+
#define __token_selector_h__
|
3
|
+
#include <stdint.h>
|
4
|
+
|
5
|
+
namespace Preprocessing {
  namespace Text {

    // Abstract base class for token selectors.
    class TokenSelector {
    public:
      // Virtual so that a selector deleted through a TokenSelector pointer
      // runs the derived destructor instead of invoking undefined behaviour.
      virtual ~TokenSelector() {}

      // Decides whether the token delimited by start/end is kept.
      // The default implementation accepts every token.
      virtual bool select(char *start, char *end) { return true; }

      // Identifier for the concrete selector type; every subclass must
      // provide one.
      virtual uint32_t mark() = 0;
    };

  }
}
|
16
|
+
|
17
|
+
#endif
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include "preprocessing/text/text_pipeline.h"
|
2
|
+
#include "simple_tokeniser.h"
|
3
|
+
|
4
|
+
void Preprocessing::Text::SimpleTokeniser::tokenise(char *text) {
|
5
|
+
char *start, *end;
|
6
|
+
bool intoken = false;
|
7
|
+
bool active = true;
|
8
|
+
|
9
|
+
// to simplify the code, the while condition is not while(*text),
|
10
|
+
// because the if(intoken) block needs to run when *text == 0 at
|
11
|
+
// the end of the string.
|
12
|
+
while(active) {
|
13
|
+
active = *text;
|
14
|
+
if(isalnum(*text)) {
|
15
|
+
if(!intoken) {
|
16
|
+
intoken = true;
|
17
|
+
start = text;
|
18
|
+
}
|
19
|
+
} else {
|
20
|
+
if(intoken) {
|
21
|
+
intoken = false;
|
22
|
+
*text = 0;
|
23
|
+
end = text;
|
24
|
+
pipeline->process_token(start, end);
|
25
|
+
}
|
26
|
+
}
|
27
|
+
text++;
|
28
|
+
}
|
29
|
+
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#ifndef __simple_tokeniser_h__
|
2
|
+
#define __simple_tokeniser_h__
|
3
|
+
#include "tokeniser.h"
|
4
|
+
|
5
|
+
namespace Preprocessing {
|
6
|
+
namespace Text {
|
7
|
+
|
8
|
+
class SimpleTokeniser : public Tokeniser {
|
9
|
+
public:
|
10
|
+
static const uint32_t file_mark = 'simt';
|
11
|
+
uint32_t mark() { return file_mark; }
|
12
|
+
|
13
|
+
SimpleTokeniser(TextPipeline *pipeline) : Tokeniser(pipeline) {}
|
14
|
+
void tokenise(char *text);
|
15
|
+
};
|
16
|
+
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
20
|
+
#endif
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef __tokeniser_h__
|
2
|
+
#define __tokeniser_h__
|
3
|
+
|
4
|
+
namespace Preprocessing {
  namespace Text {
    class TextPipeline;

    // Abstract base class for tokenisers.
    // NOTE(review): this header uses uint32_t but includes nothing itself,
    // so it relies on the including file having pulled in <stdint.h> first.
    class Tokeniser {
    public:
      // Pipeline supplied at construction; stored, not owned here.
      TextPipeline *pipeline;

      Tokeniser(TextPipeline *pipeline) : pipeline(pipeline) {}

      // Virtual so that a tokeniser deleted through a Tokeniser pointer
      // runs the derived destructor instead of invoking undefined behaviour.
      virtual ~Tokeniser() {}

      // Tokenises `text`; the default implementation does nothing.
      virtual void tokenise(char *text) {}

      // Identifier for the concrete tokeniser type; every subclass must
      // provide one.
      virtual uint32_t mark() = 0;
    };

  }
}
|
18
|
+
|
19
|
+
#endif
|
@@ -0,0 +1 @@
|
|
1
|
+
// Intentionally empty: the symbol only needs to exist.
void mkmf_marker() {}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#include "data_set/data_set.h"
|
2
|
+
#include "data_set/dense/dense_data_set.h"
|
3
|
+
#include "data_set/sparse/sparse_data_set.h"
|
4
|
+
#include "data_set/example.h"
|
5
|
+
#include "data_set/dense/dense_example.h"
|
6
|
+
#include "data_set/sparse/sparse_example.h"
|
7
|
+
#include "data_set/feature.h"
|
8
|
+
#include "data_set/features/nominal_feature.h"
|
9
|
+
#include "data_set/features/numeric_feature.h"
|
10
|
+
|
11
|
+
#include "classifier/classifier.h"
|
12
|
+
#include "classifier/naive_bayes/naive_bayes_classifier.h"
|
13
|
+
|
14
|
+
#include "metrics/confusion_matrix.h"
|
15
|
+
|
16
|
+
#include "preprocessing/text/text_pipeline.h"
|
17
|
+
#include "preprocessing/examples/example_preprocessor.h"
|
18
|
+
#include "preprocessing/examples/weights/binary_weight.h"
|
19
|
+
#include "preprocessing/examples/weights/local_weight.h"
|
20
|
+
|
21
|
+
#include "model/model.h"
|
22
|
+
|
23
|
+
#include "storage/storage.h"
|
24
|
+
#include "storage/arff.h"
|
25
|
+
#include "storage/folders.h"
|
26
|
+
#include "storage/binary.h"
|
27
|
+
|
28
|
+
// Marker function whose only purpose is to let mkmf detect that this library exists.
|
29
|
+
void mkmf_marker();
|
@@ -0,0 +1,198 @@
|
|
1
|
+
#include "arff.h"
|
2
|
+
#include <stdexcept>
|
3
|
+
#include <fstream>
|
4
|
+
#include <iostream>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <vector>
|
7
|
+
using namespace std;
|
8
|
+
|
9
|
+
#define BUFFER_SIZE (10 * 1024)
|
10
|
+
#define RELATION_PREFIX "@relation "
|
11
|
+
#define RELATION_PREFIX_LENGTH 10
|
12
|
+
#define ATTRIBUTE_PREFIX "@attribute "
|
13
|
+
#define ATTRIBUTE_PREFIX_LENGTH 11
|
14
|
+
#define DATA_PREFIX "@data"
|
15
|
+
#define DATA_PREFIX_LENGTH 5
|
16
|
+
#define NUMERIC_TYPE "numeric"
|
17
|
+
#define NUMERIC_TYPE_LENGTH 7
|
18
|
+
|
19
|
+
|
20
|
+
// Skip whitespace and other delimiters.
//
// skip_delimiters advances *str while `conditions` holds. It expects a
// `char **str` to be in scope at the point of expansion. The body is wrapped
// in do/while(0) so that the macro behaves as a single statement wherever it
// is used.
#define skip_delimiters(conditions) \
  do {\
    while(conditions)\
      (*str)++;\
  } while(0)

// The character is cast to unsigned char before isspace(): passing a
// negative char (any byte >= 0x80 where char is signed) is undefined
// behaviour. Every condition below is false for the terminating NUL, so the
// scan always stops at the end of the string.
#define skip_only_whitespace() skip_delimiters((isspace(static_cast<unsigned char>(**str))))
#define skip_whitespace_and_nominal_list_markers() skip_delimiters((isspace(static_cast<unsigned char>(**str)) || (**str == ',') || (**str == '}')))
#define skip_whitespace_and_example_list_markers() skip_delimiters((isspace(static_cast<unsigned char>(**str)) || (**str == ',') || (**str == '}')))
|
28
|
+
|
29
|
+
|
30
|
+
// Move the character position until the end of a token.
//
// tokenise_while advances *str while `conditions` holds and the end of the
// string has not been reached. If it stopped on a real character (the
// delimiter), that character is overwritten with NUL to terminate the token
// and *str is moved one past it. It expects a `char **str` to be in scope at
// the point of expansion. The body is wrapped in do/while(0) so that the
// macro behaves as a single statement wherever it is used.
#define tokenise_while(conditions) \
  do {\
    while(**str && (conditions))\
      (*str)++;\
    if(**str) {\
      **str = 0;\
      (*str)++;\
    }\
  } while(0)

// Characters are cast to unsigned char before isspace(): passing a negative
// char is undefined behaviour.
#define tokenise_space() tokenise_while(!isspace(static_cast<unsigned char>(**str)))

// A quoted token ends at the first double quote that is NOT preceded by a
// backslash. The previous form only stopped at a quote that WAS preceded by
// a backslash, so an ordinary closing quote was never recognised and the
// token ran to the end of the line. This reads the character before *str,
// so *str must not be at the very start of the buffer (callers position it
// just after the opening quote).
#define tokenise_quote() tokenise_while((**str != '"') || (*(*str - 1) == '\\'))

#define tokenise_value() tokenise_while(!isspace(static_cast<unsigned char>(**str)) && (**str != ','))
#define tokenise_nominal() tokenise_while(!isspace(static_cast<unsigned char>(**str)) && (**str != ',') && (**str != '}'))
|
43
|
+
|
44
|
+
|
45
|
+
// determine whether the token is quote delimited or otherwise,
|
46
|
+
// and cleanup whitespace etc. at the end
|
47
|
+
#define tokenise(tokeniser, skipper) \
|
48
|
+
char *start;\
|
49
|
+
if(**str == '"') {\
|
50
|
+
start = ++*str;\
|
51
|
+
tokenise_quote();\
|
52
|
+
} else {\
|
53
|
+
start = *str;\
|
54
|
+
tokeniser();\
|
55
|
+
}\
|
56
|
+
skipper();\
|
57
|
+
return start;
|
58
|
+
|
59
|
+
inline char *tokenise_attribute_name(char **str) {
|
60
|
+
tokenise(tokenise_space, skip_only_whitespace);
|
61
|
+
}
|
62
|
+
|
63
|
+
inline char *tokenise_nominal_value(char **str) {
|
64
|
+
tokenise(tokenise_nominal, skip_whitespace_and_nominal_list_markers);
|
65
|
+
}
|
66
|
+
|
67
|
+
inline char *tokenise_example_value(char **str) {
|
68
|
+
tokenise(tokenise_value, skip_whitespace_and_example_list_markers);
|
69
|
+
}
|
70
|
+
|
71
|
+
// Downcase and compare two strings of a known length.
//
// Returns true when the first `length` characters of `buffer`, lower-cased,
// equal the first `length` characters of `compare_to`. `compare_to` is
// compared as-is, so it must already be lower case. The comparison stops at
// the first mismatch, which includes the end of a `buffer` shorter than
// `length` as long as `compare_to` has no NUL in its first `length`
// characters.
//
// Both pointers are const so that string literals can be passed legally, and
// characters are converted to unsigned char before tolower() because passing
// a negative char is undefined behaviour.
inline bool matches(const char *buffer, const char *compare_to, int length) {
  for(int i = 0; i < length; ++i) {
    const int actual = tolower(static_cast<unsigned char>(buffer[i]));
    const int expected = static_cast<unsigned char>(compare_to[i]);
    if(actual != expected)
      return false;
  }
  return true;
}
|
78
|
+
|
79
|
+
// Advances *str past any leading whitespace; stops at the first
// non-whitespace character or at the terminating NUL.
inline void skip_whitespace(char **str) {
  // Cast first: passing a negative char to isspace() is undefined behaviour.
  while(isspace(static_cast<unsigned char>(**str)))
    ++*str;
}
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
// Parses the ARFF file at `path` into a newly allocated dense data set and
// returns it.
//
// The file is read line by line through three states: the @relation line,
// then @attribute lines (numeric or nominal only), then data rows after
// @data. Blank lines and lines starting with '%' are ignored. Once the file
// has been read, the last declared feature is set as the category.
//
// Throws std::runtime_error when the file cannot be opened, when a line
// cannot be read (for example because it does not fit in BUFFER_SIZE), when
// a declaration is malformed or out of order, or when a data row holds more
// values than there are declared attributes. The partially built data set is
// deleted before any exception leaves this function.
DataSet::DataSet *Storage::ARFF::read() {
  DataSet::DenseDataSet *data_set = new DataSet::DenseDataSet();
  DataSet::NominalFeature *feature = NULL;
  DataSet::DenseExample *example = NULL;
  State state = relation;
  vector<bool> numeric_feature;
  char buffer[BUFFER_SIZE];
  char *line, *name, *value;
  string value_str;
  int value_index;
  fstream file;

  try {
    file.open(path.c_str(), fstream::in);
    if(!file.is_open())
      throw runtime_error("Unable to open ARFF file");

    while(file.good()) {
      file.getline(buffer, BUFFER_SIZE);

      // getline() sets failbit without eofbit when the line does not fit in
      // the buffer; without this check the rest of the file would be
      // dropped silently. A normal end of file sets eofbit and leaves an
      // empty buffer, which is handled as a blank line below.
      if(file.fail() && !file.eof())
        throw runtime_error("Unable to read line - it may exceed the maximum supported line length");

      switch(*buffer) {
        // blank line
        case '\0':
          break;

        // comments start with percent
        case '%':
          break;

        // transitioning states, or adding a new feature
        case '@':
          switch(state) {
            case relation:
              if(!matches(buffer, RELATION_PREFIX, RELATION_PREFIX_LENGTH))
                throw runtime_error("Expected relation declaration");
              line = buffer + RELATION_PREFIX_LENGTH;
              skip_whitespace(&line);
              data_set->name = line;
              state = attributes;
              break;

            case attributes:
              // check if this is an attribute declaration
              if(matches(buffer, ATTRIBUTE_PREFIX, ATTRIBUTE_PREFIX_LENGTH)) {
                line = buffer + ATTRIBUTE_PREFIX_LENGTH;
                skip_whitespace(&line);

                // extract the attribute's name
                name = tokenise_attribute_name(&line);
                if(!*line)
                  throw runtime_error("Unexpected end of line");

                // add a numeric attribute
                if(matches(line, NUMERIC_TYPE, NUMERIC_TYPE_LENGTH)) {
                  data_set->new_numeric_feature(name);
                  numeric_feature.push_back(true);

                // add a nominal attribute
                } else if(*line == '{') {
                  feature = data_set->new_nominal_feature(name);
                  numeric_feature.push_back(false);
                  line++;

                  while(*line) {
                    value_str = tokenise_nominal_value(&line);
                    feature->add_value(value_str);
                  }

                // other attribute types aren't supported
                } else {
                  throw runtime_error("Unknown attribute type - only numeric and nominal attributes are supported");
                }

              // otherwise could be the start of the data section
              } else {
                if(matches(buffer, DATA_PREFIX, DATA_PREFIX_LENGTH))
                  state = data;
                else
                  throw runtime_error("Expected attribute or data declaration");
              }
              break;

            case data:
              throw runtime_error("Unexpected declaration line, currently in data section");
          }
          break;

        // adding data
        default:
          if(state != data)
            throw runtime_error("Expected data section");

          example = data_set->new_example();
          value_index = 0;
          line = buffer;

          while(*line) {
            value = tokenise_example_value(&line);

            // numeric_feature has one entry per declared attribute; a row
            // with more values would otherwise index past its end.
            if(static_cast<vector<bool>::size_type>(value_index) >= numeric_feature.size())
              throw runtime_error("Too many values in data row");

            if(numeric_feature[value_index]) {
              example->set_value(value_index, atof(value));
            } else {
              value_str = value;
              feature = (DataSet::NominalFeature *)data_set->features[value_index];
              example->set_value(value_index, feature->indexes[value_str]);
            }
            value_index++;
          }
      }
    }

    // NOTE(review): with no declared features, size() - 1 wraps around;
    // confirm how set_category_index treats that value.
    data_set->set_category_index(data_set->features.size() - 1);
    file.close();
  } catch(...) {
    // do not leak the partially built data set
    delete data_set;
    throw;
  }

  return data_set;
}
|
196
|
+
|
197
|
+
// Writing ARFF files is not implemented: this is an empty stub that ignores
// `data_set` and produces no output.
void Storage::ARFF::write(DataSet::DataSet *data_set) {
  (void)data_set;
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#ifndef __arff_h__
|
2
|
+
#define __arff_h__
|
3
|
+
#include "storage/storage.h"
|
4
|
+
#include "data_set/dense/dense_data_set.h"
|
5
|
+
#include <algorithm>
|
6
|
+
#include <cctype>
|
7
|
+
#include <string>
|
8
|
+
using namespace std;
|
9
|
+
|
10
|
+
namespace Storage {
|
11
|
+
class ARFF : public Storage {
|
12
|
+
typedef enum {
|
13
|
+
relation,
|
14
|
+
attributes,
|
15
|
+
data
|
16
|
+
} State;
|
17
|
+
|
18
|
+
public:
|
19
|
+
string path;
|
20
|
+
ARFF(string path) : path(path) {}
|
21
|
+
DataSet::DataSet *read();
|
22
|
+
void write(DataSet::DataSet *data_set);
|
23
|
+
};
|
24
|
+
}
|
25
|
+
|
26
|
+
#endif
|