thera 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +56 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +1 -0
- data/README.rdoc +8 -0
- data/Rakefile +1 -0
- data/ext/Makefile +225 -0
- data/ext/extconf.rb +29 -0
- data/ext/quarry/quarry_toolkit.cpp +148 -0
- data/lib/quarry/Makefile.linux +2 -0
- data/lib/quarry/Makefile.osx +6 -0
- data/lib/quarry/Makefile.targets +23 -0
- data/lib/quarry/obj/.gitkeep +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
- data/lib/quarry/src/classifier/classifier.cpp +32 -0
- data/lib/quarry/src/classifier/classifier.h +59 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
- data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
- data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
- data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
- data/lib/quarry/src/data_set/data_set.cpp +130 -0
- data/lib/quarry/src/data_set/data_set.h +78 -0
- data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
- data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
- data/lib/quarry/src/data_set/example.cpp +10 -0
- data/lib/quarry/src/data_set/example.h +23 -0
- data/lib/quarry/src/data_set/feature.h +36 -0
- data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
- data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
- data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
- data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
- data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
- data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
- data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
- data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
- data/lib/quarry/src/model/model.cpp +29 -0
- data/lib/quarry/src/model/model.h +50 -0
- data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
- data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
- data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
- data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
- data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
- data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
- data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
- data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
- data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
- data/lib/quarry/src/quarry.cpp +1 -0
- data/lib/quarry/src/quarry.h +29 -0
- data/lib/quarry/src/storage/arff.cpp +198 -0
- data/lib/quarry/src/storage/arff.h +26 -0
- data/lib/quarry/src/storage/binary.cpp +457 -0
- data/lib/quarry/src/storage/binary.h +79 -0
- data/lib/quarry/src/storage/folders.cpp +98 -0
- data/lib/quarry/src/storage/folders.h +25 -0
- data/lib/quarry/src/storage/storage.h +19 -0
- data/lib/quarry/src/test.cpp +6 -0
- data/lib/quarry_rb/classifier/classifier.rb +22 -0
- data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
- data/lib/quarry_rb/confusion_matrix.rb +58 -0
- data/lib/quarry_rb/data_set/data_set.rb +42 -0
- data/lib/quarry_rb/data_set/example.rb +33 -0
- data/lib/quarry_rb/data_set/feature.rb +28 -0
- data/lib/quarry_rb/enumerable_helper.rb +32 -0
- data/lib/quarry_rb/model/model.rb +56 -0
- data/lib/quarry_rb/storage/arff.rb +11 -0
- data/lib/quarry_rb/storage/binary.rb +23 -0
- data/lib/quarry_rb/storage/folders.rb +11 -0
- data/lib/quarry_rb/text_pipeline.rb +16 -0
- data/lib/thera.rb +20 -0
- data/test/helper.rb +19 -0
- data/test/test_quarry.rb +33 -0
- data/thera.gemspec +21 -0
- metadata +148 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#include "stop_words.h"
|
|
2
|
+
#include <tr1/unordered_set>
|
|
3
|
+
#include <iostream>
|
|
4
|
+
using namespace std;
|
|
5
|
+
using namespace tr1;
|
|
6
|
+
|
|
7
|
+
static unordered_set<string> *stop_words = NULL;
|
|
8
|
+
// English stop words that StopWords filters out. The entries are copied into
// a hash set the first time a StopWords selector is constructed.
static std::string stop_word_list[] = {
  "a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
  "after", "afterwards", "again", "against", "ago", "ahead", "all", "allow", "allows", "almost",
  "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst",
  "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything",
  "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "around", "as",
  "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "back",
  "backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before",
  "beforehand", "begin", "behind", "being", "believe", "below", "beside", "besides", "best", "better",
  "between", "beyond", "both", "brief", "but", "by", "c", "came", "can", "cannot",
  "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com",
  "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding",
  "could", "course", "currently", "d", "dare", "definitely", "described", "despite", "did", "different",
  "directly", "do", "does", "doing", "done", "down", "downwards", "during", "e", "each",
  "edu", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough",
  "entirely", "especially", "et", "etc", "even", "ever", "evermore", "every", "everybody", "everyone",
  "everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther",
  "few", "fewer", "fifth", "first", "five", "followed", "following", "follows", "for", "forever",
  "former", "formerly", "forth", "forward", "found", "four", "from", "further", "furthermore", "g",
  "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got",
  "gotten", "greetings", "h", "had", "half", "happens", "hardly", "has", "have", "having",
  "he", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
  "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit",
  "however", "hundred", "i", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc",
  "indeed", "indicate", "indicated", "indicates", "inner", "inside", "insofar", "instead", "into", "inward",
  "is", "it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept",
  "know", "known", "knows", "l", "last", "lately", "later", "latter", "latterly", "least",
  "less", "lest", "let", "like", "liked", "likely", "likewise", "little", "look", "looking",
  "looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many",
  "may", "maybe", "me", "mean", "meantime", "meanwhile", "merely", "might", "mine", "minus",
  "miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my",
  "myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs",
  "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no",
  "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "not", "nothing", "notwithstanding",
  "novel", "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", "ok",
  "okay", "old", "on", "once", "one", "ones", "only", "onto", "opposite", "or",
  "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over",
  "overall", "own", "p", "particular", "particularly", "past", "per", "perhaps", "placed", "please",
  "plus", "possible", "presumably", "probably", "provided", "provides", "q", "que", "quite", "qv",
  "r", "rather", "rd", "re", "really", "reasonably", "recent", "recently", "regarding", "regardless",
  "regards", "relatively", "respectively", "right", "round", "s", "said", "same", "saw", "say",
  "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
  "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall",
  "she", "should", "since", "six", "so", "some", "somebody", "someday", "somehow", "someone",
  "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying",
  "still", "sub", "such", "sup", "sure", "t", "take", "taken", "taking", "tell",
  "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their",
  "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
  "theres", "thereupon", "these", "they", "thing", "things", "think", "third", "thirty", "this",
  "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "till",
  "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try",
  "trying", "twice", "two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless",
  "unlike", "unlikely", "until", "unto", "up", "upon", "upwards", "us", "use", "used",
  "useful", "uses", "using", "usually", "v", "value", "various", "versus", "very", "via",
  "viz", "vs", "w", "want", "wants", "was", "way", "we", "welcome", "well",
  "went", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas",
  "whereby", "wherein", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whither",
  "who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "willing", "wish",
  "with", "within", "without", "wonder", "would", "x", "y", "yes", "yet", "you",
  "your", "yours", "yourself", "yourselves", "z", "zero"
};

// Number of entries in stop_word_list. Derived from the array itself so the
// count cannot drift out of sync when words are added or removed (a stale
// hand-written count would either skip words or read past the array).
static int stop_word_count =
  static_cast<int>(sizeof(stop_word_list) / sizeof(stop_word_list[0]));
|
|
70
|
+
|
|
71
|
+
// Builds the shared stop-word set the first time any StopWords selector is
// constructed; later constructions reuse the already populated set.
// The set is allocated once and is not freed by this class.
Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
  // Already populated by an earlier instance - nothing left to do.
  if(stop_words != NULL)
    return;

  stop_words = new unordered_set<string>();
  const string *const list_end = stop_word_list + stop_word_count;
  for(const string *word = stop_word_list; word != list_end; ++word)
    stop_words->insert(*word);
}
|
|
78
|
+
|
|
79
|
+
bool Preprocessing::Text::StopWords::select(char *start, char *end) {
|
|
80
|
+
string token = string(start, (end - start) + 1);
|
|
81
|
+
return stop_words->count(token) == 0;
|
|
82
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#ifndef __stop_words_h__
|
|
2
|
+
#define __stop_words_h__
|
|
3
|
+
#include "token_selector.h"
|
|
4
|
+
|
|
5
|
+
namespace Preprocessing {
|
|
6
|
+
namespace Text {
|
|
7
|
+
|
|
8
|
+
// Token selector that filters tokens against a built-in stop-word list.
class StopWords : public TokenSelector {
public:
  // Tag reported by mark() for this selector type. This is a multi-character
  // literal, so its numeric value is implementation-defined.
  // NOTE(review): presumably used to identify the selector when a pipeline is
  // stored - confirm against the storage code.
  static const uint32_t file_mark = 'stop';

  StopWords();

  // Overrides of the TokenSelector interface.
  virtual uint32_t mark() { return file_mark; }
  virtual bool select(char *start, char *end);
};
|
|
16
|
+
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
#endif
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#ifndef __token_selector_h__
|
|
2
|
+
#define __token_selector_h__
|
|
3
|
+
#include <stdint.h>
|
|
4
|
+
|
|
5
|
+
namespace Preprocessing {
|
|
6
|
+
namespace Text {
|
|
7
|
+
|
|
8
|
+
// Abstract base class for objects that decide whether a token is selected.
class TokenSelector {
public:
  // Returns true when the token delimited by `start` and `end` is selected.
  // The default implementation selects every token.
  virtual bool select(char *start, char *end) { return true; }

  // Identifier for the concrete selector type; every subclass must supply one.
  virtual uint32_t mark() = 0;

  // Virtual so that deleting a selector through a TokenSelector pointer runs
  // the derived class's destructor instead of invoking undefined behaviour.
  // Declared last so the existing virtual functions keep their order.
  virtual ~TokenSelector() {}
};
|
|
13
|
+
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
#endif
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#include "preprocessing/text/text_pipeline.h"
|
|
2
|
+
#include "simple_tokeniser.h"
|
|
3
|
+
|
|
4
|
+
void Preprocessing::Text::SimpleTokeniser::tokenise(char *text) {
|
|
5
|
+
char *start, *end;
|
|
6
|
+
bool intoken = false;
|
|
7
|
+
bool active = true;
|
|
8
|
+
|
|
9
|
+
// to simplify the code, the while condition is not while(*text),
|
|
10
|
+
// because the if(intoken) block needs to run when *text == 0 at
|
|
11
|
+
// the end of the string.
|
|
12
|
+
while(active) {
|
|
13
|
+
active = *text;
|
|
14
|
+
if(isalnum(*text)) {
|
|
15
|
+
if(!intoken) {
|
|
16
|
+
intoken = true;
|
|
17
|
+
start = text;
|
|
18
|
+
}
|
|
19
|
+
} else {
|
|
20
|
+
if(intoken) {
|
|
21
|
+
intoken = false;
|
|
22
|
+
*text = 0;
|
|
23
|
+
end = text;
|
|
24
|
+
pipeline->process_token(start, end);
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
text++;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#ifndef __simple_tokeniser_h__
|
|
2
|
+
#define __simple_tokeniser_h__
|
|
3
|
+
#include "tokeniser.h"
|
|
4
|
+
|
|
5
|
+
namespace Preprocessing {
|
|
6
|
+
namespace Text {
|
|
7
|
+
|
|
8
|
+
class SimpleTokeniser : public Tokeniser {
|
|
9
|
+
public:
|
|
10
|
+
static const uint32_t file_mark = 'simt';
|
|
11
|
+
uint32_t mark() { return file_mark; }
|
|
12
|
+
|
|
13
|
+
SimpleTokeniser(TextPipeline *pipeline) : Tokeniser(pipeline) {}
|
|
14
|
+
void tokenise(char *text);
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
#endif
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#ifndef __tokeniser_h__
|
|
2
|
+
#define __tokeniser_h__
|
|
3
|
+
|
|
4
|
+
namespace Preprocessing {
|
|
5
|
+
namespace Text {
|
|
6
|
+
class TextPipeline;

// Abstract base class for tokenisers that feed tokens into a TextPipeline.
class Tokeniser {
public:
  // Pipeline that receives the tokens; stored as given, not owned here.
  TextPipeline *pipeline;

  Tokeniser(TextPipeline *pipeline) : pipeline(pipeline) {}

  // Tokenises `text`; the default implementation does nothing.
  virtual void tokenise(char *text) {}

  // Identifier for the concrete tokeniser type; every subclass must supply one.
  virtual uint32_t mark() = 0;

  // Virtual so that deleting a tokeniser through a Tokeniser pointer runs the
  // derived class's destructor instead of invoking undefined behaviour.
  // Declared last so the existing virtual functions keep their order.
  virtual ~Tokeniser() {}
};
|
|
15
|
+
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
#endif
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Intentionally empty: this symbol exists only so that mkmf can detect that
// the library is present.
void mkmf_marker() {
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#include "data_set/data_set.h"
|
|
2
|
+
#include "data_set/dense/dense_data_set.h"
|
|
3
|
+
#include "data_set/sparse/sparse_data_set.h"
|
|
4
|
+
#include "data_set/example.h"
|
|
5
|
+
#include "data_set/dense/dense_example.h"
|
|
6
|
+
#include "data_set/sparse/sparse_example.h"
|
|
7
|
+
#include "data_set/feature.h"
|
|
8
|
+
#include "data_set/features/nominal_feature.h"
|
|
9
|
+
#include "data_set/features/numeric_feature.h"
|
|
10
|
+
|
|
11
|
+
#include "classifier/classifier.h"
|
|
12
|
+
#include "classifier/naive_bayes/naive_bayes_classifier.h"
|
|
13
|
+
|
|
14
|
+
#include "metrics/confusion_matrix.h"
|
|
15
|
+
|
|
16
|
+
#include "preprocessing/text/text_pipeline.h"
|
|
17
|
+
#include "preprocessing/examples/example_preprocessor.h"
|
|
18
|
+
#include "preprocessing/examples/weights/binary_weight.h"
|
|
19
|
+
#include "preprocessing/examples/weights/local_weight.h"
|
|
20
|
+
|
|
21
|
+
#include "model/model.h"
|
|
22
|
+
|
|
23
|
+
#include "storage/storage.h"
|
|
24
|
+
#include "storage/arff.h"
|
|
25
|
+
#include "storage/folders.h"
|
|
26
|
+
#include "storage/binary.h"
|
|
27
|
+
|
|
28
|
+
// Marker function whose only purpose is to let mkmf detect that this library exists.
|
|
29
|
+
void mkmf_marker();
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
#include "arff.h"
|
|
2
|
+
#include <stdexcept>
|
|
3
|
+
#include <fstream>
|
|
4
|
+
#include <iostream>
|
|
5
|
+
#include <stdlib.h>
|
|
6
|
+
#include <vector>
|
|
7
|
+
using namespace std;
|
|
8
|
+
|
|
9
|
+
// Size in bytes of the line buffer used while reading an ARFF file.
#define BUFFER_SIZE (10 * 1024)

// Section keywords and type names recognised by the parser. Each *_LENGTH is
// derived from its literal (sizeof minus the NUL terminator) so the two can
// never disagree; the values are cast to int, the type they were written as.
#define RELATION_PREFIX "@relation "
#define RELATION_PREFIX_LENGTH (static_cast<int>(sizeof(RELATION_PREFIX) - 1))
#define ATTRIBUTE_PREFIX "@attribute "
#define ATTRIBUTE_PREFIX_LENGTH (static_cast<int>(sizeof(ATTRIBUTE_PREFIX) - 1))
#define DATA_PREFIX "@data"
#define DATA_PREFIX_LENGTH (static_cast<int>(sizeof(DATA_PREFIX) - 1))
#define NUMERIC_TYPE "numeric"
#define NUMERIC_TYPE_LENGTH (static_cast<int>(sizeof(NUMERIC_TYPE) - 1))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
// All of the macros below operate on a `char **str` that must be in scope at
// the point of use; *str is the current parse position.
//
// Character classification always goes through an unsigned char cast:
// passing a negative plain char to isspace() is undefined behaviour.

// Advance *str past every leading character for which `conditions` holds.
// Every condition used below is false for NUL, so the scan stops at the end
// of the string.
#define skip_delimiters(conditions) \
  while(conditions) \
    (*str)++;

#define skip_only_whitespace() \
  skip_delimiters((isspace(static_cast<unsigned char>(**str))))
#define skip_whitespace_and_nominal_list_markers() \
  skip_delimiters((isspace(static_cast<unsigned char>(**str)) || (**str == ',') || (**str == '}')))
#define skip_whitespace_and_example_list_markers() \
  skip_delimiters((isspace(static_cast<unsigned char>(**str)) || (**str == ',') || (**str == '}')))


// Advance *str to the end of the current token (while `conditions` holds),
// then overwrite the delimiter with NUL and step past it. If the token runs
// to the end of the string nothing is overwritten.
#define tokenise_while(conditions) \
  while(**str && (conditions)) \
    (*str)++; \
  if(**str) { \
    **str = 0; \
    (*str)++; \
  }

#define tokenise_space() \
  tokenise_while(!isspace(static_cast<unsigned char>(**str)))

// A quoted token ends at the first quote that is NOT preceded by a backslash.
// (The previous condition tested `!= '\\'`, which stopped only at escaped
// quotes and therefore ran straight past the real closing quote.)
#define tokenise_quote() \
  tokenise_while((**str != '"' || (*(*str - 1) == '\\')))

#define tokenise_value() \
  tokenise_while(!isspace(static_cast<unsigned char>(**str)) && (**str != ','))
#define tokenise_nominal() \
  tokenise_while(!isspace(static_cast<unsigned char>(**str)) && (**str != ',') && (**str != '}'))


// Expands to the complete body of a `char *f(char **str)` function: picks the
// quoted or unquoted tokeniser depending on the first character, skips the
// trailing delimiters with `skipper`, and returns the start of the token.
#define tokenise(tokeniser, skipper) \
  char *start; \
  if(**str == '"') { \
    start = ++*str; \
    tokenise_quote(); \
  } else { \
    start = *str; \
    tokeniser(); \
  } \
  skipper(); \
  return start;
|
|
58
|
+
|
|
59
|
+
// Extracts an attribute name starting at *str: either a quoted token or a run
// of non-whitespace characters. The token is NUL-terminated in place, *str is
// left after any following whitespace, and the start of the token is returned.
inline char *tokenise_attribute_name(char **str) {
  // The macro expands to the whole function body, including the return.
  tokenise(tokenise_space, skip_only_whitespace)
}
|
|
62
|
+
|
|
63
|
+
// Extracts one value from a nominal value list starting at *str: either a
// quoted token or a run ending at whitespace, ',' or '}'. The token is
// NUL-terminated in place, *str is left after any following whitespace,
// commas and closing braces, and the start of the token is returned.
inline char *tokenise_nominal_value(char **str) {
  // The macro expands to the whole function body, including the return.
  tokenise(tokenise_nominal, skip_whitespace_and_nominal_list_markers)
}
|
|
66
|
+
|
|
67
|
+
// Extracts one value from a data row starting at *str: either a quoted token
// or a run ending at whitespace or ','. The token is NUL-terminated in place,
// *str is left after any following whitespace, commas and closing braces, and
// the start of the token is returned.
inline char *tokenise_example_value(char **str) {
  // The macro expands to the whole function body, including the return.
  tokenise(tokenise_value, skip_whitespace_and_example_list_markers)
}
|
|
70
|
+
|
|
71
|
+
// Case-insensitively checks whether the first `length` characters of `buffer`
// equal `compare_to`.
//
// Only `buffer` is downcased, so `compare_to` must already be lower case and
// must hold at least `length` characters. A NUL inside `buffer` ends the
// comparison with a mismatch before anything past the terminator is read
// (as long as `compare_to` itself has no NUL in its first `length` characters).
// Both parameters are const so string literals can be passed legally.
inline bool matches(const char *buffer, const char *compare_to, int length) {
  for(int i = 0; i < length; i++) {
    // tolower() requires a value representable as unsigned char; a negative
    // plain char would be undefined behaviour.
    const int lowered = tolower(static_cast<unsigned char>(buffer[i]));
    if(lowered != static_cast<unsigned char>(compare_to[i]))
      return false;
  }
  return true;
}
|
|
78
|
+
|
|
79
|
+
// Advances *str past any leading whitespace. Stops at the first
// non-whitespace character, which may be the terminating NUL.
inline void skip_whitespace(char **str) {
  // isspace() requires a value representable as unsigned char; a negative
  // plain char would be undefined behaviour.
  while(isspace(static_cast<unsigned char>(**str)))
    ++*str;
}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
// Parses the ARFF file at `path` into a newly allocated dense data set, which
// is returned to the caller.
//
// The file is read line by line through three states: the relation
// declaration, then attribute declarations, then data rows. Blank lines and
// lines starting with '%' are ignored. Only numeric and nominal attributes
// are supported. After parsing, the last declared feature is set as the
// category index.
//
// Throws runtime_error when the file cannot be opened, a line cannot be read
// (for example because it does not fit in the line buffer), a declaration is
// malformed or out of place, or a data row holds more values than there are
// declared attributes. The partially built data set is deleted before any
// exception leaves this function.
DataSet::DataSet *Storage::ARFF::read() {
  DataSet::DenseDataSet *data_set = NULL;
  DataSet::NominalFeature *feature = NULL;
  DataSet::DenseExample *example = NULL;
  State state = relation;
  vector<bool> numeric_feature;   // per attribute: true when numeric, false when nominal
  char buffer[BUFFER_SIZE];
  char *line, *name, *value;
  string value_str;
  int value_index;
  fstream file;

  // Fail loudly instead of silently returning an empty data set.
  file.open(path.c_str(), fstream::in);
  if(!file.is_open())
    throw runtime_error("Unable to open ARFF file: " + path);

  data_set = new DataSet::DenseDataSet();

  try {
    while(file.good()) {
      file.getline(buffer, BUFFER_SIZE);

      // getline sets failbit without eofbit when the line does not fit in the
      // buffer (or the stream broke); stop rather than parse a truncated line
      // and silently drop the rest of the file.
      if(file.fail() && !file.eof())
        throw runtime_error("Unable to read line - it is too long, or an I/O error occurred");

      switch(*buffer) {
        // blank line
        case '\0':
          break;

        // comments start with percent
        case '%':
          break;

        // transitioning states, or adding a new feature
        case '@':
          switch(state) {
            case relation:
              if(!matches(buffer, RELATION_PREFIX, RELATION_PREFIX_LENGTH))
                throw runtime_error("Expected relation declaration");
              line = buffer + RELATION_PREFIX_LENGTH;
              skip_whitespace(&line);
              data_set->name = line;
              state = attributes;
              break;

            case attributes:
              // check if this is an attribute declaration
              if(matches(buffer, ATTRIBUTE_PREFIX, ATTRIBUTE_PREFIX_LENGTH)) {
                line = buffer + ATTRIBUTE_PREFIX_LENGTH;
                skip_whitespace(&line);

                // extract the attribute's name
                name = tokenise_attribute_name(&line);
                if(!*line)
                  throw runtime_error("Unexpected end of line");

                // add a numeric attribute
                if(matches(line, NUMERIC_TYPE, NUMERIC_TYPE_LENGTH)) {
                  data_set->new_numeric_feature(name);
                  numeric_feature.push_back(true);

                // add a nominal attribute
                } else if(*line == '{') {
                  feature = data_set->new_nominal_feature(name);
                  numeric_feature.push_back(false);
                  line++;

                  while(*line) {
                    value_str = tokenise_nominal_value(&line);
                    feature->add_value(value_str);
                  }

                // other attribute types aren't supported
                } else {
                  throw runtime_error("Unknown attribute type - only numeric and nominal attributes are supported");
                }

              // otherwise could be the start of the data section
              } else {
                if(matches(buffer, DATA_PREFIX, DATA_PREFIX_LENGTH))
                  state = data;
                else
                  throw runtime_error("Expected attribute or data declaration");
              }
              break;

            case data:
              throw runtime_error("Unexpected declaration line, currently in data section");
          }
          break;

        // adding data
        default:
          if(state != data)
            throw runtime_error("Expected data section");

          example = data_set->new_example();
          value_index = 0;
          line = buffer;

          while(*line) {
            // A row with more values than declared attributes would index
            // past the end of numeric_feature.
            if(value_index >= static_cast<int>(numeric_feature.size()))
              throw runtime_error("Data row has more values than declared attributes");

            value = tokenise_example_value(&line);
            if(numeric_feature[value_index]) {
              example->set_value(value_index, atof(value));
            } else {
              value_str = value;
              feature = (DataSet::NominalFeature *)data_set->features[value_index];
              example->set_value(value_index, feature->indexes[value_str]);
            }
            value_index++;
          }
      }
    }

    // NOTE(review): when no attributes were declared, features.size() - 1
    // wraps around (if size() is unsigned) - confirm how set_category_index
    // treats that value.
    data_set->set_category_index(data_set->features.size() - 1);
  } catch(...) {
    // Do not leak the partially built data set on a parse error.
    delete data_set;
    throw;
  }

  file.close();
  return data_set;
}
|
|
196
|
+
|
|
197
|
+
// Writing ARFF files is not implemented: this is an empty stub that ignores
// `data_set` and produces no output.
void Storage::ARFF::write(DataSet::DataSet *data_set) {
  (void)data_set;
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#ifndef __arff_h__
|
|
2
|
+
#define __arff_h__
|
|
3
|
+
#include "storage/storage.h"
|
|
4
|
+
#include "data_set/dense/dense_data_set.h"
|
|
5
|
+
#include <algorithm>
|
|
6
|
+
#include <cctype>
|
|
7
|
+
#include <string>
|
|
8
|
+
using namespace std;
|
|
9
|
+
|
|
10
|
+
namespace Storage {
|
|
11
|
+
class ARFF : public Storage {
|
|
12
|
+
typedef enum {
|
|
13
|
+
relation,
|
|
14
|
+
attributes,
|
|
15
|
+
data
|
|
16
|
+
} State;
|
|
17
|
+
|
|
18
|
+
public:
|
|
19
|
+
string path;
|
|
20
|
+
ARFF(string path) : path(path) {}
|
|
21
|
+
DataSet::DataSet *read();
|
|
22
|
+
void write(DataSet::DataSet *data_set);
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
#endif
|