RubyGems - thera - Versions diffs - 0.0.1 - Mend

thera 0.0.1

Files changed (89) hide show

data/.document +5 -0
data/.gitignore +56 -0
data/Gemfile +2 -0
data/Gemfile.lock +20 -0
data/LICENSE.txt +1 -0
data/README.rdoc +8 -0
data/Rakefile +1 -0
data/ext/Makefile +225 -0
data/ext/extconf.rb +29 -0
data/ext/quarry/quarry_toolkit.cpp +148 -0
data/lib/quarry/Makefile.linux +2 -0
data/lib/quarry/Makefile.osx +6 -0
data/lib/quarry/Makefile.targets +23 -0
data/lib/quarry/obj/.gitkeep +0 -0
data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
data/lib/quarry/src/classifier/classifier.cpp +32 -0
data/lib/quarry/src/classifier/classifier.h +59 -0
data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
data/lib/quarry/src/data_set/data_set.cpp +130 -0
data/lib/quarry/src/data_set/data_set.h +78 -0
data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
data/lib/quarry/src/data_set/example.cpp +10 -0
data/lib/quarry/src/data_set/example.h +23 -0
data/lib/quarry/src/data_set/feature.h +36 -0
data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
data/lib/quarry/src/model/model.cpp +29 -0
data/lib/quarry/src/model/model.h +50 -0
data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
data/lib/quarry/src/quarry.cpp +1 -0
data/lib/quarry/src/quarry.h +29 -0
data/lib/quarry/src/storage/arff.cpp +198 -0
data/lib/quarry/src/storage/arff.h +26 -0
data/lib/quarry/src/storage/binary.cpp +457 -0
data/lib/quarry/src/storage/binary.h +79 -0
data/lib/quarry/src/storage/folders.cpp +98 -0
data/lib/quarry/src/storage/folders.h +25 -0
data/lib/quarry/src/storage/storage.h +19 -0
data/lib/quarry/src/test.cpp +6 -0
data/lib/quarry_rb/classifier/classifier.rb +22 -0
data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
data/lib/quarry_rb/confusion_matrix.rb +58 -0
data/lib/quarry_rb/data_set/data_set.rb +42 -0
data/lib/quarry_rb/data_set/example.rb +33 -0
data/lib/quarry_rb/data_set/feature.rb +28 -0
data/lib/quarry_rb/enumerable_helper.rb +32 -0
data/lib/quarry_rb/model/model.rb +56 -0
data/lib/quarry_rb/storage/arff.rb +11 -0
data/lib/quarry_rb/storage/binary.rb +23 -0
data/lib/quarry_rb/storage/folders.rb +11 -0
data/lib/quarry_rb/text_pipeline.rb +16 -0
data/lib/thera.rb +20 -0
data/test/helper.rb +19 -0
data/test/test_quarry.rb +33 -0
data/thera.gemspec +21 -0
metadata +148 -0

data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp ADDED Viewed

@@ -0,0 +1,82 @@
+#include "stop_words.h"
+#include <tr1/unordered_set>
+#include <iostream>
+using namespace std;
+using namespace tr1;
+static unordered_set<string> *stop_words = NULL;  // shared lookup set; allocated and filled by the first StopWords constructor call
+static int stop_word_count = 586;  // number of entries the constructor reads from stop_word_list
+static string stop_word_list[] = {  // lower-case English stop words, written without apostrophes (e.g. "cant", "thats")
+  "a", "able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj",
+  "after", "afterwards", "again", "against", "ago", "ahead", "all", "allow", "allows", "almost",
+  "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst",
+  "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything",
+  "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "around", "as",
+  "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "b", "back",
+  "backward", "backwards", "be", "became", "because", "become", "becomes", "becoming", "been", "before",
+  "beforehand", "begin", "behind", "being", "believe", "below", "beside", "besides", "best", "better",
+  "between", "beyond", "both", "brief", "but", "by", "c", "came", "can", "cannot",
+  "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com",
+  "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding",
+  "could", "course", "currently", "d", "dare", "definitely", "described", "despite", "did", "different",
+  "directly", "do", "does", "doing", "done", "down", "downwards", "during", "e", "each",
+  "edu", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough",
+  "entirely", "especially", "et", "etc", "even", "ever", "evermore", "every", "everybody", "everyone",
+  "everything", "everywhere", "ex", "exactly", "example", "except", "f", "fairly", "far", "farther",
+  "few", "fewer", "fifth", "first", "five", "followed", "following", "follows", "for", "forever",
+  "former", "formerly", "forth", "forward", "found", "four", "from", "further", "furthermore", "g",
+  "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got",
+  "gotten", "greetings", "h", "had", "half", "happens", "hardly", "has", "have", "having",
+  "he", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
+  "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit",
+  "however", "hundred", "i", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc",
+  "indeed", "indicate", "indicated", "indicates", "inner", "inside", "insofar", "instead", "into", "inward",
+  "is", "it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept",
+  "know", "known", "knows", "l", "last", "lately", "later", "latter", "latterly", "least",
+  "less", "lest", "let", "like", "liked", "likely", "likewise", "little", "look", "looking",
+  "looks", "low", "lower", "ltd", "m", "made", "mainly", "make", "makes", "many",
+  "may", "maybe", "me", "mean", "meantime", "meanwhile", "merely", "might", "mine", "minus",
+  "miss", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "must", "my",
+  "myself", "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs",
+  "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "no",
+  "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "not", "nothing", "notwithstanding",
+  "novel", "now", "nowhere", "o", "obviously", "of", "off", "often", "oh", "ok",
+  "okay", "old", "on", "once", "one", "ones", "only", "onto", "opposite", "or",
+  "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over",
+  "overall", "own", "p", "particular", "particularly", "past", "per", "perhaps", "placed", "please",
+  "plus", "possible", "presumably", "probably", "provided", "provides", "q", "que", "quite", "qv",
+  "r", "rather", "rd", "re", "really", "reasonably", "recent", "recently", "regarding", "regardless",
+  "regards", "relatively", "respectively", "right", "round", "s", "said", "same", "saw", "say",
+  "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
+  "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall",
+  "she", "should", "since", "six", "so", "some", "somebody", "someday", "somehow", "someone",
+  "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying",
+  "still", "sub", "such", "sup", "sure", "t", "take", "taken", "taking", "tell",
+  "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their",
+  "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
+  "theres", "thereupon", "these", "they", "thing", "things", "think", "third", "thirty", "this",
+  "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "till",
+  "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try",
+  "trying", "twice", "two", "u", "un", "under", "underneath", "undoing", "unfortunately", "unless",
+  "unlike", "unlikely", "until", "unto", "up", "upon", "upwards", "us", "use", "used",
+  "useful", "uses", "using", "usually", "v", "value", "various", "versus", "very", "via",
+  "viz", "vs", "w", "want", "wants", "was", "way", "we", "welcome", "well",
+  "went", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas",
+  "whereby", "wherein", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whither",
+  "who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "willing", "wish",
+  "with", "within", "without", "wonder", "would", "x", "y", "yes", "yet", "you",
+  "your", "yours", "yourself", "yourselves", "z", "zero"
+};
+// Builds the shared stop-word set the first time a StopWords selector is
+// constructed; later constructions find the set already present and reuse it.
+Preprocessing::Text::StopWords::StopWords() : TokenSelector() {
+  if(stop_words != NULL)
+    return;
+  // copy the first stop_word_count entries of stop_word_list into the set
+  stop_words = new unordered_set<string>();
+  stop_words->insert(stop_word_list, stop_word_list + stop_word_count);
+}
+// Returns true when the token should be kept (it is not in the stop-word set)
+// and false when it is a stop word.
+// NOTE(review): the token is built from (end - start) + 1 characters, i.e. end
+// is treated as the last character of the token. SimpleTokeniser::tokenise
+// hands the pipeline an end that points at the NUL terminator it wrote; if
+// that pointer reaches this method unchanged, the NUL becomes part of the
+// token and no stop word can match - confirm against TextPipeline::process_token.
+bool Preprocessing::Text::StopWords::select(char *start, char *end) {
+  const string token(start, (end - start) + 1);
+  const bool is_stop_word = stop_words->find(token) != stop_words->end();
+  return !is_stop_word;
+}

data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h ADDED Viewed

@@ -0,0 +1,20 @@
+#ifndef __stop_words_h__
+#define __stop_words_h__
+#include "token_selector.h"
+namespace Preprocessing {
+  namespace Text {
+    // Token selector backed by the stop-word list in stop_words.cpp:
+    // select() returns false for tokens found in that list.
+    class StopWords : public TokenSelector {
+    public:
+      // Value returned by mark(); written as a four-character
+      // multi-character constant.
+      static const uint32_t file_mark = 'stop';
+      StopWords();
+      virtual uint32_t mark() { return file_mark; }
+      virtual bool select(char *start, char *end);
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h ADDED Viewed

@@ -0,0 +1,17 @@
+#ifndef __token_selector_h__
+#define __token_selector_h__
+#include <stdint.h>
+namespace Preprocessing {
+  namespace Text {
+    // Base class for token selectors: objects that decide whether a token
+    // is kept.
+    class TokenSelector {
+    public:
+      // Returns true to keep the token delimited by start and end; the
+      // default implementation keeps every token.
+      virtual bool select(char *start, char *end) { return true; }
+      // Identifier of the concrete selector type; subclasses must provide it.
+      virtual uint32_t mark() = 0;
+      // Virtual so that a selector can be destroyed through a TokenSelector
+      // pointer without undefined behaviour.
+      virtual ~TokenSelector() {}
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp ADDED Viewed

@@ -0,0 +1,29 @@
+#include "preprocessing/text/text_pipeline.h"
+#include "simple_tokeniser.h"
+// Splits text in place into runs of alphanumeric characters. Each run is
+// NUL-terminated by overwriting the character that follows it and is then
+// passed to pipeline->process_token(start, end), where end points at that
+// terminator. The buffer is modified, so it must be writable.
+void Preprocessing::Text::SimpleTokeniser::tokenise(char *text) {
+  char *start = 0;
+  bool intoken = false;
+  bool active  = true;
+  // to simplify the code, the while condition is not while(*text),
+  // because the if(intoken) block needs to run when *text == 0 at
+  // the end of the string.
+  while(active) {
+    // remember whether this was the terminator before it may be overwritten
+    active = (*text != 0);
+    // <cctype> classifiers are undefined for negative values, so bytes
+    // outside the ASCII range must be converted through unsigned char.
+    if(isalnum(static_cast<unsigned char>(*text))) {
+      if(!intoken) {
+        intoken = true;
+        start = text;
+      }
+    } else {
+      if(intoken) {
+        // first non-alphanumeric character after a token: terminate the
+        // token here and hand it to the pipeline
+        intoken = false;
+        *text = 0;
+        pipeline->process_token(start, text);
+      }
+    }
+    text++;
+  }
+}

data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h ADDED Viewed

@@ -0,0 +1,20 @@
+#ifndef __simple_tokeniser_h__
+#define __simple_tokeniser_h__
+#include "tokeniser.h"
+namespace Preprocessing {
+  namespace Text {
+    // Tokeniser whose tokenise() is implemented in simple_tokeniser.cpp.
+    class SimpleTokeniser : public Tokeniser {
+    public:
+      // Value returned by mark(); written as a four-character
+      // multi-character constant.
+      static const uint32_t file_mark = 'simt';
+      // Forwards the owning pipeline to the Tokeniser base class.
+      SimpleTokeniser(TextPipeline *pipeline) : Tokeniser(pipeline) {}
+      virtual uint32_t mark() { return file_mark; }
+      virtual void tokenise(char *text);
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h ADDED Viewed

@@ -0,0 +1,19 @@
+#ifndef __tokeniser_h__
+#define __tokeniser_h__
+// uint32_t is used below; include it here so this header does not depend on
+// what was included before it.
+#include <stdint.h>
+namespace Preprocessing {
+  namespace Text {
+    class TextPipeline;
+    // Base class for tokenisers. A tokeniser keeps a pointer to the
+    // TextPipeline it was constructed with.
+    class Tokeniser {
+    public:
+      // Pipeline supplied at construction; stored as a plain pointer.
+      TextPipeline *pipeline;
+      Tokeniser(TextPipeline *pipeline) : pipeline(pipeline) {}
+      // Tokenises text; the default implementation does nothing.
+      virtual void tokenise(char *text) {}
+      // Identifier of the concrete tokeniser type; subclasses must provide it.
+      virtual uint32_t mark() = 0;
+      // Virtual so that a tokeniser can be destroyed through a Tokeniser
+      // pointer without undefined behaviour.
+      virtual ~Tokeniser() {}
+    };
+  }
+}
+#endif

data/lib/quarry/src/quarry.cpp ADDED Viewed

	@@ -0,0 +1 @@
1	+ void mkmf_marker() {;}  // empty function; quarry.h describes it as the marker that shows mkmf this library exists

data/lib/quarry/src/quarry.h ADDED Viewed

@@ -0,0 +1,29 @@
+#include "data_set/data_set.h"
+#include "data_set/dense/dense_data_set.h"
+#include "data_set/sparse/sparse_data_set.h"
+#include "data_set/example.h"
+#include "data_set/dense/dense_example.h"
+#include "data_set/sparse/sparse_example.h"
+#include "data_set/feature.h"
+#include "data_set/features/nominal_feature.h"
+#include "data_set/features/numeric_feature.h"
+#include "classifier/classifier.h"
+#include "classifier/naive_bayes/naive_bayes_classifier.h"
+#include "metrics/confusion_matrix.h"
+#include "preprocessing/text/text_pipeline.h"
+#include "preprocessing/examples/example_preprocessor.h"
+#include "preprocessing/examples/weights/binary_weight.h"
+#include "preprocessing/examples/weights/local_weight.h"
+#include "model/model.h"
+#include "storage/storage.h"
+#include "storage/arff.h"
+#include "storage/folders.h"
+#include "storage/binary.h"
+// Declaration of the empty marker function defined in quarry.cpp; it exists to show mkmf that this library is present.
+void mkmf_marker();

data/lib/quarry/src/storage/arff.cpp ADDED Viewed

@@ -0,0 +1,198 @@
+#include "arff.h"
+#include <stdexcept>
+#include <fstream>
+#include <iostream>
+#include <stdlib.h>
+#include <vector>
+using namespace std;
+// Size of the line buffer used by Storage::ARFF::read.
+#define BUFFER_SIZE             (10 * 1024)
+// Keywords recognised at the start of a declaration line, with their lengths.
+// They are written in lower case because matches() lower-cases the input only.
+#define RELATION_PREFIX         "@relation "
+#define RELATION_PREFIX_LENGTH  10
+#define ATTRIBUTE_PREFIX        "@attribute "
+#define ATTRIBUTE_PREFIX_LENGTH  11
+#define DATA_PREFIX             "@data"
+#define DATA_PREFIX_LENGTH      5
+#define NUMERIC_TYPE            "numeric"
+#define NUMERIC_TYPE_LENGTH     7
+// All macros below operate on a variable named str of type char **.
+// isspace() is undefined for negative char values, so always classify the
+// character through unsigned char.
+#define arff_isspace(c)  (isspace(static_cast<unsigned char>(c)))
+// skip whitespace and other delimiters; every condition used with this macro
+// is false for the NUL terminator, so the scan stops at the end of the string
+#define skip_delimiters(conditions) \
+  while(conditions)\
+    (*str)++;
+#define skip_only_whitespace()                      skip_delimiters((arff_isspace(**str)))
+#define skip_whitespace_and_nominal_list_markers()  skip_delimiters((arff_isspace(**str) || (**str == ',') || (**str == '}')))
+#define skip_whitespace_and_example_list_markers()  skip_delimiters((arff_isspace(**str) || (**str == ',') || (**str == '}')))
+// move the character position until the end of a token, then overwrite the
+// terminating character with NUL and step past it (unless the string ended)
+#define tokenise_while(conditions) \
+  while(**str && (conditions))\
+    (*str)++;\
+  if(**str) {\
+    **str = 0;\
+    (*str)++;\
+  }
+#define tokenise_space()    tokenise_while(!arff_isspace(**str))
+// A quoted token ends at the first quote that is NOT preceded by a backslash,
+// so scanning continues over every other character and over escaped quotes.
+// Reading *(*str - 1) is safe: the opening quote always precedes the token.
+#define tokenise_quote()    tokenise_while((**str != '"') || (*(*str - 1) == '\\'))
+#define tokenise_value()    tokenise_while(!arff_isspace(**str) && (**str != ','))
+#define tokenise_nominal()  tokenise_while(!arff_isspace(**str) && (**str != ',') && (**str != '}'))
+// determine whether the token is quote delimited or otherwise,
+// and cleanup whitespace etc. at the end; expands to a complete function
+// body that returns a pointer to the start of the NUL-terminated token
+#define tokenise(tokeniser, skipper) \
+  char *start;\
+  if(**str == '"') {\
+    start = ++*str;\
+    tokenise_quote();\
+  } else {\
+    start = *str;\
+    tokeniser();\
+  }\
+  skipper();\
+  return start;
+// Extracts an attribute name: ends at whitespace (or at the closing quote when
+// quoted), then skips the whitespace that follows. *str is advanced in place.
+inline char *tokenise_attribute_name(char **str) {
+  tokenise(tokenise_space, skip_only_whitespace);
+}
+// Extracts one value of a nominal list: ends at whitespace, ',' or '}', then
+// skips any run of those delimiters. *str is advanced in place.
+inline char *tokenise_nominal_value(char **str) {
+  tokenise(tokenise_nominal, skip_whitespace_and_nominal_list_markers);
+}
+// Extracts one value of a data row: ends at whitespace or ',', then skips any
+// run of whitespace, ',' and '}'. *str is advanced in place.
+inline char *tokenise_example_value(char **str) {
+  tokenise(tokenise_value, skip_whitespace_and_example_list_markers);
+}
+// Case-insensitive prefix test: returns true when the first length characters
+// of buffer, lower-cased, equal the first length characters of compare_to.
+// compare_to must already be lower case and hold at least length characters
+// before its terminator; buffer may be shorter, because its NUL terminator
+// then fails the comparison before anything past it is read. A length of
+// zero or less matches trivially.
+inline bool matches(const char *buffer, const char *compare_to, int length) {
+  while(length-- > 0) {
+    // tolower() is undefined for negative char values, so compare both
+    // sides as unsigned char
+    if(tolower(static_cast<unsigned char>(*buffer++)) != static_cast<unsigned char>(*compare_to++))
+      return false;
+  }
+  return true;
+}
+// Advances *str past any leading whitespace; stops at the first
+// non-whitespace character or at the NUL terminator.
+inline void skip_whitespace(char **str) {
+  // isspace() is undefined for negative char values, so convert first
+  while(isspace(static_cast<unsigned char>(**str)))
+    ++*str;
+}
+// Reads the ARFF file at path into a DenseDataSet allocated with new and
+// returns it. Lines are processed through three states: the @relation line,
+// then @attribute lines (numeric, or a {...} list of nominal values), then
+// the rows that follow @data. Blank lines and lines starting with '%' are
+// skipped. After the last line, the final declared feature is set as the
+// category index.
+//
+// Throws std::runtime_error when the file cannot be opened, when a line does
+// not fit in the BUFFER_SIZE line buffer, when a declaration is missing,
+// malformed or out of order, when an attribute type is unsupported, or when a
+// data row holds more values than there are declared attributes.
+// NOTE(review): the data set allocated here is not released when an exception
+// is thrown, and a file that declares no attributes still reaches
+// features.size() - 1 below - both need a decision from the owner of DataSet.
+DataSet::DataSet *Storage::ARFF::read() {
+  DataSet::NominalFeature *feature = NULL;
+  DataSet::DenseExample *example = NULL;
+  State state = relation;
+  // one entry per declared attribute: true = numeric, false = nominal
+  vector<bool> numeric_feature;
+  char buffer[BUFFER_SIZE];
+  char *line, *name, *value;
+  string value_str;
+  int value_index;
+  fstream file;
+  // open first, so that a missing file fails before anything is allocated
+  file.open(path.c_str(), fstream::in);
+  if(!file.is_open())
+    throw runtime_error("Unable to open ARFF file: " + path);
+  DataSet::DenseDataSet *data_set = new DataSet::DenseDataSet();
+  while(file.good()) {
+    file.getline(buffer, BUFFER_SIZE);
+    // getline() sets failbit without eofbit when the line did not fit in the
+    // buffer; stopping silently there would drop the rest of the file
+    if(file.fail() && !file.eof())
+      throw runtime_error("Line too long or read error");
+    switch(*buffer) {
+      // blank line
+      case '\0':
+        break;
+      // comments start with percent
+      case '%':
+        break;
+      // transitioning states, or adding a new feature
+      case '@':
+        switch(state) {
+          case relation:
+            if(!matches(buffer, RELATION_PREFIX, RELATION_PREFIX_LENGTH))
+              throw runtime_error("Expected relation declaration");
+            line = buffer + RELATION_PREFIX_LENGTH;
+            skip_whitespace(&line);
+            data_set->name = line;
+            state = attributes;
+            break;
+          case attributes:
+            // check if this is an attribute declaration
+            if(matches(buffer, ATTRIBUTE_PREFIX, ATTRIBUTE_PREFIX_LENGTH)) {
+              line = buffer + ATTRIBUTE_PREFIX_LENGTH;
+              skip_whitespace(&line);
+              // extract the attribute's name
+              name = tokenise_attribute_name(&line);
+              if(!*line)
+                throw runtime_error("Unexpected end of line");
+              // add a numeric attribute
+              if(matches(line, NUMERIC_TYPE, NUMERIC_TYPE_LENGTH)) {
+                data_set->new_numeric_feature(name);
+                numeric_feature.push_back(true);
+              // add a nominal attribute
+              } else if(*line == '{') {
+                feature = data_set->new_nominal_feature(name);
+                numeric_feature.push_back(false);
+                line++;
+                while(*line) {
+                  value_str = tokenise_nominal_value(&line);
+                  feature->add_value(value_str);
+                }
+              // other attribute types aren't supported
+              } else {
+                throw runtime_error("Unknown attribute type - only numeric and nominal attributes are supported");
+              }
+            // otherwise could be the start of the data section
+            } else {
+              if(matches(buffer, DATA_PREFIX, DATA_PREFIX_LENGTH))
+                state = data;
+              else
+                throw runtime_error("Expected attribute or data declaration");
+            }
+            break;
+          case data:
+            throw runtime_error("Unexpected declaration line, currently in data section");
+        }
+        break;
+      // adding data
+      default:
+        if(state != data)
+          throw runtime_error("Expected data section");
+        example = data_set->new_example();
+        value_index = 0;
+        line = buffer;
+        while(*line) {
+          value = tokenise_example_value(&line);
+          // a row with more values than declared attributes would otherwise
+          // index past the end of numeric_feature
+          if(static_cast<size_t>(value_index) >= numeric_feature.size())
+            throw runtime_error("Too many values in data row");
+          if(numeric_feature[value_index]) {
+            example->set_value(value_index, atof(value));
+          } else {
+            // nominal values are stored as the index recorded for the value
+            value_str = value;
+            feature = (DataSet::NominalFeature *)data_set->features[value_index];
+            example->set_value(value_index, feature->indexes[value_str]);
+          }
+          value_index++;
+        }
+    }
+  }
+  // the last declared feature is used as the category
+  data_set->set_category_index(data_set->features.size() - 1);
+  file.close();
+  return data_set;
+}
+void Storage::ARFF::write(DataSet::DataSet *data_set) {  // empty stub: nothing is written and data_set is ignored
+}

data/lib/quarry/src/storage/arff.h ADDED Viewed

@@ -0,0 +1,26 @@
+#ifndef __arff_h__
+#define __arff_h__
+#include "storage/storage.h"
+#include "data_set/dense/dense_data_set.h"
+#include <algorithm>
+#include <cctype>
+#include <string>
+using namespace std;
+namespace Storage {
+  // Storage implementation for ARFF files; read() and write() are defined in
+  // arff.cpp.
+  class ARFF : public Storage {
+    // Parser states used by read(): the relation line, then the attribute
+    // declarations, then the data rows.
+    enum State {
+      relation,
+      attributes,
+      data
+    };
+  public:
+    // Path of the file that read() opens.
+    string  path;
+    ARFF(string path) : path(path) {}
+    DataSet::DataSet *read();
+    void write(DataSet::DataSet *data_set);
+  };
+}
+#endif