RubyGems - thera - Versions diffs - 0.0.1 - Mend

thera 0.0.1

Files changed (89) hide show

data/.document +5 -0
data/.gitignore +56 -0
data/Gemfile +2 -0
data/Gemfile.lock +20 -0
data/LICENSE.txt +1 -0
data/README.rdoc +8 -0
data/Rakefile +1 -0
data/ext/Makefile +225 -0
data/ext/extconf.rb +29 -0
data/ext/quarry/quarry_toolkit.cpp +148 -0
data/lib/quarry/Makefile.linux +2 -0
data/lib/quarry/Makefile.osx +6 -0
data/lib/quarry/Makefile.targets +23 -0
data/lib/quarry/obj/.gitkeep +0 -0
data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
data/lib/quarry/src/classifier/classifier.cpp +32 -0
data/lib/quarry/src/classifier/classifier.h +59 -0
data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
data/lib/quarry/src/data_set/data_set.cpp +130 -0
data/lib/quarry/src/data_set/data_set.h +78 -0
data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
data/lib/quarry/src/data_set/example.cpp +10 -0
data/lib/quarry/src/data_set/example.h +23 -0
data/lib/quarry/src/data_set/feature.h +36 -0
data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
data/lib/quarry/src/model/model.cpp +29 -0
data/lib/quarry/src/model/model.h +50 -0
data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
data/lib/quarry/src/quarry.cpp +1 -0
data/lib/quarry/src/quarry.h +29 -0
data/lib/quarry/src/storage/arff.cpp +198 -0
data/lib/quarry/src/storage/arff.h +26 -0
data/lib/quarry/src/storage/binary.cpp +457 -0
data/lib/quarry/src/storage/binary.h +79 -0
data/lib/quarry/src/storage/folders.cpp +98 -0
data/lib/quarry/src/storage/folders.h +25 -0
data/lib/quarry/src/storage/storage.h +19 -0
data/lib/quarry/src/test.cpp +6 -0
data/lib/quarry_rb/classifier/classifier.rb +22 -0
data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
data/lib/quarry_rb/confusion_matrix.rb +58 -0
data/lib/quarry_rb/data_set/data_set.rb +42 -0
data/lib/quarry_rb/data_set/example.rb +33 -0
data/lib/quarry_rb/data_set/feature.rb +28 -0
data/lib/quarry_rb/enumerable_helper.rb +32 -0
data/lib/quarry_rb/model/model.rb +56 -0
data/lib/quarry_rb/storage/arff.rb +11 -0
data/lib/quarry_rb/storage/binary.rb +23 -0
data/lib/quarry_rb/storage/folders.rb +11 -0
data/lib/quarry_rb/text_pipeline.rb +16 -0
data/lib/thera.rb +20 -0
data/test/helper.rb +19 -0
data/test/test_quarry.rb +33 -0
data/thera.gemspec +21 -0
metadata +148 -0

data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h ADDED Viewed

@@ -0,0 +1,20 @@
+#ifndef __binary_weight_h__
+#define __binary_weight_h__
+#include "preprocessing/examples/example_preprocessor.h"
+namespace Preprocessing {
+  namespace Examples {
+    class BinaryWeight : public ExamplePreprocessor {
+      void process(DataSet::Example *example) {
+        for(int i = 0; i < example->size; i++) {
+          if(example->get_value(i) != 0.0)
+            example->set_value(i, 1.0);
+        }
+      }
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/examples/weights/local_weight.h ADDED Viewed

@@ -0,0 +1,29 @@
+#ifndef __local_weight_h__
+#define __local_weight_h__
+#include "preprocessing/examples/example_preprocessor.h"
+namespace Preprocessing {
+  namespace Examples {
+    class LocalWeight : public ExamplePreprocessor {
+      void process(DataSet::Example *example) {
+        int max_value = 0;
+        double value;
+        for(int i = 0; i < example->size; i++) {
+          value = example->get_value(i);
+          if(value > max_value)
+            max_value = value;
+        }
+        for(int i = 0; i < example->size; i++) {
+          value = example->get_value(i);
+          example->set_value(i, value / max_value);
+        }
+      }
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h ADDED Viewed

@@ -0,0 +1,19 @@
+#ifndef __example_generator_h__
+#define __example_generator_h__
+#include "data_set/sparse/sparse_data_set.h"
+#include "data_set/sparse/sparse_example.h"
+namespace Preprocessing {
+  namespace Text {
+    class ExampleGenerator {
+    public:
+      ExampleGenerator() {}
+      virtual DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) { return NULL; }
+      virtual uint32_t mark() = 0;
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h ADDED Viewed

@@ -0,0 +1,59 @@
+#ifndef __token_counter_h__
+#define __token_counter_h__
+#include "example_generator.h"
+#include <map>
+namespace Preprocessing {
+  namespace Text {
+    class TokenCounter : public ExampleGenerator {
+    public:
+      static const uint32_t file_mark = 'tcou';
+      uint32_t mark() { return file_mark; }
+      typedef enum {
+        Count,
+        Local,
+        Binary
+      } TokenCounterWeight;
+      map<string, int> token_counts;
+      TokenCounterWeight weight;
+      TokenCounter(TokenCounterWeight weight = Count) : ExampleGenerator(), token_counts(), weight(weight) {}
+      DataSet::SparseExample *generate(DataSet::SparseDataSet *data_set, vector<char *> *tokens) {
+        int max_count = 0, count = 0;
+        double value = 0.0;
+        token_counts.clear();
+        string token;
+        // count the number of occurrences of each token
+        for(vector<char *>::iterator tokens_it = tokens->begin(); tokens_it != tokens->end(); tokens_it++) {
+          token = string(*tokens_it);
+          count = ++token_counts[token];
+          if(count > max_count)
+            max_count = count;
+        }
+        // construct the example
+        DataSet::SparseExample *example = data_set->new_example(token_counts.size());
+        for(map<string, int>::iterator token_counts_it = token_counts.begin(); token_counts_it != token_counts.end(); token_counts_it++) {
+          value = token_counts_it->second;
+          if(weight == Local)
+            value = value / max_count;
+          else if(weight == Binary)
+            value = 1;
+          example->set_value(data_set->get_or_create_numeric_feature_by_name(token_counts_it->first)->index, value);
+        }
+        return example;
+      }
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h ADDED Viewed

@@ -0,0 +1,26 @@
+#ifndef __downcase_h__
+#define __dowmcase_h__
+#include "inplace_processor.h"
+#include <cctype>
+namespace Preprocessing {
+  namespace Text {
+    class Downcase : public InplaceProcessor {
+    public:
+      static const uint32_t file_mark = 'down';
+      uint32_t mark() { return file_mark; }
+      char *process(char *start, char *end) {
+        while(start != end) {
+          *start = tolower(*start);
+          start++;
+        }
+        return end;
+      }
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h ADDED Viewed

@@ -0,0 +1,17 @@
+#ifndef __inplace_processor_h__
+#define __inplace_processor_h__
+namespace Preprocessing {
+  namespace Text {
+    class InplaceProcessor {
+    public:
+      InplaceProcessor() {}
+      virtual char *process(char *start, char *end) { return end; }
+      virtual uint32_t mark() = 0;
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h ADDED Viewed

@@ -0,0 +1,44 @@
+#ifndef __porter_stemmer_h__
+#define __porter_stemmer_h__
+#include "inplace_processor.h"
+// from porter_stemmer_original.c
+extern "C" {
+  struct stemmer;
+  extern struct stemmer * create_stemmer(void);
+  extern void free_stemmer(struct stemmer * z);
+  extern int stem(struct stemmer * z, char * b, int k);
+}
+namespace Preprocessing {
+  namespace Text {
+    class PorterStemmer : public InplaceProcessor {
+    public:
+      static const uint32_t file_mark = 'port';
+      uint32_t mark() { return file_mark; }
+      struct stemmer *stemm;
+      PorterStemmer() : InplaceProcessor() {
+        stemm = create_stemmer();
+      }
+      ~PorterStemmer() {
+        free_stemmer(stemm);
+      }
+      char *process(char *start, char *end) {
+        int length = end - start;
+        int new_length = stem(stemm, start, end - start - 1);
+        for(int i = new_length + 1; i <= length; i++)
+          start[i] = 0;
+        return start + new_length;
+      }
+    };
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp ADDED Viewed

@@ -0,0 +1,375 @@
+/* This is the Porter stemming algorithm, coded up as thread-safe ANSI C
+   by the author.
+   It may be be regarded as cononical, in that it follows the algorithm
+   presented in
+   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+   no. 3, pp 130-137,
+   only differing from it at the points maked --DEPARTURE-- below.
+   See also http://www.tartarus.org/~martin/PorterStemmer
+   The algorithm as described in the paper could be exactly replicated
+   by adjusting the points of DEPARTURE, but this is barely necessary,
+   because (a) the points of DEPARTURE are definitely improvements, and
+   (b) no encoding of the Porter stemmer I have seen is anything like
+   as exact as this version, even with the points of DEPARTURE!
+   You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
+   'stem' takes a list of inputs and sends the stemmed equivalent to
+   stdout.
+   The algorithm as encoded here is particularly fast.
+   Release 2 (the more old-fashioned, non-thread-safe version may be
+   regarded as release 1.)
+*/
+#include <stdlib.h>  /* for malloc, free */
+#include <string.h>  /* for memcmp, memmove */
+#include <iostream>
+/* The main part of the stemming algorithm starts here.
+*/
+#define TRUE 1
+#define FALSE 0
+extern "C" {
+/* stemmer is a structure for a few local bits of data,
+*/
+struct stemmer {
+   char * b;       /* buffer for word to be stemmed */
+   int k;          /* offset to the end of the string */
+   int j;          /* a general offset into the string */
+};
+/* Member b is a buffer holding a word to be stemmed. The letters are in
+   b[0], b[1] ... ending at b[z->k]. Member k is readjusted downwards as
+   the stemming progresses. Zero termination is not in fact used in the
+   algorithm.
+   Note that only lower case sequences are stemmed. Forcing to lower case
+   should be done before stem(...) is called.
+   Typical usage is:
+       struct stemmer * z = create_stemmer();
+       char b[] = "pencils";
+       int res = stem(z, b, 6);
+           /- stem the 7 characters of b[0] to b[6]. The result, res,
+              will be 5 (the 's' is removed). -/
+       free_stemmer(z);
+*/
+extern struct stemmer * create_stemmer(void)
+{
+    return (struct stemmer *) malloc(sizeof(struct stemmer));
+    /* assume malloc succeeds */
+}
+extern void free_stemmer(struct stemmer * z)
+{
+    free(z);
+}
+/* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here
+   and below we drop 'z->' in comments.
+*/
+static int cons(struct stemmer * z, int i)
+{  switch (z->b[i])
+   {  case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
+      case 'y': return (i == 0) ? TRUE : !cons(z, i - 1);
+      default: return TRUE;
+   }
+}
+/* m(z) measures the number of consonant sequences between 0 and j. if c is
+   a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+   presence,
+      <c><v>       gives 0
+      <c>vc<v>     gives 1
+      <c>vcvc<v>   gives 2
+      <c>vcvcvc<v> gives 3
+      ....
+*/
+static int m(struct stemmer * z)
+{  int n = 0;
+   int i = 0;
+   int j = z->j;
+   while(TRUE)
+   {  if (i > j) return n;
+      if (! cons(z, i)) break; i++;
+   }
+   i++;
+   while(TRUE)
+   {  while(TRUE)
+      {  if (i > j) return n;
+            if (cons(z, i)) break;
+            i++;
+      }
+      i++;
+      n++;
+      while(TRUE)
+      {  if (i > j) return n;
+         if (! cons(z, i)) break;
+         i++;
+      }
+      i++;
+   }
+}
+/* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */
+static int vowelinstem(struct stemmer * z)
+{
+   int j = z->j;
+   int i; for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE;
+   return FALSE;
+}
+/* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */
+static int doublec(struct stemmer * z, int j)
+{
+   char * b = z->b;
+   if (j < 1) return FALSE;
+   if (b[j] != b[j - 1]) return FALSE;
+   return cons(z, j);
+}
+/* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
+   and also if the second c is not w,x or y. this is used when trying to
+   restore an e at the end of a short word. e.g.
+      cav(e), lov(e), hop(e), crim(e), but
+      snow, box, tray.
+*/
+static int cvc(struct stemmer * z, int i)
+{  if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE;
+   {  int ch = z->b[i];
+      if (ch  == 'w' || ch == 'x' || ch == 'y') return FALSE;
+   }
+   return TRUE;
+}
+/* ends(z, s) is TRUE <=> 0,...k ends with the string s. */
+static int ends(struct stemmer * z, char * s)
+{  int length = s[0];
+   char * b = z->b;
+   int k = z->k;
+   if (s[length] != b[k]) return FALSE; /* tiny speed-up */
+   if (length > k + 1) return FALSE;
+   if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE;
+   z->j = k-length;
+   return TRUE;
+}
+/* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting
+   k. */
+static void setto(struct stemmer * z, char * s)
+{  int length = s[0];
+   int j = z->j;
+   memmove(z->b + j + 1, s + 1, length);
+   z->k = j+length;
+}
+/* r(z, s) is used further down. */
+static void r(struct stemmer * z, char * s) { if (m(z) > 0) setto(z, s); }
+/* step1ab(z) gets rid of plurals and -ed or -ing. e.g.
+       caresses  ->  caress
+       ponies    ->  poni
+       ties      ->  ti
+       caress    ->  caress
+       cats      ->  cat
+       feed      ->  feed
+       agreed    ->  agree
+       disabled  ->  disable
+       matting   ->  mat
+       mating    ->  mate
+       meeting   ->  meet
+       milling   ->  mill
+       messing   ->  mess
+       meetings  ->  meet
+*/
+static void step1ab(struct stemmer * z)
+{
+   char * b = z->b;
+   if (b[z->k] == 's')
+   {  if (ends(z, "\04" "sses")) z->k -= 2; else
+      if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else
+      if (b[z->k - 1] != 's') z->k--;
+   }
+   if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else
+   if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
+   {  z->k = z->j;
+      if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
+      if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
+      if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
+      if (doublec(z, z->k))
+      {  z->k--;
+         {  int ch = b[z->k];
+            if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
+         }
+      }
+      else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e");
+   }
+}
+/* step1c(z) turns terminal y to i when there is another vowel in the stem. */
+static void step1c(struct stemmer * z)
+{
+   if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i';
+}
+/* step2(z) maps double suffices to single ones. so -ization ( = -ize plus
+   -ation) maps to -ize etc. note that the string before the suffix must give
+   m(z) > 0. */
+static void step2(struct stemmer * z) { switch (z->b[z->k-1])
+{
+   case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; }
+             if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; }
+             break;
+   case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; }
+             if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; }
+             break;
+   case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; }
+             break;
+   case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/
+ /* To match the published algorithm, replace this line with
+    case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */
+             if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; }
+             if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; }
+             if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; }
+             if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; }
+             break;
+   case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; }
+             if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; }
+             if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; }
+             break;
+   case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; }
+             if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; }
+             if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; }
+             if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; }
+             break;
+   case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; }
+             if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; }
+             if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; }
+             break;
+   case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/
+ /* To match the published algorithm, delete this line */
+} }
+/* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. */
+static void step3(struct stemmer * z) { switch (z->b[z->k])
+{
+   case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; }
+             if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; }
+             if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; }
+             break;
+   case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; }
+             break;
+   case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; }
+             if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; }
+             break;
+   case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; }
+             break;
+} }
+/* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */
+static void step4(struct stemmer * z)
+{  switch (z->b[z->k-1])
+   {  case 'a': if (ends(z, "\02" "al")) break; return;
+      case 'c': if (ends(z, "\04" "ance")) break;
+                if (ends(z, "\04" "ence")) break; return;
+      case 'e': if (ends(z, "\02" "er")) break; return;
+      case 'i': if (ends(z, "\02" "ic")) break; return;
+      case 'l': if (ends(z, "\04" "able")) break;
+                if (ends(z, "\04" "ible")) break; return;
+      case 'n': if (ends(z, "\03" "ant")) break;
+                if (ends(z, "\05" "ement")) break;
+                if (ends(z, "\04" "ment")) break;
+                if (ends(z, "\03" "ent")) break; return;
+      case 'o': if (ends(z, "\03" "ion") && (z->b[z->j] == 's' || z->b[z->j] == 't')) break;
+                if (ends(z, "\02" "ou")) break; return;
+                /* takes care of -ous */
+      case 's': if (ends(z, "\03" "ism")) break; return;
+      case 't': if (ends(z, "\03" "ate")) break;
+                if (ends(z, "\03" "iti")) break; return;
+      case 'u': if (ends(z, "\03" "ous")) break; return;
+      case 'v': if (ends(z, "\03" "ive")) break; return;
+      case 'z': if (ends(z, "\03" "ize")) break; return;
+      default: return;
+   }
+   if (m(z) > 1) z->k = z->j;
+}
+/* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
+   m(z) > 1. */
+static void step5(struct stemmer * z)
+{
+   char * b = z->b;
+   z->j = z->k;
+   if (b[z->k] == 'e')
+   {  int a = m(z);
+      if ((a > 1) || (a == 1 && !cvc(z, z->k - 1))) z->k--;
+   }
+   if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
+}
+/* In stem(z, b, k), b is a char pointer, and the string to be stemmed is
+   from b[0] to b[k] inclusive.  Possibly b[k+1] == '\0', but it is not
+   important. The stemmer adjusts the characters b[0] ... b[k] and returns
+   the new end-point of the string, k'. Stemming never increases word
+   length, so 0 <= k' <= k.
+*/
+extern int stem(struct stemmer * z, char * b, int k)
+{
+   if (k <= 1) return k; /*-DEPARTURE-*/
+   z->b = b; z->k = k; /* copy the parameters into z */
+   /* With this line, strings of length 1 or 2 don't go through the
+      stemming process, although no mention is made of this in the
+      published algorithm. Remove the line to match the published
+      algorithm. */
+   step1ab(z); step1c(z); step2(z); step3(z); step4(z); step5(z);
+   return z->k;
+}
+} // extern "C"

data/lib/quarry/src/preprocessing/text/text_pipeline.cpp ADDED Viewed

@@ -0,0 +1,29 @@
+#include "text_pipeline.h"
+#include <iostream>
+DataSet::SparseExample *Preprocessing::Text::TextPipeline::process_text(DataSet::SparseDataSet *data_set, char *text) {
+  tokens.clear();
+  tokeniser->tokenise(text);
+  return generator->generate(data_set, &tokens);
+}
+void Preprocessing::Text::TextPipeline::process_token(char *start, char *end) {
+  for(int i = 0; i < processors.size(); i++)
+    processors[i]->process(start, end);
+  for(int i = 0; i < selectors.size(); i++)
+    if(!selectors[i]->select(start, end))
+      return;
+  tokens.push_back(start);
+}
+Preprocessing::Text::TextPipeline *Preprocessing::Text::StandardPipeline() {
+  TextPipeline *pipeline = new TextPipeline();
+  pipeline->tokeniser = new SimpleTokeniser(pipeline);
+  pipeline->processors.push_back(new Downcase());
+  pipeline->processors.push_back(new PorterStemmer());
+  pipeline->selectors.push_back(new StopWords());
+  pipeline->generator = new TokenCounter(TokenCounter::Local);
+  return pipeline;
+}

data/lib/quarry/src/preprocessing/text/text_pipeline.h ADDED Viewed

@@ -0,0 +1,37 @@
+#ifndef __text_pipeline_h__
+#define __text_pipeline_h__
+#include "data_set/sparse/sparse_example.h"
+#include "example_generator/example_generator.h"
+#include "example_generator/token_counter.h"
+#include "inplace_processor/inplace_processor.h"
+#include "inplace_processor/downcase.h"
+#include "inplace_processor/porter_stemmer.h"
+#include "token_selector/token_selector.h"
+#include "token_selector/stop_words.h"
+#include "token_selector/pos_tag_selector.h"
+#include "tokeniser/tokeniser.h"
+#include "tokeniser/simple_tokeniser.h"
+#include <vector>
+namespace Preprocessing {
+  namespace Text {
+    class Tokeniser;
+    class TextPipeline {
+    public:
+      Tokeniser *tokeniser;
+      vector<InplaceProcessor *> processors;
+      vector<TokenSelector *> selectors;
+      ExampleGenerator *generator;
+      vector<char *> tokens;
+      TextPipeline() : tokeniser(NULL), processors(), selectors(), generator(NULL), tokens() {}
+      DataSet::SparseExample *process_text(DataSet::SparseDataSet *data_set, char *text);
+      void process_token(char *start, char *end);
+    };
+    TextPipeline *StandardPipeline();
+  }
+}
+#endif

data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h ADDED Viewed

@@ -0,0 +1,21 @@
+#ifndef __pos_tag_selector_h__
+#define __pos_tag_selector_h__
+#include "token_selector.h"
+namespace Preprocessing {
+  namespace Text {
+    class POSTagSelector : public TokenSelector {
+    public:
+      static const uint32_t file_mark = 'post';
+      uint32_t mark() { return file_mark; }
+      bool select(char *start, char *end) {
+        return true;
+      }
+    };
+  }
+}
+#endif