ruletagger 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/COPYING +21 -0
  2. data/History.txt +4 -0
  3. data/LICENSE +21 -0
  4. data/License.txt +20 -0
  5. data/Manifest.txt +75 -0
  6. data/PostInstall.txt +7 -0
  7. data/README +7 -0
  8. data/README.txt +53 -0
  9. data/Rakefile +33 -0
  10. data/config/hoe.rb +75 -0
  11. data/config/requirements.rb +15 -0
  12. data/ext/rule_tagger/bool.h +38 -0
  13. data/ext/rule_tagger/darray.c +292 -0
  14. data/ext/rule_tagger/darray.h +125 -0
  15. data/ext/rule_tagger/darrayP.h +50 -0
  16. data/ext/rule_tagger/extconf.rb +14 -0
  17. data/ext/rule_tagger/lex.c +170 -0
  18. data/ext/rule_tagger/lex.h +49 -0
  19. data/ext/rule_tagger/memory.c +127 -0
  20. data/ext/rule_tagger/memory.h +20 -0
  21. data/ext/rule_tagger/rbtagger.c +252 -0
  22. data/ext/rule_tagger/registry.c +326 -0
  23. data/ext/rule_tagger/registry.h +129 -0
  24. data/ext/rule_tagger/registryP.h +46 -0
  25. data/ext/rule_tagger/ruby-compat.h +20 -0
  26. data/ext/rule_tagger/rules.c +525 -0
  27. data/ext/rule_tagger/rules.h +42 -0
  28. data/ext/rule_tagger/sysdep.h +20 -0
  29. data/ext/rule_tagger/tagger.c +110 -0
  30. data/ext/rule_tagger/tagger.h +46 -0
  31. data/ext/rule_tagger/useful.c +44 -0
  32. data/ext/rule_tagger/useful.h +51 -0
  33. data/ext/word_tagger/extconf.rb +7 -0
  34. data/ext/word_tagger/porter_stemmer.c +430 -0
  35. data/ext/word_tagger/porter_stemmer.h +19 -0
  36. data/ext/word_tagger/rtagger.cc +83 -0
  37. data/ext/word_tagger/tagger.cc +153 -0
  38. data/ext/word_tagger/tagger.h +27 -0
  39. data/ext/word_tagger/tagger.rb +8 -0
  40. data/ext/word_tagger/test/Makefile +22 -0
  41. data/ext/word_tagger/test/doc.txt +87 -0
  42. data/ext/word_tagger/test/test.cc +107 -0
  43. data/ext/word_tagger/test.rb +31 -0
  44. data/lib/brill/tagger.rb +225 -0
  45. data/lib/rbtagger/version.rb +9 -0
  46. data/lib/rbtagger.rb +6 -0
  47. data/script/console +10 -0
  48. data/script/destroy +14 -0
  49. data/script/generate +14 -0
  50. data/script/txt2html +82 -0
  51. data/setup.rb +1585 -0
  52. data/tasks/deployment.rake +34 -0
  53. data/tasks/environment.rake +7 -0
  54. data/tasks/website.rake +17 -0
  55. data/test/CONTEXTUALRULEFILE +284 -0
  56. data/test/LEXICALRULEFILE +148 -0
  57. data/test/LEXICON +93696 -0
  58. data/test/docs/doc0.txt +20 -0
  59. data/test/docs/doc1.txt +11 -0
  60. data/test/docs/doc2.txt +52 -0
  61. data/test/docs/doc3.txt +128 -0
  62. data/test/docs/doc4.txt +337 -0
  63. data/test/docs/doc5.txt +497 -0
  64. data/test/docs/doc6.txt +116 -0
  65. data/test/docs/doc7.txt +101 -0
  66. data/test/docs/doc8.txt +25 -0
  67. data/test/docs/doc9.txt +84 -0
  68. data/test/tagger_test.rb +60 -0
  69. data/test/test_helper.rb +2 -0
  70. data/tools/rakehelp.rb +113 -0
  71. data/website/index.html +113 -0
  72. data/website/index.txt +53 -0
  73. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  74. data/website/stylesheets/screen.css +138 -0
  75. data/website/template.html.erb +48 -0
  76. metadata +155 -0
@@ -0,0 +1,19 @@
1
#ifndef PORTER_STEMMER_H
#define PORTER_STEMMER_H

/*
 * C interface to the Porter stemmer used by the word tagger.
 * The declarations are wrapped in extern "C" so that C++ translation
 * units can link against the C implementation.
 */
#ifdef __cplusplus
extern "C" {
#endif

/* Opaque stemmer state; only pointers to it cross this interface. */
struct stemmer;

/* Returns a stemmer handle; pair every call with porter_stemmer_free(). */
struct stemmer *porter_stemmer_new(void);

/* Releases a handle previously returned by porter_stemmer_new(). */
void porter_stemmer_free(struct stemmer *z);

/*
 * Stems the word stored in b.
 * NOTE(review): the in-tree caller passes k as the index of the last
 * character (length - 1) and uses the return value as the index of the
 * last character of the stem -- confirm against porter_stemmer.c.
 */
int porter_stem(struct stemmer *z, const char *b, int k);

#ifdef __cplusplus
}
#endif

#endif /* PORTER_STEMMER_H */
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Copyright (c) 2008 Todd A. Fisher
3
+ * see LICENSE
4
+ */
5
+ #include "ruby.h"
6
+ #include "tagger.h"
7
+
8
+ #define DEBUG
9
+ #ifdef DEBUG
10
+ #define TRACE() fprintf(stderr, "> %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__)
11
+ #else
12
+ #define TRACE()
13
+ #endif
14
+
15
+ /* ruby 1.9 compat */
16
+ #ifndef RSTRING_PTR
17
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
18
+ #endif
19
+
20
+ #ifndef RSTRING_LEN
21
+ #define RSTRING_LEN(str) RSTRING(str)->len
22
+ #endif
23
+
24
+ static VALUE rb_Tagger;
25
+ static VALUE rb_NWordTagger;
26
+
27
+ VALUE Tagger_execute( VALUE self, VALUE text )
28
+ {
29
+ NWordTagger *tagger;
30
+ Data_Get_Struct( self, NWordTagger, tagger );
31
+ std::vector<std::string> tags = tagger->execute( RSTRING_PTR(text) );
32
+ VALUE results = rb_ary_new2(tags.size());
33
+ for( size_t i = 0; i < tags.size(); ++i ){
34
+ rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
35
+ }
36
+ return results;
37
+ }
38
+ VALUE Tagger_set_words( VALUE self, VALUE words )
39
+ {
40
+ NWordTagger *tagger;
41
+ Data_Get_Struct( self, NWordTagger, tagger );
42
+ tagger->setNWords( NUM2INT(words) );
43
+ return Qnil;
44
+ }
45
+ VALUE Tagger_load_tags( VALUE self, VALUE tagarr )
46
+ {
47
+ NWordTagger *tagger;
48
+ Data_Get_Struct( self, NWordTagger, tagger );
49
+ std::set<std::string> tags;
50
+ int len = RARRAY(tagarr)->len;
51
+ for( int i = 0; i < len; ++i ){
52
+ std::string tag = RSTRING_PTR( rb_ary_entry( tagarr, i ) );
53
+ tags.insert(tag);
54
+ }
55
+ tagger->loadTags(tags);
56
+ return Qnil;
57
+ }
58
+
59
+ static void Tagger_free( NWordTagger *tagger )
60
+ {
61
+ delete tagger;
62
+ }
63
+
64
+ VALUE Tagger_alloc(VALUE klass)
65
+ {
66
+ VALUE object;
67
+ NWordTagger *tagger = new NWordTagger();
68
+ object = Data_Wrap_Struct( klass, NULL, Tagger_free, tagger );
69
+
70
+ return object;
71
+ }
72
+
73
+ extern "C" void Init_rtagger()
74
+ {
75
+ rb_Tagger = rb_define_module( "Tagger" );
76
+ rb_NWordTagger = rb_define_class_under( rb_Tagger, "NWordTagger", rb_cObject );
77
+
78
+ rb_define_alloc_func( rb_NWordTagger, Tagger_alloc );
79
+
80
+ rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
81
+ rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
82
+ rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
83
+ }
@@ -0,0 +1,153 @@
1
+ #include <ctype.h>
2
+ #include "tagger.h"
3
+ #include <set>
4
+ #include <algorithm>
5
+ #include <sstream>
6
+ #include <iterator>
7
+ #include <string>
8
+ #include <vector>
9
+ #include "porter_stemmer.h"
10
+
11
// Ordering for (tag, frequency) pairs: ascending by frequency; the tag text
// itself takes no part in the comparison.
struct WordComparitor
{
  typedef std::pair<std::string,int> TagCount;

  bool operator()(const TagCount &lhs, const TagCount &rhs) const
  {
    return rhs.second > lhs.second;
  }
};
18
+
19
+
20
// from http://www.thescripts.com/forum/thread167600.html
// Split `s` into words. Every character that isalpha() rejects (digits,
// punctuation, whitespace, ...) acts as a separator, so the result holds the
// maximal alphabetic runs of `s` in their original order. An input without
// any alphabetic character yields an empty vector.
static std::vector<std::string> word_split(const std::string& s)
{
  std::string words = s;
  // convert all non alpha characters to spaces
  for( size_t i = 0; i < words.length(); ++i ) {
    // The <ctype.h> classifiers are only defined for EOF and values
    // representable as unsigned char; a plain char holding a byte >= 0x80
    // may be negative, so convert before classifying.
    if( !isalpha( static_cast<unsigned char>( words[i] ) ) ) {
      words[i] = ' ';
    }
  }

  // operator>> on a string skips whitespace, which performs the actual split
  std::istringstream is(words);
  return std::vector<std::string>(std::istream_iterator<std::string>(is), std::istream_iterator<std::string>());
}
35
+
36
+ NWordTagger::NWordTagger()
37
+ : nwords(2), stemmer(porter_stemmer_new()){
38
+ }
39
+ NWordTagger::~NWordTagger(){
40
+ porter_stemmer_free(stemmer);
41
+ }
42
+ void NWordTagger::loadTags( const std::set<std::string> &tags )
43
+ {
44
+ for( std::set<std::string>::iterator i = tags.begin(); i != tags.end(); ++i ){
45
+ std::string stemmed, word = std::string(*i);
46
+ std::vector<std::string> words = word_split( *i );
47
+ //printf( "word: %s\n", word.c_str() );
48
+
49
+ if( words.size() > 1 ){
50
+ for( size_t j = 0; j < words.size(); ++j ){
51
+ stemmed += this->stemWord(words[j]) + " ";
52
+ }
53
+ stemmed = stemmed.substr(0,stemmed.length()-1);
54
+ this->tags[stemmed] = word;
55
+ //printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
56
+ }
57
+ else{
58
+ stemmed = this->stemWord(*i);
59
+ //printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
60
+ this->tags[stemmed] = word;
61
+ }
62
+
63
+ }
64
+ }
65
+ std::string NWordTagger::stemWord( const std::string &word )const
66
+ {
67
+ std::string stemmed;
68
+ char *transition_buffer = strdup( word.c_str() );
69
+ stemmed = word.substr(0,porter_stem(this->stemmer, transition_buffer, word.length()-1 )+1);
70
+ free( transition_buffer );
71
+ return stemmed;
72
+ }
73
+
74
+ std::vector<std::string> NWordTagger::execute( const char *text, short max )const
75
+ {
76
+ int max_count = 0;
77
+ std::vector<std::string> words = word_split( text );
78
+ std::map<std::string, int> matched_tags; // stores tags and frequency
79
+ std::string match_word;
80
+ std::map<std::string,std::string>::const_iterator matched;
81
+
82
+ // loop over the words stemming each word
83
+ for( size_t i = 0; i < words.size(); ++i ) {
84
+
85
+ // get the stemmed word at position i
86
+ match_word = this->stemWord(words[i]);
87
+
88
+ // now scan ahead nwords positions searching our tags table for matches
89
+ for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
90
+ matched = this->tags.find( match_word );
91
+ if( matched != this->tags.end() ){
92
+ //printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
93
+ std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
94
+ if( mloc == matched_tags.end() ) {
95
+ matched_tags[matched->second] = 1; // count 1
96
+ }
97
+ else {
98
+ mloc->second++;
99
+ if( max_count < mloc->second ) { max_count = mloc->second; }
100
+ }
101
+ }
102
+ // stem each word and compare against our tag bank
103
+ //printf( "window: %ld:%lu\n", i,(i+j) );
104
+ match_word += " " + this->stemWord(words[i+j]);
105
+ }
106
+
107
+ matched = this->tags.find( match_word );
108
+ if( matched != this->tags.end() ) {
109
+ //printf( "word: %ld:(%s->%s)\n", i, words[i].c_str(), match_word.c_str() );
110
+ std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
111
+ if( mloc == matched_tags.end() ) {
112
+ matched_tags[matched->second] = 1; // count 1
113
+ }
114
+ else {
115
+ mloc->second++;
116
+ if( max_count < mloc->second ) { max_count = mloc->second; }
117
+ }
118
+ }
119
+ }
120
+
121
+ // now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
122
+ std::vector< std::pair<std::string,int> > sorted_tags;
123
+
124
+ //printf( "max frequency: %d\n", max_count );
125
+ for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
126
+ //printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
127
+ sorted_tags.push_back(*mloc);
128
+ }
129
+
130
+ // sort the tags in frequency order
131
+ std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
132
+
133
+ std::vector< std::string > reduced_tags;
134
+
135
+ std::vector< std::pair<std::string, int> >::iterator mloc;
136
+ do {
137
+ for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
138
+ std::pair< std::string, int > word_freq = *mloc;
139
+ // printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
140
+ //printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
141
+ if( word_freq.second < max_count ) {
142
+ sorted_tags.erase( mloc );
143
+ break;
144
+ }
145
+ }
146
+ } while( sorted_tags.size() > (size_t)max && mloc != sorted_tags.end() );
147
+
148
+ for( size_t i = 0; i < sorted_tags.size(); ++i ) {
149
+ reduced_tags.push_back( sorted_tags[i].first );
150
+ }
151
+
152
+ return reduced_tags;
153
+ }
@@ -0,0 +1,27 @@
1
+ #ifndef NWORD_TAGGER_H
2
+ #define NWORD_TAGGER_H
3
+ #include <set>
4
+ #include <map>
5
+ #include <string>
6
+ #include <vector>
7
+
8
struct stemmer; // Porter stemmer state, see porter_stemmer.h

// Finds known tags (single words or short phrases) in free text.
// Tags are registered with loadTags(); execute() reports the ones that occur
// in a text, matching on stemmed words.
//
// Each instance owns a stemmer handle that its destructor releases, so
// instances cannot be copied: a copy would release the same handle twice.
struct NWordTagger {
  NWordTagger();
  ~NWordTagger();

  // Register the given tags for matching.
  void loadTags( const std::set<std::string> &tags );

  // Number of following words execute() appends to each starting word when
  // it builds candidate phrases.
  short getNWords()const{ return nwords; }
  void setNWords( short words ){ nwords = words; }

  // Return the tags found in the NUL-terminated `text`; `max` is the size the
  // result is pruned towards (see the definition for the exact rule).
  std::vector<std::string> execute( const char *text, short max = 10 )const;
private:
  // not copyable: declared private and intentionally left undefined
  NWordTagger( const NWordTagger & );
  NWordTagger &operator=( const NWordTagger & );

  short nwords;
  struct stemmer *stemmer;                // owned, released in the destructor
  std::map<std::string,std::string> tags; // stemmed phrase -> tag as supplied
  std::vector<std::string> words;

  std::string stemWord( const std::string &word )const;
};
26
+
27
+ #endif
@@ -0,0 +1,8 @@
1
require 'rtagger'

module Tagger
  # NWordTagger that replaces every non-word character of the text with a
  # space before handing it to the native tagger.
  class SimpleTagger < Tagger::NWordTagger
    def execute( text )
      cleaned = text.gsub( /\W/, ' ' )
      super( cleaned )
    end
  end
end
@@ -0,0 +1,22 @@
1
# Builds and runs the standalone C++ test driver for the word tagger.
# Run `make setup` once to link the sources from the parent directory.
CFLAGS=-g -Wall #-fprofile-arcs -ftest-coverage

# None of these targets names a file that the rule creates.
.PHONY: all setup clean

all: porter_stemmer.o tagger.o test.o
	g++ $(CFLAGS) -o testit $^
	time ./testit doc.txt
# gcov -b tagger.cc
# valgrind --leak-check=full ./testit
test.o: test.cc tagger.h
	g++ $(CFLAGS) test.cc -c
tagger.o: tagger.cc tagger.h porter_stemmer.h
	g++ $(CFLAGS) tagger.cc -c
porter_stemmer.o: porter_stemmer.c porter_stemmer.h
	gcc $(CFLAGS) porter_stemmer.c -c

# -f replaces links left by an earlier run, so setup can be repeated
setup:
	ln -sf ../tagger.h
	ln -sf ../tagger.cc
	ln -sf ../porter_stemmer.h
	ln -sf ../porter_stemmer.c

# removes every object file that `all` produces, plus the test binary
clean:
	rm -f porter_stemmer.o tagger.o test.o testit
@@ -0,0 +1,87 @@
1
+ Allergies are diseases of the immune system that cause an overreaction to
2
+ substances called "allergens." Allergies are grouped by the kind of trigger,
3
+ time of year or where symptoms appear on the body: indoor andoutdoor allergies
4
+ (also called "hay fever," "seasonal," "perennial" or "nasal" allergies), food
5
+ and drug allergies, latex allergies, insect allergies, skin allergies and eye
6
+ allergies. People who have allergies can live healthy and active lives.
7
+
8
+ movieMultimedia Allergy Library
9
+ GlossaryGlossary of Allergy Terms
10
+
11
+ What are Allergies | What Causes Allergies | Diagnosis | Treatment |
12
+ Prevention |
13
+
14
+ What Causes Allergies
15
+
16
+ The substances that cause allergic disease in people are known as allergens.
17
+ "Antigens," or protein particles like pollen, food or dander enter our bodies
18
+ through a variety of ways. If the antigen causes an allergic reaction, that
19
+ particle is considered an "allergen" – and antigen that triggers an allergic
20
+ reaction. These allergens can get into our body in several ways:
21
+
22
+ * Inhaled into the nose and the lungs. Examples are airborne pollens of
23
+ * certain trees, grasses and weeds; house dust that include dust mite
24
+ * particles, mold spores, cat and dog dander and latex dust.
25
+ * Ingested by mouth. Frequent culprits include shrimp, peanuts and
26
+ * other nuts.
27
+ * Injected. Such as medications delivered by needle like
28
+ * penicillin or other injectable drugs, and venom from insect
29
+ * stings and bites.
30
+ * Absorbed through the skin. Plants such as poison ivy, sumac
31
+ * and oak and latex are examples.
32
+
33
+ What Makes Some Pollen Cause Allergies, and Not Others?
34
+
35
+ Plant pollens that are carried by the wind cause most
36
+ allergies of the nose, eyes and lungs. These plants (including
37
+ certain weeds, trees and grasses) are natural pollutants
38
+ produced at various times of the year when their small,
39
+ inconspicuous flowers discharge literally billions of pollen
40
+ particles.
41
+
42
+ Because the particles can be carried significant distances, it
43
+ is important for you not only to understand local
44
+ environmental conditions, but also conditions over the broader
45
+ area of the state or region in which you live. Unlike the
46
+ wind-pollinated plants, conspicuous wild flowers or flowers
47
+ used in most residential gardens are pollinated by bees,
48
+ wasps, and other insects and therefore are not widely capable
49
+ of producing allergic disease.
50
+
51
+ What is the Role of Heredity in Allergy?
52
+
53
+ Like baldness, height and eye color, the capacity to become
54
+ allergic is an inherited characteristic. Yet, although you may
55
+ be born with the genetic capability to become allergic, you
56
+ are not automatically allergic to specific allergens. Several
57
+ factors must be present for allergic sensitivity to be
58
+ developed:
59
+
60
+ * The specific genes acquired from parents.
61
+ * The exposure to one or more allergens to which you
62
+ * have a genetically programmed response.
63
+ * The degree and length of exposure.
64
+
65
+ A baby born with the tendency to become allergic
66
+ to cow's milk, for example, may show allergic
67
+ symptoms several months after birth. A genetic
68
+ capability to become allergic to cat dander may
69
+ take three to four years of cat exposure before
70
+ the person shows symptoms. These people may also
71
+ become allergic to other environmental substances
72
+ with age.
73
+
74
+ On the other hand, poison ivy allergy (contact
75
+ dermatitis) is an example of an allergy in which
76
+ hereditary background does not play a part. The
77
+ person with poison ivy allergy first has to be
78
+ exposed to the oil from the plant. This usually
79
+ occurs during youth, when a rash does not always
80
+ appear. However, the first exposure may sensitize
81
+ or cause the person to become allergic and, when
82
+ subsequent exposure takes place, a contact
83
+ dermatitis rash appears and can be quite severe.
84
+ Many plants are capable of producing this type of
85
+ rash. Substances other than plants, such as dyes,
86
+ metals, and chemicals in deodorants and cosmetics,
87
+ can also cause a similar dermatitis.
@@ -0,0 +1,107 @@
1
+ #include <stdio.h>
2
+ #include <ctype.h>
3
+ #include <sys/stat.h>
4
+ #include <assert.h>
5
+ #include "tagger.h"
6
+
7
+ struct TestBase {
8
+ TestBase();
9
+ void overflow();
10
+ void small();
11
+ void large_file();
12
+
13
+ std::set<std::string> tags;
14
+ NWordTagger tagger;
15
+ };
16
+
17
+
18
+ TestBase::TestBase()
19
+ {
20
+ tags.insert("fitness");
21
+ tags.insert("delightful");
22
+ tags.insert("dreaming");
23
+ tags.insert("dreaming of their world");
24
+ tags.insert("names");
25
+ tags.insert("places");
26
+ tags.insert("diabetes");
27
+ tags.insert("sugars");
28
+ tags.insert("allergy");
29
+ tags.insert("dermatitis");
30
+
31
+ tagger.setNWords( 4 );
32
+
33
+ tagger.loadTags( tags );
34
+ }
35
+
36
+ void TestBase::overflow()
37
+ {
38
+ // input passed to the filter should be processed to downcase, and remove all punctionation
39
+ std::string words( "hello fitness fitness fitness party dreaming dreaming of their world how are you all doing today so many times I ve seen or heard a delightful story or tales" );
40
+ std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
41
+
42
+ printf( "%ld tags\n", (long int)matched_tags.size() );
43
+ for( size_t i = 0; i < matched_tags.size(); ++i ){
44
+ printf( "tagged: %s\n", matched_tags[i].c_str() );
45
+ }
46
+
47
+ assert( matched_tags.size() == 2 );
48
+ assert( matched_tags[0] == "dreaming" );
49
+ assert( matched_tags[1] == "fitness" );
50
+ }
51
+
52
+ void TestBase::small()
53
+ {
54
+ // input passed to the filter should be processed to downcase, and remove all punctionation
55
+ std::string words( "nothing to see here" );
56
+ std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
57
+
58
+ assert( matched_tags.size() == 0 );
59
+ }
60
+
61
+ void TestBase::large_file()
62
+ {
63
+ // input passed to the filter should be processed to downcase, and remove all punctionation
64
+ std::string words;
65
+ FILE *in = fopen("doc.txt","r");
66
+ struct stat s;
67
+ char *buffer = NULL;
68
+ fstat( fileno(in), &s );
69
+ buffer = (char*)malloc(sizeof(char)*(s.st_size+1));
70
+ memset(buffer,'\0',s.st_size+1);
71
+ fread( buffer, sizeof(char), s.st_size, in );
72
+ words = buffer;
73
+ free(buffer);
74
+ fclose(in);
75
+
76
+ std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 10 );
77
+
78
+ printf( "%ld tags\n", (long int)matched_tags.size() );
79
+ for( size_t i = 0; i < matched_tags.size(); ++i ){
80
+ printf( "tagged: %s\n", matched_tags[i].c_str() );
81
+ }
82
+
83
+ assert( matched_tags.size() == 2 );
84
+ assert( matched_tags[0] == "allergy" );
85
+ assert( matched_tags[1] == "dermatitis" );
86
+ }
87
+
88
+ static void test_run()
89
+ {
90
+ TestBase test;
91
+
92
+ for( int i = 0; i < 10; ++i ) {
93
+ test.overflow();
94
+ test.small();
95
+ test.large_file();
96
+ }
97
+
98
+ }
99
+
100
+ int main()
101
+ {
102
+ // running multiple iterations to test for memory leaks
103
+ for( int i = 0; i < 2; ++i ) {
104
+ test_run();
105
+ }
106
+ return 0;
107
+ }
@@ -0,0 +1,31 @@
1
# Runs only when this file is executed directly.
if $0 == __FILE__
  require 'test/unit'
  require 'tagger'

  class NWordTest < Test::Unit::TestCase

    # The tagger is created and loaded once, then shared by all tests through
    # the $tagger global.
    def setup
      return if defined?($tagger)

      $tagger = Tagger::SimpleTagger.new
      tag_lines = File.read('../../tags.txt').split("\n")
      $tagger.load_tags( tag_lines )
      $tagger.set_words( 4 )
    end

    def test_basic
      started = Time.now
      text = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
      tags = $tagger.execute( text )
      assert_equal ['cancer','work'], tags
      puts "Duration: #{Time.now - started} sec"
    end

    def test_ngram_size3
      started = Time.now
      text = "This body of text contains something like ventricular septal defect"
      tags = $tagger.execute( text )
      assert_equal ['ventricular septal defect'], tags
      puts "Duration: #{Time.now - started} sec"
    end
  end
end