rbtagger 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/COPYING +21 -0
  2. data/History.txt +4 -0
  3. data/LICENSE +21 -0
  4. data/License.txt +20 -0
  5. data/Manifest.txt +75 -0
  6. data/PostInstall.txt +7 -0
  7. data/README +7 -0
  8. data/README.txt +53 -0
  9. data/Rakefile +33 -0
  10. data/config/hoe.rb +74 -0
  11. data/config/requirements.rb +15 -0
  12. data/ext/rule_tagger/bool.h +38 -0
  13. data/ext/rule_tagger/darray.c +292 -0
  14. data/ext/rule_tagger/darray.h +125 -0
  15. data/ext/rule_tagger/darrayP.h +50 -0
  16. data/ext/rule_tagger/extconf.rb +14 -0
  17. data/ext/rule_tagger/lex.c +170 -0
  18. data/ext/rule_tagger/lex.h +49 -0
  19. data/ext/rule_tagger/memory.c +127 -0
  20. data/ext/rule_tagger/memory.h +20 -0
  21. data/ext/rule_tagger/rbtagger.c +252 -0
  22. data/ext/rule_tagger/registry.c +326 -0
  23. data/ext/rule_tagger/registry.h +129 -0
  24. data/ext/rule_tagger/registryP.h +46 -0
  25. data/ext/rule_tagger/ruby-compat.h +20 -0
  26. data/ext/rule_tagger/rules.c +525 -0
  27. data/ext/rule_tagger/rules.h +42 -0
  28. data/ext/rule_tagger/sysdep.h +20 -0
  29. data/ext/rule_tagger/tagger.c +110 -0
  30. data/ext/rule_tagger/tagger.h +46 -0
  31. data/ext/rule_tagger/useful.c +44 -0
  32. data/ext/rule_tagger/useful.h +51 -0
  33. data/ext/word_tagger/extconf.rb +7 -0
  34. data/ext/word_tagger/porter_stemmer.c +430 -0
  35. data/ext/word_tagger/porter_stemmer.h +19 -0
  36. data/ext/word_tagger/rtagger.cc +83 -0
  37. data/ext/word_tagger/tagger.cc +153 -0
  38. data/ext/word_tagger/tagger.h +27 -0
  39. data/ext/word_tagger/tagger.rb +8 -0
  40. data/ext/word_tagger/test/Makefile +22 -0
  41. data/ext/word_tagger/test/doc.txt +87 -0
  42. data/ext/word_tagger/test/test.cc +107 -0
  43. data/ext/word_tagger/test.rb +31 -0
  44. data/lib/brill/tagger.rb +225 -0
  45. data/lib/rbtagger/version.rb +9 -0
  46. data/lib/rbtagger.rb +6 -0
  47. data/script/console +10 -0
  48. data/script/destroy +14 -0
  49. data/script/generate +14 -0
  50. data/script/txt2html +82 -0
  51. data/setup.rb +1585 -0
  52. data/tasks/deployment.rake +34 -0
  53. data/tasks/environment.rake +7 -0
  54. data/tasks/website.rake +17 -0
  55. data/test/CONTEXTUALRULEFILE +284 -0
  56. data/test/LEXICALRULEFILE +148 -0
  57. data/test/LEXICON +93696 -0
  58. data/test/docs/doc0.txt +20 -0
  59. data/test/docs/doc1.txt +11 -0
  60. data/test/docs/doc2.txt +52 -0
  61. data/test/docs/doc3.txt +128 -0
  62. data/test/docs/doc4.txt +337 -0
  63. data/test/docs/doc5.txt +497 -0
  64. data/test/docs/doc6.txt +116 -0
  65. data/test/docs/doc7.txt +101 -0
  66. data/test/docs/doc8.txt +25 -0
  67. data/test/docs/doc9.txt +84 -0
  68. data/test/tagger_test.rb +60 -0
  69. data/test/test_helper.rb +2 -0
  70. data/tools/rakehelp.rb +113 -0
  71. data/website/index.html +113 -0
  72. data/website/index.txt +53 -0
  73. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  74. data/website/stylesheets/screen.css +138 -0
  75. data/website/template.html.erb +48 -0
  76. metadata +155 -0
@@ -0,0 +1,19 @@
1
#ifndef PORTER_STEMMER_H
#define PORTER_STEMMER_H

/*
 * C interface to the Porter stemmer used by the word tagger.
 * The declarations are wrapped in extern "C" so that they can be called
 * from the C++ sources of this extension.
 */
#if defined(__cplusplus)
extern "C" {
#endif

/* Opaque stemmer state; its layout is private to the implementation. */
struct stemmer;

/* Returns a new stemmer; hand it back to porter_stemmer_free() when done. */
struct stemmer *porter_stemmer_new(void);

/* Releases a stemmer obtained from porter_stemmer_new(). */
void porter_stemmer_free(struct stemmer *z);

/*
 * Stems the word stored in b.
 * NOTE(review): the caller in tagger.cc passes k = length - 1 and uses
 * (return value + 1) as the length of the stem -- confirm against
 * porter_stemmer.c.
 */
int porter_stem(struct stemmer *z, const char *b, int k);

#if defined(__cplusplus)
}
#endif

#endif /* PORTER_STEMMER_H */
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Copyright (c) 2008 Todd A. Fisher
3
+ * see LICENSE
4
+ */
5
+ #include "ruby.h"
6
+ #include "tagger.h"
7
+
8
+ #define DEBUG
9
+ #ifdef DEBUG
10
+ #define TRACE() fprintf(stderr, "> %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__)
11
+ #else
12
+ #define TRACE()
13
+ #endif
14
+
15
+ /* ruby 1.9 compat */
16
+ #ifndef RSTRING_PTR
17
+ #define RSTRING_PTR(str) RSTRING(str)->ptr
18
+ #endif
19
+
20
+ #ifndef RSTRING_LEN
21
+ #define RSTRING_LEN(str) RSTRING(str)->len
22
+ #endif
23
+
24
+ static VALUE rb_Tagger;
25
+ static VALUE rb_NWordTagger;
26
+
27
+ VALUE Tagger_execute( VALUE self, VALUE text )
28
+ {
29
+ NWordTagger *tagger;
30
+ Data_Get_Struct( self, NWordTagger, tagger );
31
+ std::vector<std::string> tags = tagger->execute( RSTRING_PTR(text) );
32
+ VALUE results = rb_ary_new2(tags.size());
33
+ for( size_t i = 0; i < tags.size(); ++i ){
34
+ rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
35
+ }
36
+ return results;
37
+ }
38
+ VALUE Tagger_set_words( VALUE self, VALUE words )
39
+ {
40
+ NWordTagger *tagger;
41
+ Data_Get_Struct( self, NWordTagger, tagger );
42
+ tagger->setNWords( NUM2INT(words) );
43
+ return Qnil;
44
+ }
45
+ VALUE Tagger_load_tags( VALUE self, VALUE tagarr )
46
+ {
47
+ NWordTagger *tagger;
48
+ Data_Get_Struct( self, NWordTagger, tagger );
49
+ std::set<std::string> tags;
50
+ int len = RARRAY(tagarr)->len;
51
+ for( int i = 0; i < len; ++i ){
52
+ std::string tag = RSTRING_PTR( rb_ary_entry( tagarr, i ) );
53
+ tags.insert(tag);
54
+ }
55
+ tagger->loadTags(tags);
56
+ return Qnil;
57
+ }
58
+
59
+ static void Tagger_free( NWordTagger *tagger )
60
+ {
61
+ delete tagger;
62
+ }
63
+
64
+ VALUE Tagger_alloc(VALUE klass)
65
+ {
66
+ VALUE object;
67
+ NWordTagger *tagger = new NWordTagger();
68
+ object = Data_Wrap_Struct( klass, NULL, Tagger_free, tagger );
69
+
70
+ return object;
71
+ }
72
+
73
+ extern "C" void Init_rtagger()
74
+ {
75
+ rb_Tagger = rb_define_module( "Tagger" );
76
+ rb_NWordTagger = rb_define_class_under( rb_Tagger, "NWordTagger", rb_cObject );
77
+
78
+ rb_define_alloc_func( rb_NWordTagger, Tagger_alloc );
79
+
80
+ rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
81
+ rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
82
+ rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
83
+ }
@@ -0,0 +1,153 @@
1
+ #include <ctype.h>
2
+ #include "tagger.h"
3
+ #include <set>
4
+ #include <algorithm>
5
+ #include <sstream>
6
+ #include <iterator>
7
+ #include <string>
8
+ #include <vector>
9
+ #include "porter_stemmer.h"
10
+
11
// Strict-weak ordering for (tag, frequency) pairs: orders by ascending
// frequency and ignores the tag text.
struct WordComparitor
{
  typedef std::pair<std::string,int> entry_type;

  bool operator()(const entry_type &lhs, const entry_type &rhs) const
  {
    return rhs.second > lhs.second;
  }
};
18
+
19
+
20
// from http://www.thescripts.com/forum/thread167600.html
//
// Splits s into words: every character that is not alphabetic (per isalpha)
// is turned into a space, then the result is split on whitespace.
// Returns the words in order; an input without letters yields an empty vector.
static std::vector<std::string> word_split(const std::string& s)
{
  std::string words = s;
  for( size_t i = 0; i < words.length(); ++i ) {
    // <ctype.h> functions require a value representable as unsigned char;
    // passing a negative plain char (any byte >= 0x80) is undefined.
    if( !isalpha( static_cast<unsigned char>( words[i] ) ) ) {
      words[i] = ' ';
    }
  }

  std::istringstream is(words);
  return std::vector<std::string>(std::istream_iterator<std::string>(is), std::istream_iterator<std::string>());
}
35
+
36
+ NWordTagger::NWordTagger()
37
+ : nwords(2), stemmer(porter_stemmer_new()){
38
+ }
39
+ NWordTagger::~NWordTagger(){
40
+ porter_stemmer_free(stemmer);
41
+ }
42
+ void NWordTagger::loadTags( const std::set<std::string> &tags )
43
+ {
44
+ for( std::set<std::string>::iterator i = tags.begin(); i != tags.end(); ++i ){
45
+ std::string stemmed, word = std::string(*i);
46
+ std::vector<std::string> words = word_split( *i );
47
+ //printf( "word: %s\n", word.c_str() );
48
+
49
+ if( words.size() > 1 ){
50
+ for( size_t j = 0; j < words.size(); ++j ){
51
+ stemmed += this->stemWord(words[j]) + " ";
52
+ }
53
+ stemmed = stemmed.substr(0,stemmed.length()-1);
54
+ this->tags[stemmed] = word;
55
+ //printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
56
+ }
57
+ else{
58
+ stemmed = this->stemWord(*i);
59
+ //printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
60
+ this->tags[stemmed] = word;
61
+ }
62
+
63
+ }
64
+ }
65
+ std::string NWordTagger::stemWord( const std::string &word )const
66
+ {
67
+ std::string stemmed;
68
+ char *transition_buffer = strdup( word.c_str() );
69
+ stemmed = word.substr(0,porter_stem(this->stemmer, transition_buffer, word.length()-1 )+1);
70
+ free( transition_buffer );
71
+ return stemmed;
72
+ }
73
+
74
+ std::vector<std::string> NWordTagger::execute( const char *text, short max )const
75
+ {
76
+ int max_count = 0;
77
+ std::vector<std::string> words = word_split( text );
78
+ std::map<std::string, int> matched_tags; // stores tags and frequency
79
+ std::string match_word;
80
+ std::map<std::string,std::string>::const_iterator matched;
81
+
82
+ // loop over the words stemming each word
83
+ for( size_t i = 0; i < words.size(); ++i ) {
84
+
85
+ // get the stemmed word at position i
86
+ match_word = this->stemWord(words[i]);
87
+
88
+ // now scan ahead nwords positions searching our tags table for matches
89
+ for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
90
+ matched = this->tags.find( match_word );
91
+ if( matched != this->tags.end() ){
92
+ //printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
93
+ std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
94
+ if( mloc == matched_tags.end() ) {
95
+ matched_tags[matched->second] = 1; // count 1
96
+ }
97
+ else {
98
+ mloc->second++;
99
+ if( max_count < mloc->second ) { max_count = mloc->second; }
100
+ }
101
+ }
102
+ // stem each word and compare against our tag bank
103
+ //printf( "window: %ld:%lu\n", i,(i+j) );
104
+ match_word += " " + this->stemWord(words[i+j]);
105
+ }
106
+
107
+ matched = this->tags.find( match_word );
108
+ if( matched != this->tags.end() ) {
109
+ //printf( "word: %ld:(%s->%s)\n", i, words[i].c_str(), match_word.c_str() );
110
+ std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
111
+ if( mloc == matched_tags.end() ) {
112
+ matched_tags[matched->second] = 1; // count 1
113
+ }
114
+ else {
115
+ mloc->second++;
116
+ if( max_count < mloc->second ) { max_count = mloc->second; }
117
+ }
118
+ }
119
+ }
120
+
121
+ // now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
122
+ std::vector< std::pair<std::string,int> > sorted_tags;
123
+
124
+ //printf( "max frequency: %d\n", max_count );
125
+ for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
126
+ //printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
127
+ sorted_tags.push_back(*mloc);
128
+ }
129
+
130
+ // sort the tags in frequency order
131
+ std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
132
+
133
+ std::vector< std::string > reduced_tags;
134
+
135
+ std::vector< std::pair<std::string, int> >::iterator mloc;
136
+ do {
137
+ for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
138
+ std::pair< std::string, int > word_freq = *mloc;
139
+ // printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
140
+ //printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
141
+ if( word_freq.second < max_count ) {
142
+ sorted_tags.erase( mloc );
143
+ break;
144
+ }
145
+ }
146
+ } while( sorted_tags.size() > (size_t)max && mloc != sorted_tags.end() );
147
+
148
+ for( size_t i = 0; i < sorted_tags.size(); ++i ) {
149
+ reduced_tags.push_back( sorted_tags[i].first );
150
+ }
151
+
152
+ return reduced_tags;
153
+ }
@@ -0,0 +1,27 @@
1
+ #ifndef NWORD_TAGGER_H
2
+ #define NWORD_TAGGER_H
3
+ #include <set>
4
+ #include <map>
5
+ #include <string>
6
+ #include <vector>
7
+
8
// Tags text by matching stemmed word sequences against a table of known tags.
struct NWordTagger {
  NWordTagger();
  ~NWordTagger();

  // Registers the given tags; each is stored under its stemmed form.
  void loadTags( const std::set<std::string> &tags );

  // Number of following words scanned ahead of each word by execute().
  short getNWords()const{ return nwords; }
  void setNWords( short words ){ nwords = words; }

  // Returns the tags found in text; max bounds how many tags are kept when
  // less frequent ones can be dropped.
  std::vector<std::string> execute( const char *text, short max = 10 )const;
private:
  // Not copyable: the object owns `stemmer` and frees it in the destructor,
  // so a member-wise copy would free the same stemmer twice. Declared but
  // intentionally left undefined.
  NWordTagger( const NWordTagger & );
  NWordTagger &operator=( const NWordTagger & );

  short nwords;
  struct stemmer *stemmer;                // created in the constructor, freed in the destructor
  std::map<std::string,std::string> tags; // stemmed tag -> original tag
  std::vector<std::string> words;         // NOTE(review): not referenced by tagger.cc -- confirm before removing

  std::string stemWord( const std::string &word )const;
};
26
+
27
+ #endif
@@ -0,0 +1,8 @@
1
module Tagger
  require 'rtagger'

  # NWordTagger that replaces every non-word character of the input with a
  # space before handing it to the native tagger.
  class SimpleTagger < Tagger::NWordTagger
    def execute( text )
      cleaned = text.gsub( /\W/, ' ' )
      super( cleaned )
    end
  end
end
@@ -0,0 +1,22 @@
1
# Builds the standalone tagger test binary and runs it against doc.txt.
CFLAGS=-g -Wall #-fprofile-arcs -ftest-coverage

# These targets do not produce files of the same name.
.PHONY: all setup clean

all: porter_stemmer.o tagger.o test.o
	g++ $(CFLAGS) -o testit $^
	time ./testit doc.txt
	# gcov -b tagger.cc
	# valgrind --leak-check=full ./testit
test.o: test.cc tagger.h
	g++ $(CFLAGS) test.cc -c
tagger.o: tagger.cc tagger.h porter_stemmer.h
	g++ $(CFLAGS) tagger.cc -c
porter_stemmer.o: porter_stemmer.c porter_stemmer.h
	gcc $(CFLAGS) porter_stemmer.c -c

# Link the sources from the parent directory; -f keeps the target re-runnable.
setup:
	ln -sf ../tagger.h
	ln -sf ../tagger.cc
	ln -sf ../porter_stemmer.h
	ln -sf ../porter_stemmer.c

# Remove every object file built by `all`, including porter_stemmer.o.
clean:
	rm -f porter_stemmer.o tagger.o test.o testit
@@ -0,0 +1,87 @@
1
+ Allergies are diseases of the immune system that cause an overreaction to
2
+ substances called "allergens." Allergies are grouped by the kind of trigger,
3
+ time of year or where symptoms appear on the body: indoor and outdoor allergies
4
+ (also called "hay fever," "seasonal," "perennial" or "nasal" allergies), food
5
+ and drug allergies, latex allergies, insect allergies, skin allergies and eye
6
+ allergies. People who have allergies can live healthy and active lives.
7
+
8
+ movieMultimedia Allergy Library
9
+ GlossaryGlossary of Allergy Terms
10
+
11
+ What are Allergies | What Causes Allergies | Diagnosis | Treatment |
12
+ Prevention |
13
+
14
+ What Causes Allergies
15
+
16
+ The substances that cause allergic disease in people are known as allergens.
17
+ "Antigens," or protein particles like pollen, food or dander enter our bodies
18
+ through a variety of ways. If the antigen causes an allergic reaction, that
19
+ particle is considered an "allergen" – an antigen that triggers an allergic
20
+ reaction. These allergens can get into our body in several ways:
21
+
22
+ * Inhaled into the nose and the lungs. Examples are airborne pollens of
23
+ * certain trees, grasses and weeds; house dust that include dust mite
24
+ * particles, mold spores, cat and dog dander and latex dust.
25
+ * Ingested by mouth. Frequent culprits include shrimp, peanuts and
26
+ * other nuts.
27
+ * Injected. Such as medications delivered by needle like
28
+ * penicillin or other injectable drugs, and venom from insect
29
+ * stings and bites.
30
+ * Absorbed through the skin. Plants such as poison ivy, sumac
31
+ * and oak and latex are examples.
32
+
33
+ What Makes Some Pollen Cause Allergies, and Not Others?
34
+
35
+ Plant pollens that are carried by the wind cause most
36
+ allergies of the nose, eyes and lungs. These plants (including
37
+ certain weeds, trees and grasses) are natural pollutants
38
+ produced at various times of the year when their small,
39
+ inconspicuous flowers discharge literally billions of pollen
40
+ particles.
41
+
42
+ Because the particles can be carried significant distances, it
43
+ is important for you not only to understand local
44
+ environmental conditions, but also conditions over the broader
45
+ area of the state or region in which you live. Unlike the
46
+ wind-pollinated plants, conspicuous wild flowers or flowers
47
+ used in most residential gardens are pollinated by bees,
48
+ wasps, and other insects and therefore are not widely capable
49
+ of producing allergic disease.
50
+
51
+ What is the Role of Heredity in Allergy?
52
+
53
+ Like baldness, height and eye color, the capacity to become
54
+ allergic is an inherited characteristic. Yet, although you may
55
+ be born with the genetic capability to become allergic, you
56
+ are not automatically allergic to specific allergens. Several
57
+ factors must be present for allergic sensitivity to be
58
+ developed:
59
+
60
+ * The specific genes acquired from parents.
61
+ * The exposure to one or more allergens to which you
62
+ * have a genetically programmed response.
63
+ * The degree and length of exposure.
64
+
65
+ A baby born with the tendency to become allergic
66
+ to cow's milk, for example, may show allergic
67
+ symptoms several months after birth. A genetic
68
+ capability to become allergic to cat dander may
69
+ take three to four years of cat exposure before
70
+ the person shows symptoms. These people may also
71
+ become allergic to other environmental substances
72
+ with age.
73
+
74
+ On the other hand, poison ivy allergy (contact
75
+ dermatitis) is an example of an allergy in which
76
+ hereditary background does not play a part. The
77
+ person with poison ivy allergy first has to be
78
+ exposed to the oil from the plant. This usually
79
+ occurs during youth, when a rash does not always
80
+ appear. However, the first exposure may sensitize
81
+ or cause the person to become allergic and, when
82
+ subsequent exposure takes place, a contact
83
+ dermatitis rash appears and can be quite severe.
84
+ Many plants are capable of producing this type of
85
+ rash. Substances other than plants, such as dyes,
86
+ metals, and chemicals in deodorants and cosmetics,
87
+ can also cause a similar dermatitis.
@@ -0,0 +1,107 @@
1
+ #include <stdio.h>
2
+ #include <ctype.h>
3
+ #include <sys/stat.h>
4
+ #include <assert.h>
5
+ #include "tagger.h"
6
+
7
+ struct TestBase {
8
+ TestBase();
9
+ void overflow();
10
+ void small();
11
+ void large_file();
12
+
13
+ std::set<std::string> tags;
14
+ NWordTagger tagger;
15
+ };
16
+
17
+
18
+ TestBase::TestBase()
19
+ {
20
+ tags.insert("fitness");
21
+ tags.insert("delightful");
22
+ tags.insert("dreaming");
23
+ tags.insert("dreaming of their world");
24
+ tags.insert("names");
25
+ tags.insert("places");
26
+ tags.insert("diabetes");
27
+ tags.insert("sugars");
28
+ tags.insert("allergy");
29
+ tags.insert("dermatitis");
30
+
31
+ tagger.setNWords( 4 );
32
+
33
+ tagger.loadTags( tags );
34
+ }
35
+
36
+ void TestBase::overflow()
37
+ {
38
+ // input passed to the filter should be processed to downcase, and remove all punctionation
39
+ std::string words( "hello fitness fitness fitness party dreaming dreaming of their world how are you all doing today so many times I ve seen or heard a delightful story or tales" );
40
+ std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
41
+
42
+ printf( "%ld tags\n", (long int)matched_tags.size() );
43
+ for( size_t i = 0; i < matched_tags.size(); ++i ){
44
+ printf( "tagged: %s\n", matched_tags[i].c_str() );
45
+ }
46
+
47
+ assert( matched_tags.size() == 2 );
48
+ assert( matched_tags[0] == "dreaming" );
49
+ assert( matched_tags[1] == "fitness" );
50
+ }
51
+
52
+ void TestBase::small()
53
+ {
54
+ // input passed to the filter should be processed to downcase, and remove all punctionation
55
+ std::string words( "nothing to see here" );
56
+ std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
57
+
58
+ assert( matched_tags.size() == 0 );
59
+ }
60
+
61
+ void TestBase::large_file()
62
+ {
63
+ // input passed to the filter should be processed to downcase, and remove all punctionation
64
+ std::string words;
65
+ FILE *in = fopen("doc.txt","r");
66
+ struct stat s;
67
+ char *buffer = NULL;
68
+ fstat( fileno(in), &s );
69
+ buffer = (char*)malloc(sizeof(char)*(s.st_size+1));
70
+ memset(buffer,'\0',s.st_size+1);
71
+ fread( buffer, sizeof(char), s.st_size, in );
72
+ words = buffer;
73
+ free(buffer);
74
+ fclose(in);
75
+
76
+ std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 10 );
77
+
78
+ printf( "%ld tags\n", (long int)matched_tags.size() );
79
+ for( size_t i = 0; i < matched_tags.size(); ++i ){
80
+ printf( "tagged: %s\n", matched_tags[i].c_str() );
81
+ }
82
+
83
+ assert( matched_tags.size() == 2 );
84
+ assert( matched_tags[0] == "allergy" );
85
+ assert( matched_tags[1] == "dermatitis" );
86
+ }
87
+
88
+ static void test_run()
89
+ {
90
+ TestBase test;
91
+
92
+ for( int i = 0; i < 10; ++i ) {
93
+ test.overflow();
94
+ test.small();
95
+ test.large_file();
96
+ }
97
+
98
+ }
99
+
100
+ int main()
101
+ {
102
+ // running multiple iterations to test for memory leaks
103
+ for( int i = 0; i < 2; ++i ) {
104
+ test_run();
105
+ }
106
+ return 0;
107
+ }
@@ -0,0 +1,31 @@
1
if $0 == __FILE__
  require 'test/unit'
  require 'tagger'

  # Exercises Tagger::SimpleTagger with the tag list in ../../tags.txt.
  class NWordTest < Test::Unit::TestCase

    # Builds the shared tagger once; later test cases reuse it.
    def setup
      return if defined?($tagger)

      $tagger = Tagger::SimpleTagger.new
      $tagger.load_tags( File.read('../../tags.txt').split("\n") )
      $tagger.set_words( 4 )
    end

    # Single-word tags are found in text that contains punctuation.
    def test_basic
      started = Time.now
      text = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
      tags = $tagger.execute( text )
      assert_equal ['cancer','work'], tags
      puts "Duration: #{Time.now - started} sec"
    end

    # A three-word tag is matched as one phrase.
    def test_ngram_size3
      started = Time.now
      text = "This body of text contains something like ventricular septal defect"
      tags = $tagger.execute( text )
      assert_equal ['ventricular septal defect'], tags
      puts "Duration: #{Time.now - started} sec"
    end
  end
end