rbtagger 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +21 -0
- data/History.txt +4 -0
- data/LICENSE +21 -0
- data/License.txt +20 -0
- data/Manifest.txt +75 -0
- data/PostInstall.txt +7 -0
- data/README +7 -0
- data/README.txt +53 -0
- data/Rakefile +33 -0
- data/config/hoe.rb +74 -0
- data/config/requirements.rb +15 -0
- data/ext/rule_tagger/bool.h +38 -0
- data/ext/rule_tagger/darray.c +292 -0
- data/ext/rule_tagger/darray.h +125 -0
- data/ext/rule_tagger/darrayP.h +50 -0
- data/ext/rule_tagger/extconf.rb +14 -0
- data/ext/rule_tagger/lex.c +170 -0
- data/ext/rule_tagger/lex.h +49 -0
- data/ext/rule_tagger/memory.c +127 -0
- data/ext/rule_tagger/memory.h +20 -0
- data/ext/rule_tagger/rbtagger.c +252 -0
- data/ext/rule_tagger/registry.c +326 -0
- data/ext/rule_tagger/registry.h +129 -0
- data/ext/rule_tagger/registryP.h +46 -0
- data/ext/rule_tagger/ruby-compat.h +20 -0
- data/ext/rule_tagger/rules.c +525 -0
- data/ext/rule_tagger/rules.h +42 -0
- data/ext/rule_tagger/sysdep.h +20 -0
- data/ext/rule_tagger/tagger.c +110 -0
- data/ext/rule_tagger/tagger.h +46 -0
- data/ext/rule_tagger/useful.c +44 -0
- data/ext/rule_tagger/useful.h +51 -0
- data/ext/word_tagger/extconf.rb +7 -0
- data/ext/word_tagger/porter_stemmer.c +430 -0
- data/ext/word_tagger/porter_stemmer.h +19 -0
- data/ext/word_tagger/rtagger.cc +83 -0
- data/ext/word_tagger/tagger.cc +153 -0
- data/ext/word_tagger/tagger.h +27 -0
- data/ext/word_tagger/tagger.rb +8 -0
- data/ext/word_tagger/test/Makefile +22 -0
- data/ext/word_tagger/test/doc.txt +87 -0
- data/ext/word_tagger/test/test.cc +107 -0
- data/ext/word_tagger/test.rb +31 -0
- data/lib/brill/tagger.rb +225 -0
- data/lib/rbtagger/version.rb +9 -0
- data/lib/rbtagger.rb +6 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/website.rake +17 -0
- data/test/CONTEXTUALRULEFILE +284 -0
- data/test/LEXICALRULEFILE +148 -0
- data/test/LEXICON +93696 -0
- data/test/docs/doc0.txt +20 -0
- data/test/docs/doc1.txt +11 -0
- data/test/docs/doc2.txt +52 -0
- data/test/docs/doc3.txt +128 -0
- data/test/docs/doc4.txt +337 -0
- data/test/docs/doc5.txt +497 -0
- data/test/docs/doc6.txt +116 -0
- data/test/docs/doc7.txt +101 -0
- data/test/docs/doc8.txt +25 -0
- data/test/docs/doc9.txt +84 -0
- data/test/tagger_test.rb +60 -0
- data/test/test_helper.rb +2 -0
- data/tools/rakehelp.rb +113 -0
- data/website/index.html +113 -0
- data/website/index.txt +53 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +155 -0
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef PORTER_STEMMER_H
|
2
|
+
#define PORTER_STEMMER_H
|
3
|
+
#ifdef __cplusplus
|
4
|
+
extern "C" {
|
5
|
+
#endif
|
6
|
+
|
7
|
+
struct stemmer;
|
8
|
+
|
9
|
+
/* Returns a pointer to an opaque stemmer handle.
 * NOTE(review): presumably heap-allocated and to be released with
 * porter_stemmer_free(); the implementation is not visible here -- confirm. */
extern struct stemmer * porter_stemmer_new(void);
|
10
|
+
/* Takes a stemmer handle `z`.
 * NOTE(review): presumably releases a handle obtained from
 * porter_stemmer_new(); confirm against the implementation. */
extern void porter_stemmer_free(struct stemmer * z);
|
11
|
+
|
12
|
+
/* Stems the characters b[0..k] using handle `z` and returns an int.
 * NOTE(review): callers in tagger.cc pass k = length - 1 and treat
 * (return value + 1) as the length of the stem; whether `b` is really left
 * unmodified, as the const qualifier suggests, should be confirmed against
 * the implementation. */
extern int porter_stem(struct stemmer * z, const char * b, int k);
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
#ifdef __cplusplus
|
17
|
+
}
|
18
|
+
#endif
|
19
|
+
#endif
|
@@ -0,0 +1,83 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2008 Todd A. Fisher
|
3
|
+
* see LICENSE
|
4
|
+
*/
|
5
|
+
#include "ruby.h"
|
6
|
+
#include "tagger.h"
|
7
|
+
|
8
|
+
#define DEBUG
|
9
|
+
#ifdef DEBUG
|
10
|
+
#define TRACE() fprintf(stderr, "> %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__)
|
11
|
+
#else
|
12
|
+
#define TRACE()
|
13
|
+
#endif
|
14
|
+
|
15
|
+
/* ruby 1.9 compat */
|
16
|
+
#ifndef RSTRING_PTR
|
17
|
+
#define RSTRING_PTR(str) RSTRING(str)->ptr
|
18
|
+
#endif
|
19
|
+
|
20
|
+
#ifndef RSTRING_LEN
|
21
|
+
#define RSTRING_LEN(str) RSTRING(str)->len
|
22
|
+
#endif
|
23
|
+
|
24
|
+
static VALUE rb_Tagger;
|
25
|
+
static VALUE rb_NWordTagger;
|
26
|
+
|
27
|
+
/**
 * NWordTagger#execute(text) -> Array of String
 *
 * Runs the wrapped NWordTagger over +text+ and returns the matched tags as a
 * freshly allocated Ruby array of strings.
 *
 * +text+ is coerced with StringValuePtr, so passing something that is not a
 * String (and cannot be converted to one) raises TypeError instead of having
 * its memory reinterpreted as an RString.
 */
VALUE Tagger_execute( VALUE self, VALUE text )
{
  NWordTagger *tagger;
  Data_Get_Struct( self, NWordTagger, tagger );

  // type-checked access to the string bytes
  const char *ctext = StringValuePtr( text );
  std::vector<std::string> tags = tagger->execute( ctext );

  VALUE results = rb_ary_new2( tags.size() );
  for( size_t i = 0; i < tags.size(); ++i ){
    rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
  }
  return results;
}
|
38
|
+
/**
 * NWordTagger#set_words(words) -> nil
 *
 * Converts +words+ with NUM2INT and hands it to NWordTagger::setNWords,
 * which stores it as the look-ahead window used by execute.
 */
VALUE Tagger_set_words( VALUE self, VALUE words )
{
  NWordTagger *instance = NULL;
  Data_Get_Struct( self, NWordTagger, instance );

  const int requested = NUM2INT( words );
  instance->setNWords( requested );

  return Qnil;
}
|
45
|
+
/* ruby 1.8 compat: older headers have no RARRAY_LEN accessor */
#ifndef RARRAY_LEN
#define RARRAY_LEN(arr) RARRAY(arr)->len
#endif

/**
 * NWordTagger#load_tags(tagarr) -> nil
 *
 * Copies every element of the Ruby array +tagarr+ into a std::set and passes
 * the set to NWordTagger::loadTags.
 *
 * Raises TypeError when +tagarr+ is not an Array or when an element cannot be
 * converted to a String; the unchecked RARRAY()/RSTRING_PTR() accesses used
 * before would read arbitrary memory in those cases.
 */
VALUE Tagger_load_tags( VALUE self, VALUE tagarr )
{
  NWordTagger *tagger;
  Data_Get_Struct( self, NWordTagger, tagger );

  Check_Type( tagarr, T_ARRAY );

  std::set<std::string> tags;
  // The length is re-read on every pass because the string conversion below
  // may call back into Ruby code, which could resize the array.
  for( long i = 0; i < RARRAY_LEN(tagarr); ++i ){
    VALUE entry = rb_ary_entry( tagarr, i );
    tags.insert( std::string( StringValuePtr( entry ) ) );
  }

  tagger->loadTags(tags);
  return Qnil;
}
|
58
|
+
|
59
|
+
/* Free callback registered with Data_Wrap_Struct in Tagger_alloc: destroys
 * the native tagger that was created there with new. */
static void Tagger_free( NWordTagger *tagger ) { delete tagger; }
|
63
|
+
|
64
|
+
/* Allocation function for the Ruby class: creates a native NWordTagger and
 * wraps it in a Ruby object of +klass+, with no mark function and Tagger_free
 * as the free function. */
VALUE Tagger_alloc(VALUE klass)
{
  NWordTagger *native = new NWordTagger();
  return Data_Wrap_Struct( klass, NULL, Tagger_free, native );
}
|
72
|
+
|
73
|
+
extern "C" void Init_rtagger()
|
74
|
+
{
|
75
|
+
rb_Tagger = rb_define_module( "Tagger" );
|
76
|
+
rb_NWordTagger = rb_define_class_under( rb_Tagger, "NWordTagger", rb_cObject );
|
77
|
+
|
78
|
+
rb_define_alloc_func( rb_NWordTagger, Tagger_alloc );
|
79
|
+
|
80
|
+
rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
|
81
|
+
rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
|
82
|
+
rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
|
83
|
+
}
|
@@ -0,0 +1,153 @@
|
|
1
|
+
#include <ctype.h>
|
2
|
+
#include "tagger.h"
|
3
|
+
#include <set>
|
4
|
+
#include <algorithm>
|
5
|
+
#include <sstream>
|
6
|
+
#include <iterator>
|
7
|
+
#include <string>
|
8
|
+
#include <vector>
|
9
|
+
#include "porter_stemmer.h"
|
10
|
+
|
11
|
+
// Strict weak ordering over (tag, frequency) pairs: orders by ascending
// frequency and ignores the tag text.
struct WordComparitor
{
  bool operator()(const std::pair<std::string,int> &lhs, const std::pair<std::string,int> &rhs) const
  {
    return rhs.second > lhs.second;
  }
};
|
18
|
+
|
19
|
+
|
20
|
+
// from http://www.thescripts.com/forum/thread167600.html
|
21
|
+
// split words by ' '
|
22
|
+
// Splits `s` into words: every character for which isalpha() is false is
// treated as a separator, and the remaining runs of letters are returned in
// order. Returns an empty vector when `s` contains no letters.
static std::vector<std::string> word_split(const std::string& s)
{
  std::string words = s;
  // convert all non alpha characters to spaces
  for( size_t i = 0; i < words.length(); ++i ) {
    // <ctype.h> functions require a value representable as unsigned char;
    // passing a negative plain char (any byte >= 0x80) is undefined behaviour
    if( !isalpha( static_cast<unsigned char>( words[i] ) ) ) {
      words[i] = ' ';
    }
  }

  // whitespace-delimited extraction collapses runs of separators
  std::istringstream is(words);
  return std::vector<std::string>(std::istream_iterator<std::string>(is), std::istream_iterator<std::string>());
}
|
35
|
+
|
36
|
+
// Starts with a look-ahead window of 2 words and a stemmer handle obtained
// from porter_stemmer_new().
NWordTagger::NWordTagger()
  : nwords( 2 ),
    stemmer( porter_stemmer_new() )
{
}
|
39
|
+
// Hands the stemmer handle taken in the constructor back to
// porter_stemmer_free().
NWordTagger::~NWordTagger()
{
  porter_stemmer_free( this->stemmer );
}
|
42
|
+
void NWordTagger::loadTags( const std::set<std::string> &tags )
|
43
|
+
{
|
44
|
+
for( std::set<std::string>::iterator i = tags.begin(); i != tags.end(); ++i ){
|
45
|
+
std::string stemmed, word = std::string(*i);
|
46
|
+
std::vector<std::string> words = word_split( *i );
|
47
|
+
//printf( "word: %s\n", word.c_str() );
|
48
|
+
|
49
|
+
if( words.size() > 1 ){
|
50
|
+
for( size_t j = 0; j < words.size(); ++j ){
|
51
|
+
stemmed += this->stemWord(words[j]) + " ";
|
52
|
+
}
|
53
|
+
stemmed = stemmed.substr(0,stemmed.length()-1);
|
54
|
+
this->tags[stemmed] = word;
|
55
|
+
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
56
|
+
}
|
57
|
+
else{
|
58
|
+
stemmed = this->stemWord(*i);
|
59
|
+
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
60
|
+
this->tags[stemmed] = word;
|
61
|
+
}
|
62
|
+
|
63
|
+
}
|
64
|
+
}
|
65
|
+
std::string NWordTagger::stemWord( const std::string &word )const
|
66
|
+
{
|
67
|
+
std::string stemmed;
|
68
|
+
char *transition_buffer = strdup( word.c_str() );
|
69
|
+
stemmed = word.substr(0,porter_stem(this->stemmer, transition_buffer, word.length()-1 )+1);
|
70
|
+
free( transition_buffer );
|
71
|
+
return stemmed;
|
72
|
+
}
|
73
|
+
|
74
|
+
std::vector<std::string> NWordTagger::execute( const char *text, short max )const
|
75
|
+
{
|
76
|
+
int max_count = 0;
|
77
|
+
std::vector<std::string> words = word_split( text );
|
78
|
+
std::map<std::string, int> matched_tags; // stores tags and frequency
|
79
|
+
std::string match_word;
|
80
|
+
std::map<std::string,std::string>::const_iterator matched;
|
81
|
+
|
82
|
+
// loop over the words stemming each word
|
83
|
+
for( size_t i = 0; i < words.size(); ++i ) {
|
84
|
+
|
85
|
+
// get the stemmed word at position i
|
86
|
+
match_word = this->stemWord(words[i]);
|
87
|
+
|
88
|
+
// now scan ahead nwords positions searching our tags table for matches
|
89
|
+
for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
|
90
|
+
matched = this->tags.find( match_word );
|
91
|
+
if( matched != this->tags.end() ){
|
92
|
+
//printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
|
93
|
+
std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
|
94
|
+
if( mloc == matched_tags.end() ) {
|
95
|
+
matched_tags[matched->second] = 1; // count 1
|
96
|
+
}
|
97
|
+
else {
|
98
|
+
mloc->second++;
|
99
|
+
if( max_count < mloc->second ) { max_count = mloc->second; }
|
100
|
+
}
|
101
|
+
}
|
102
|
+
// stem each word and compare against our tag bank
|
103
|
+
//printf( "window: %ld:%lu\n", i,(i+j) );
|
104
|
+
match_word += " " + this->stemWord(words[i+j]);
|
105
|
+
}
|
106
|
+
|
107
|
+
matched = this->tags.find( match_word );
|
108
|
+
if( matched != this->tags.end() ) {
|
109
|
+
//printf( "word: %ld:(%s->%s)\n", i, words[i].c_str(), match_word.c_str() );
|
110
|
+
std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
|
111
|
+
if( mloc == matched_tags.end() ) {
|
112
|
+
matched_tags[matched->second] = 1; // count 1
|
113
|
+
}
|
114
|
+
else {
|
115
|
+
mloc->second++;
|
116
|
+
if( max_count < mloc->second ) { max_count = mloc->second; }
|
117
|
+
}
|
118
|
+
}
|
119
|
+
}
|
120
|
+
|
121
|
+
// now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
|
122
|
+
std::vector< std::pair<std::string,int> > sorted_tags;
|
123
|
+
|
124
|
+
//printf( "max frequency: %d\n", max_count );
|
125
|
+
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
126
|
+
//printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
127
|
+
sorted_tags.push_back(*mloc);
|
128
|
+
}
|
129
|
+
|
130
|
+
// sort the tags in frequency order
|
131
|
+
std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
|
132
|
+
|
133
|
+
std::vector< std::string > reduced_tags;
|
134
|
+
|
135
|
+
std::vector< std::pair<std::string, int> >::iterator mloc;
|
136
|
+
do {
|
137
|
+
for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
|
138
|
+
std::pair< std::string, int > word_freq = *mloc;
|
139
|
+
// printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
140
|
+
//printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
141
|
+
if( word_freq.second < max_count ) {
|
142
|
+
sorted_tags.erase( mloc );
|
143
|
+
break;
|
144
|
+
}
|
145
|
+
}
|
146
|
+
} while( sorted_tags.size() > (size_t)max && mloc != sorted_tags.end() );
|
147
|
+
|
148
|
+
for( size_t i = 0; i < sorted_tags.size(); ++i ) {
|
149
|
+
reduced_tags.push_back( sorted_tags[i].first );
|
150
|
+
}
|
151
|
+
|
152
|
+
return reduced_tags;
|
153
|
+
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#ifndef NWORD_TAGGER_H
|
2
|
+
#define NWORD_TAGGER_H
|
3
|
+
#include <set>
|
4
|
+
#include <map>
|
5
|
+
#include <string>
|
6
|
+
#include <vector>
|
7
|
+
|
8
|
+
// Tags text by matching stemmed words and short phrases against a table of
// stemmed tags.
//
// Owns a stemmer handle that is released in the destructor, so instances
// cannot be copied: the copy operations are declared private and left
// undefined to prevent a copy from releasing the same handle twice.
struct NWordTagger {
  NWordTagger();
  ~NWordTagger();

  // Replaces or extends the tag table with the stemmed forms of `tags`.
  void loadTags( const std::set<std::string> &tags );

  // Number of following words appended to each word when looking for phrases.
  short getNWords()const{ return nwords; }
  void setNWords( short words ){ nwords = words; }

  // Returns the original text of the tags found in `text`; `max` is the
  // target size the result is reduced towards.
  std::vector<std::string> execute( const char *text, short max = 10 )const;
private:
  // not copyable: `stemmer` is a uniquely owned raw handle
  NWordTagger( const NWordTagger & );
  NWordTagger &operator=( const NWordTagger & );

  short nwords;
  struct stemmer *stemmer;
  std::map<std::string,std::string> tags; // stemmed tag -> original tag text
  std::vector<std::string> words;         // NOTE(review): not referenced by the visible tagger.cc code

  std::string stemWord( const std::string &word )const;
};
|
26
|
+
|
27
|
+
#endif
|
@@ -0,0 +1,22 @@
|
|
1
|
+
CFLAGS=-g -Wall #-fprofile-arcs -ftest-coverage

# These targets do not produce files of the same name.
.PHONY: all setup clean

# Build the test binary from the three objects and run it once, timed.
all: porter_stemmer.o tagger.o test.o
	g++ $(CFLAGS) -o testit $^
	time ./testit doc.txt
# gcov -b tagger.cc
# valgrind --leak-check=full ./testit
test.o: test.cc tagger.h
	g++ $(CFLAGS) test.cc -c
tagger.o: tagger.cc tagger.h porter_stemmer.h
	g++ $(CFLAGS) tagger.cc -c
porter_stemmer.o: porter_stemmer.c porter_stemmer.h
	gcc $(CFLAGS) porter_stemmer.c -c

# Link the sources from the parent directory into this one; -f lets the
# target be re-run when the links already exist.
setup:
	ln -sf ../tagger.h
	ln -sf ../tagger.cc
	ln -sf ../porter_stemmer.h
	ln -sf ../porter_stemmer.c

# Remove every build product, including porter_stemmer.o.
clean:
	rm -f porter_stemmer.o tagger.o test.o testit
|
@@ -0,0 +1,87 @@
|
|
1
|
+
Allergies are diseases of the immune system that cause an overreaction to
|
2
|
+
substances called "allergens." Allergies are grouped by the kind of trigger,
|
3
|
+
time of year or where symptoms appear on the body: indoor andoutdoor allergies
|
4
|
+
(also called "hay fever," "seasonal," "perennial" or "nasal" allergies), food
|
5
|
+
and drug allergies, latex allergies, insect allergies, skin allergies and eye
|
6
|
+
allergies. People who have allergies can live healthy and active lives.
|
7
|
+
|
8
|
+
movieMultimedia Allergy Library
|
9
|
+
GlossaryGlossary of Allergy Terms
|
10
|
+
|
11
|
+
What are Allergies | What Causes Allergies | Diagnosis | Treatment |
|
12
|
+
Prevention |
|
13
|
+
|
14
|
+
What Causes Allergies
|
15
|
+
|
16
|
+
The substances that cause allergic disease in people are known as allergens.
|
17
|
+
"Antigens," or protein particles like pollen, food or dander enter our bodies
|
18
|
+
through a variety of ways. If the antigen causes an allergic reaction, that
|
19
|
+
particle is considered an "allergen" – and antigen that triggers an allergic
|
20
|
+
reaction. These allergens can get into our body in several ways:
|
21
|
+
|
22
|
+
* Inhaled into the nose and the lungs. Examples are airborne pollens of
|
23
|
+
* certain trees, grasses and weeds; house dust that include dust mite
|
24
|
+
* particles, mold spores, cat and dog dander and latex dust.
|
25
|
+
* Ingested by mouth. Frequent culprits include shrimp, peanuts and
|
26
|
+
* other nuts.
|
27
|
+
* Injected. Such as medications delivered by needle like
|
28
|
+
* penicillin or other injectable drugs, and venom from insect
|
29
|
+
* stings and bites.
|
30
|
+
* Absorbed through the skin. Plants such as poison ivy, sumac
|
31
|
+
* and oak and latex are examples.
|
32
|
+
|
33
|
+
What Makes Some Pollen Cause Allergies, and Not Others?
|
34
|
+
|
35
|
+
Plant pollens that are carried by the wind cause most
|
36
|
+
allergies of the nose, eyes and lungs. These plants (including
|
37
|
+
certain weeds, trees and grasses) are natural pollutants
|
38
|
+
produced at various times of the year when their small,
|
39
|
+
inconspicuous flowers discharge literally billions of pollen
|
40
|
+
particles.
|
41
|
+
|
42
|
+
Because the particles can be carried significant distances, it
|
43
|
+
is important for you not only to understand local
|
44
|
+
environmental conditions, but also conditions over the broader
|
45
|
+
area of the state or region in which you live. Unlike the
|
46
|
+
wind-pollinated plants, conspicuous wild flowers or flowers
|
47
|
+
used in most residential gardens are pollinated by bees,
|
48
|
+
wasps, and other insects and therefore are not widely capable
|
49
|
+
of producing allergic disease.
|
50
|
+
|
51
|
+
What is the Role of Heredity in Allergy?
|
52
|
+
|
53
|
+
Like baldness, height and eye color, the capacity to become
|
54
|
+
allergic is an inherited characteristic. Yet, although you may
|
55
|
+
be born with the genetic capability to become allergic, you
|
56
|
+
are not automatically allergic to specific allergens. Several
|
57
|
+
factors must be present for allergic sensitivity to be
|
58
|
+
developed:
|
59
|
+
|
60
|
+
* The specific genes acquired from parents.
|
61
|
+
* The exposure to one or more allergens to which you
|
62
|
+
* have a genetically programmed response.
|
63
|
+
* The degree and length of exposure.
|
64
|
+
|
65
|
+
A baby born with the tendency to become allergic
|
66
|
+
to cow's milk, for example, may show allergic
|
67
|
+
symptoms several months after birth. A genetic
|
68
|
+
capability to become allergic to cat dander may
|
69
|
+
take three to four years of cat exposure before
|
70
|
+
the person shows symptoms. These people may also
|
71
|
+
become allergic to other environmental substances
|
72
|
+
with age.
|
73
|
+
|
74
|
+
On the other hand, poison ivy allergy (contact
|
75
|
+
dermatitis) is an example of an allergy in which
|
76
|
+
hereditary background does not play a part. The
|
77
|
+
person with poison ivy allergy first has to be
|
78
|
+
exposed to the oil from the plant. This usually
|
79
|
+
occurs during youth, when a rash does not always
|
80
|
+
appear. However, the first exposure may sensitize
|
81
|
+
or cause the person to become allergic and, when
|
82
|
+
subsequent exposure takes place, a contact
|
83
|
+
dermatitis rash appears and can be quite severe.
|
84
|
+
Many plants are capable of producing this type of
|
85
|
+
rash. Substances other than plants, such as dyes,
|
86
|
+
metals, and chemicals in deodorants and cosmetics,
|
87
|
+
can also cause a similar dermatitis.
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ctype.h>
|
3
|
+
#include <sys/stat.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#include "tagger.h"
|
6
|
+
|
7
|
+
struct TestBase {
|
8
|
+
TestBase();
|
9
|
+
void overflow();
|
10
|
+
void small();
|
11
|
+
void large_file();
|
12
|
+
|
13
|
+
std::set<std::string> tags;
|
14
|
+
NWordTagger tagger;
|
15
|
+
};
|
16
|
+
|
17
|
+
|
18
|
+
// Fills the tag set, sets the tagger's window to 4 words and loads the tags.
TestBase::TestBase()
{
  static const char *const fixture_tags[] = {
    "fitness",
    "delightful",
    "dreaming",
    "dreaming of their world",
    "names",
    "places",
    "diabetes",
    "sugars",
    "allergy",
    "dermatitis"
  };
  const size_t fixture_count = sizeof(fixture_tags) / sizeof(fixture_tags[0]);

  for( size_t n = 0; n < fixture_count; ++n ) {
    tags.insert( fixture_tags[n] );
  }

  tagger.setNWords( 4 );
  tagger.loadTags( tags );
}
|
35
|
+
|
36
|
+
void TestBase::overflow()
|
37
|
+
{
|
38
|
+
// input passed to the filter should be processed to downcase, and remove all punctionation
|
39
|
+
std::string words( "hello fitness fitness fitness party dreaming dreaming of their world how are you all doing today so many times I ve seen or heard a delightful story or tales" );
|
40
|
+
std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
|
41
|
+
|
42
|
+
printf( "%ld tags\n", (long int)matched_tags.size() );
|
43
|
+
for( size_t i = 0; i < matched_tags.size(); ++i ){
|
44
|
+
printf( "tagged: %s\n", matched_tags[i].c_str() );
|
45
|
+
}
|
46
|
+
|
47
|
+
assert( matched_tags.size() == 2 );
|
48
|
+
assert( matched_tags[0] == "dreaming" );
|
49
|
+
assert( matched_tags[1] == "fitness" );
|
50
|
+
}
|
51
|
+
|
52
|
+
void TestBase::small()
|
53
|
+
{
|
54
|
+
// input passed to the filter should be processed to downcase, and remove all punctionation
|
55
|
+
std::string words( "nothing to see here" );
|
56
|
+
std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
|
57
|
+
|
58
|
+
assert( matched_tags.size() == 0 );
|
59
|
+
}
|
60
|
+
|
61
|
+
void TestBase::large_file()
|
62
|
+
{
|
63
|
+
// input passed to the filter should be processed to downcase, and remove all punctionation
|
64
|
+
std::string words;
|
65
|
+
FILE *in = fopen("doc.txt","r");
|
66
|
+
struct stat s;
|
67
|
+
char *buffer = NULL;
|
68
|
+
fstat( fileno(in), &s );
|
69
|
+
buffer = (char*)malloc(sizeof(char)*(s.st_size+1));
|
70
|
+
memset(buffer,'\0',s.st_size+1);
|
71
|
+
fread( buffer, sizeof(char), s.st_size, in );
|
72
|
+
words = buffer;
|
73
|
+
free(buffer);
|
74
|
+
fclose(in);
|
75
|
+
|
76
|
+
std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 10 );
|
77
|
+
|
78
|
+
printf( "%ld tags\n", (long int)matched_tags.size() );
|
79
|
+
for( size_t i = 0; i < matched_tags.size(); ++i ){
|
80
|
+
printf( "tagged: %s\n", matched_tags[i].c_str() );
|
81
|
+
}
|
82
|
+
|
83
|
+
assert( matched_tags.size() == 2 );
|
84
|
+
assert( matched_tags[0] == "allergy" );
|
85
|
+
assert( matched_tags[1] == "dermatitis" );
|
86
|
+
}
|
87
|
+
|
88
|
+
static void test_run()
|
89
|
+
{
|
90
|
+
TestBase test;
|
91
|
+
|
92
|
+
for( int i = 0; i < 10; ++i ) {
|
93
|
+
test.overflow();
|
94
|
+
test.small();
|
95
|
+
test.large_file();
|
96
|
+
}
|
97
|
+
|
98
|
+
}
|
99
|
+
|
100
|
+
int main()
|
101
|
+
{
|
102
|
+
// running multiple iterations to test for memory leaks
|
103
|
+
for( int i = 0; i < 2; ++i ) {
|
104
|
+
test_run();
|
105
|
+
}
|
106
|
+
return 0;
|
107
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
if $0 == __FILE__
  require 'test/unit'
  require 'tagger'

  # Exercises the n-word tagger through its Ruby interface. Only defined when
  # this file is run directly.
  class NWordTest < Test::Unit::TestCase

    # Builds the shared tagger on first use: loads one tag per line from
    # ../../tags.txt and sets the window to 4 words.
    # NOTE(review): rtagger.cc registers the class as Tagger::NWordTagger;
    # confirm that Tagger::SimpleTagger is defined somewhere.
    def setup
      return if defined?($tagger)

      $tagger = Tagger::SimpleTagger.new
      $tagger.load_tags(File.read('../../tags.txt').split("\n"))
      $tagger.set_words(4)
    end

    # Single-word tags are found in text containing punctuation and digits.
    def test_basic
      started = Time.now
      sample = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
      found = $tagger.execute(sample)
      assert_equal ['cancer', 'work'], found
      puts "Duration: #{Time.now - started} sec"
    end

    # A three-word tag is matched as one phrase.
    def test_ngram_size3
      started = Time.now
      sample = "This body of text contains something like ventricular septal defect"
      found = $tagger.execute(sample)
      assert_equal ['ventricular septal defect'], found
      puts "Duration: #{Time.now - started} sec"
    end
  end
end
|