rbtagger 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +21 -0
- data/History.txt +4 -0
- data/LICENSE +21 -0
- data/License.txt +20 -0
- data/Manifest.txt +75 -0
- data/PostInstall.txt +7 -0
- data/README +7 -0
- data/README.txt +53 -0
- data/Rakefile +33 -0
- data/config/hoe.rb +74 -0
- data/config/requirements.rb +15 -0
- data/ext/rule_tagger/bool.h +38 -0
- data/ext/rule_tagger/darray.c +292 -0
- data/ext/rule_tagger/darray.h +125 -0
- data/ext/rule_tagger/darrayP.h +50 -0
- data/ext/rule_tagger/extconf.rb +14 -0
- data/ext/rule_tagger/lex.c +170 -0
- data/ext/rule_tagger/lex.h +49 -0
- data/ext/rule_tagger/memory.c +127 -0
- data/ext/rule_tagger/memory.h +20 -0
- data/ext/rule_tagger/rbtagger.c +252 -0
- data/ext/rule_tagger/registry.c +326 -0
- data/ext/rule_tagger/registry.h +129 -0
- data/ext/rule_tagger/registryP.h +46 -0
- data/ext/rule_tagger/ruby-compat.h +20 -0
- data/ext/rule_tagger/rules.c +525 -0
- data/ext/rule_tagger/rules.h +42 -0
- data/ext/rule_tagger/sysdep.h +20 -0
- data/ext/rule_tagger/tagger.c +110 -0
- data/ext/rule_tagger/tagger.h +46 -0
- data/ext/rule_tagger/useful.c +44 -0
- data/ext/rule_tagger/useful.h +51 -0
- data/ext/word_tagger/extconf.rb +7 -0
- data/ext/word_tagger/porter_stemmer.c +430 -0
- data/ext/word_tagger/porter_stemmer.h +19 -0
- data/ext/word_tagger/rtagger.cc +83 -0
- data/ext/word_tagger/tagger.cc +153 -0
- data/ext/word_tagger/tagger.h +27 -0
- data/ext/word_tagger/tagger.rb +8 -0
- data/ext/word_tagger/test/Makefile +22 -0
- data/ext/word_tagger/test/doc.txt +87 -0
- data/ext/word_tagger/test/test.cc +107 -0
- data/ext/word_tagger/test.rb +31 -0
- data/lib/brill/tagger.rb +225 -0
- data/lib/rbtagger/version.rb +9 -0
- data/lib/rbtagger.rb +6 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/website.rake +17 -0
- data/test/CONTEXTUALRULEFILE +284 -0
- data/test/LEXICALRULEFILE +148 -0
- data/test/LEXICON +93696 -0
- data/test/docs/doc0.txt +20 -0
- data/test/docs/doc1.txt +11 -0
- data/test/docs/doc2.txt +52 -0
- data/test/docs/doc3.txt +128 -0
- data/test/docs/doc4.txt +337 -0
- data/test/docs/doc5.txt +497 -0
- data/test/docs/doc6.txt +116 -0
- data/test/docs/doc7.txt +101 -0
- data/test/docs/doc8.txt +25 -0
- data/test/docs/doc9.txt +84 -0
- data/test/tagger_test.rb +60 -0
- data/test/test_helper.rb +2 -0
- data/tools/rakehelp.rb +113 -0
- data/website/index.html +113 -0
- data/website/index.txt +53 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +155 -0
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef PORTER_STEMMER_H
#define PORTER_STEMMER_H

/*
 * Public interface of the Porter stemmer.  The declarations are wrapped in
 * extern "C" so that the same header can be included from C and C++ sources.
 */
#ifdef __cplusplus
extern "C" {
#endif

/* Stemmer state; only handled through pointers by users of this header. */
struct stemmer;

/* Obtain a stemmer instance / hand one back when it is no longer needed. */
struct stemmer *porter_stemmer_new(void);
void porter_stemmer_free(struct stemmer *z);

/*
 * Stem the word held in b.
 * NOTE(review): the caller in tagger.cc passes k = (word length - 1) and uses
 * (return value + 1) as the length of the stem, so k and the result are
 * presumably indexes of the last character -- confirm in porter_stemmer.c.
 */
int porter_stem(struct stemmer *z, const char *b, int k);

#ifdef __cplusplus
}
#endif

#endif /* PORTER_STEMMER_H */
|
@@ -0,0 +1,83 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2008 Todd A. Fisher
|
3
|
+
* see LICENSE
|
4
|
+
*/
|
5
|
+
#include "ruby.h"
|
6
|
+
#include "tagger.h"
|
7
|
+
|
8
|
+
/*
 * Debug tracing.  TRACE() writes the current file, line and function name to
 * stderr when the build defines DEBUG (for example with -DDEBUG); in every
 * other build it expands to a harmless no-op expression.  DEBUG is no longer
 * forced on from inside this file.
 */
#ifdef DEBUG
#define TRACE() fprintf(stderr, "> %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__)
#else
#define TRACE() ((void)0)
#endif

/* ruby 1.9 compat: supply the string accessors when ruby.h does not define them */
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif

#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
|
23
|
+
|
24
|
+
static VALUE rb_Tagger;
|
25
|
+
static VALUE rb_NWordTagger;
|
26
|
+
|
27
|
+
/**
 * Ruby method `execute(text)`.
 *
 * Runs the wrapped NWordTagger over `text` and returns the matched tags as a
 * new Ruby Array of Strings.
 *
 * @param self  object wrapping an NWordTagger (created by Tagger_alloc)
 * @param text  Ruby String (or object convertible with to_str)
 * @return      Ruby Array holding one String per tag returned by the tagger
 *
 * Raises TypeError when `text` is not string-like and ArgumentError when it
 * contains a NUL byte, instead of reading through an unchecked pointer.
 */
VALUE Tagger_execute( VALUE self, VALUE text )
{
  NWordTagger *tagger;
  Data_Get_Struct( self, NWordTagger, tagger );

  // StringValueCStr type-checks the argument and guarantees NUL termination.
  const char *input = StringValueCStr( text );
  const std::vector<std::string> tags = tagger->execute( input );

  VALUE results = rb_ary_new2( tags.size() );
  for( size_t i = 0; i < tags.size(); ++i ){
    rb_ary_push( results, rb_str_new( tags[i].c_str(), tags[i].length() ) );
  }
  return results;
}
|
38
|
+
/**
 * Ruby method `set_words(words)`.
 *
 * Forwards `words` to NWordTagger::setNWords, which takes a `short`.
 *
 * @param self   object wrapping an NWordTagger
 * @param words  Ruby Integer
 * @return       nil
 *
 * Raises RangeError when the value cannot be represented as a `short`
 * (previously it was truncated silently); NUM2INT itself raises for values
 * that are not integers or do not fit in an int.
 */
VALUE Tagger_set_words( VALUE self, VALUE words )
{
  NWordTagger *tagger;
  Data_Get_Struct( self, NWordTagger, tagger );

  const int requested = NUM2INT( words );
  const short narrowed = (short)requested;
  if( (int)narrowed != requested ){
    // the round trip changed the value, so it does not fit in a short
    rb_raise( rb_eRangeError, "word count %d is out of range", requested );
  }

  tagger->setNWords( narrowed );
  return Qnil;
}
|
45
|
+
/* Array length accessor for ruby.h versions that do not provide it. */
#ifndef RARRAY_LEN
#define RARRAY_LEN(ary) (RARRAY(ary)->len)
#endif

/**
 * Ruby method `load_tags(tagarr)`.
 *
 * Copies every element of `tagarr` into a std::set and hands the set to
 * NWordTagger::loadTags.
 *
 * @param self    object wrapping an NWordTagger
 * @param tagarr  Ruby Array of Strings
 * @return        nil
 *
 * Raises TypeError when `tagarr` is not an Array or an element is not
 * string-like, and ArgumentError when an element contains a NUL byte.
 */
VALUE Tagger_load_tags( VALUE self, VALUE tagarr )
{
  NWordTagger *tagger;
  Data_Get_Struct( self, NWordTagger, tagger );

  Check_Type( tagarr, T_ARRAY );

  std::set<std::string> tags;
  const long len = RARRAY_LEN( tagarr );
  for( long i = 0; i < len; ++i ){
    VALUE entry = rb_ary_entry( tagarr, i );
    // type-checks the element before its bytes are read
    tags.insert( std::string( StringValueCStr( entry ) ) );
  }

  tagger->loadTags( tags );
  return Qnil;
}
|
58
|
+
|
59
|
+
/**
 * Free callback registered through Data_Wrap_Struct in Tagger_alloc:
 * destroys the NWordTagger that Tagger_alloc created with `new`.
 */
static void Tagger_free( NWordTagger *tagger )
{
  delete tagger;
}
|
63
|
+
|
64
|
+
/**
 * Allocation function for the Ruby class (registered in Init_rtagger).
 *
 * Creates a fresh NWordTagger and wraps it in a Ruby object of class `klass`
 * with no mark function and Tagger_free as the free function.
 *
 * @param klass  class the new object is created for
 * @return       the wrapping Ruby object
 */
VALUE Tagger_alloc(VALUE klass)
{
  NWordTagger *instance = new NWordTagger();
  return Data_Wrap_Struct( klass, NULL, Tagger_free, instance );
}
|
72
|
+
|
73
|
+
extern "C" void Init_rtagger()
|
74
|
+
{
|
75
|
+
rb_Tagger = rb_define_module( "Tagger" );
|
76
|
+
rb_NWordTagger = rb_define_class_under( rb_Tagger, "NWordTagger", rb_cObject );
|
77
|
+
|
78
|
+
rb_define_alloc_func( rb_NWordTagger, Tagger_alloc );
|
79
|
+
|
80
|
+
rb_define_method( rb_NWordTagger, "load_tags", (VALUE (*)(...))Tagger_load_tags, 1 );
|
81
|
+
rb_define_method( rb_NWordTagger, "execute", (VALUE (*)(...))Tagger_execute, 1 );
|
82
|
+
rb_define_method( rb_NWordTagger, "set_words", (VALUE (*)(...))Tagger_set_words, 1 );
|
83
|
+
}
|
@@ -0,0 +1,153 @@
|
|
1
|
+
#include <ctype.h>
|
2
|
+
#include "tagger.h"
|
3
|
+
#include <set>
|
4
|
+
#include <algorithm>
|
5
|
+
#include <sstream>
|
6
|
+
#include <iterator>
|
7
|
+
#include <string>
|
8
|
+
#include <vector>
|
9
|
+
#include "porter_stemmer.h"
|
10
|
+
|
11
|
+
// Strict weak ordering for (tag, count) pairs: orders by count only, lowest
// count first.  The tag text takes no part in the comparison.
struct WordComparitor
{
  bool operator()(const std::pair<std::string,int> &lhs, const std::pair<std::string,int> &rhs) const
  {
    const int left_count = lhs.second;
    const int right_count = rhs.second;
    return right_count > left_count;
  }
};
|
18
|
+
|
19
|
+
|
20
|
+
// from http://www.thescripts.com/forum/thread167600.html
//
// Split `s` into words.  Every character that isalpha() rejects (digits,
// punctuation, whitespace, ...) is treated as a separator, so the returned
// words consist of alphabetic characters only and are never empty.
// isalpha() follows the current C locale.
static std::vector<std::string> word_split(const std::string& s)
{
  std::string words = s;
  // convert all non alpha characters to spaces
  for( size_t i = 0; i < words.length(); ++i ) {
    // isalpha() is only defined for EOF and values representable as unsigned
    // char; a plain char can be negative for bytes >= 0x80
    if( !isalpha( static_cast<unsigned char>( words[i] ) ) ) {
      words[i] = ' ';
    }
  }

  // operator>> on a stream skips runs of whitespace, yielding one word per read
  std::istringstream is(words);
  return std::vector<std::string>(std::istream_iterator<std::string>(is), std::istream_iterator<std::string>());
}
|
35
|
+
|
36
|
+
// Start with a look-ahead window of 2 words and a stemmer obtained from
// porter_stemmer_new(); the destructor hands the stemmer back.
NWordTagger::NWordTagger()
  : nwords( 2 ),
    stemmer( porter_stemmer_new() )
{
}
|
39
|
+
// Give the stemmer acquired in the constructor back to the stemmer library.
NWordTagger::~NWordTagger()
{
  porter_stemmer_free( this->stemmer );
}
|
42
|
+
void NWordTagger::loadTags( const std::set<std::string> &tags )
|
43
|
+
{
|
44
|
+
for( std::set<std::string>::iterator i = tags.begin(); i != tags.end(); ++i ){
|
45
|
+
std::string stemmed, word = std::string(*i);
|
46
|
+
std::vector<std::string> words = word_split( *i );
|
47
|
+
//printf( "word: %s\n", word.c_str() );
|
48
|
+
|
49
|
+
if( words.size() > 1 ){
|
50
|
+
for( size_t j = 0; j < words.size(); ++j ){
|
51
|
+
stemmed += this->stemWord(words[j]) + " ";
|
52
|
+
}
|
53
|
+
stemmed = stemmed.substr(0,stemmed.length()-1);
|
54
|
+
this->tags[stemmed] = word;
|
55
|
+
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
56
|
+
}
|
57
|
+
else{
|
58
|
+
stemmed = this->stemWord(*i);
|
59
|
+
//printf( "word: %s -> %s\n", word.c_str(), stemmed.c_str() );
|
60
|
+
this->tags[stemmed] = word;
|
61
|
+
}
|
62
|
+
|
63
|
+
}
|
64
|
+
}
|
65
|
+
std::string NWordTagger::stemWord( const std::string &word )const
|
66
|
+
{
|
67
|
+
std::string stemmed;
|
68
|
+
char *transition_buffer = strdup( word.c_str() );
|
69
|
+
stemmed = word.substr(0,porter_stem(this->stemmer, transition_buffer, word.length()-1 )+1);
|
70
|
+
free( transition_buffer );
|
71
|
+
return stemmed;
|
72
|
+
}
|
73
|
+
|
74
|
+
std::vector<std::string> NWordTagger::execute( const char *text, short max )const
|
75
|
+
{
|
76
|
+
int max_count = 0;
|
77
|
+
std::vector<std::string> words = word_split( text );
|
78
|
+
std::map<std::string, int> matched_tags; // stores tags and frequency
|
79
|
+
std::string match_word;
|
80
|
+
std::map<std::string,std::string>::const_iterator matched;
|
81
|
+
|
82
|
+
// loop over the words stemming each word
|
83
|
+
for( size_t i = 0; i < words.size(); ++i ) {
|
84
|
+
|
85
|
+
// get the stemmed word at position i
|
86
|
+
match_word = this->stemWord(words[i]);
|
87
|
+
|
88
|
+
// now scan ahead nwords positions searching our tags table for matches
|
89
|
+
for( short j = 1; (j <= this->nwords) && ((i+j) < words.size()); ++j ) {
|
90
|
+
matched = this->tags.find( match_word );
|
91
|
+
if( matched != this->tags.end() ){
|
92
|
+
//printf( "word: %d:(%s->%s)\n", i, match_word.c_str(), matched->second.c_str() );
|
93
|
+
std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
|
94
|
+
if( mloc == matched_tags.end() ) {
|
95
|
+
matched_tags[matched->second] = 1; // count 1
|
96
|
+
}
|
97
|
+
else {
|
98
|
+
mloc->second++;
|
99
|
+
if( max_count < mloc->second ) { max_count = mloc->second; }
|
100
|
+
}
|
101
|
+
}
|
102
|
+
// stem each word and compare against our tag bank
|
103
|
+
//printf( "window: %ld:%lu\n", i,(i+j) );
|
104
|
+
match_word += " " + this->stemWord(words[i+j]);
|
105
|
+
}
|
106
|
+
|
107
|
+
matched = this->tags.find( match_word );
|
108
|
+
if( matched != this->tags.end() ) {
|
109
|
+
//printf( "word: %ld:(%s->%s)\n", i, words[i].c_str(), match_word.c_str() );
|
110
|
+
std::map<std::string, int>::iterator mloc = matched_tags.find( matched->second );
|
111
|
+
if( mloc == matched_tags.end() ) {
|
112
|
+
matched_tags[matched->second] = 1; // count 1
|
113
|
+
}
|
114
|
+
else {
|
115
|
+
mloc->second++;
|
116
|
+
if( max_count < mloc->second ) { max_count = mloc->second; }
|
117
|
+
}
|
118
|
+
}
|
119
|
+
}
|
120
|
+
|
121
|
+
// now that we have all the matched tags reduce to max using the tag frequency as a reduction measure
|
122
|
+
std::vector< std::pair<std::string,int> > sorted_tags;
|
123
|
+
|
124
|
+
//printf( "max frequency: %d\n", max_count );
|
125
|
+
for( std::map<std::string, int>::iterator mloc = matched_tags.begin(); mloc != matched_tags.end(); ++mloc ){
|
126
|
+
//printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
127
|
+
sorted_tags.push_back(*mloc);
|
128
|
+
}
|
129
|
+
|
130
|
+
// sort the tags in frequency order
|
131
|
+
std::sort( sorted_tags.begin(), sorted_tags.end(), WordComparitor() );
|
132
|
+
|
133
|
+
std::vector< std::string > reduced_tags;
|
134
|
+
|
135
|
+
std::vector< std::pair<std::string, int> >::iterator mloc;
|
136
|
+
do {
|
137
|
+
for(mloc = sorted_tags.begin(); mloc != sorted_tags.end(); ++mloc ) {
|
138
|
+
std::pair< std::string, int > word_freq = *mloc;
|
139
|
+
// printf( "word: %s, frequency: %d\n", word_freq.first.c_str(), word_freq.second );
|
140
|
+
//printf( "word: %s, frequency: %d\n", mloc->first.c_str(), mloc->second );
|
141
|
+
if( word_freq.second < max_count ) {
|
142
|
+
sorted_tags.erase( mloc );
|
143
|
+
break;
|
144
|
+
}
|
145
|
+
}
|
146
|
+
} while( sorted_tags.size() > (size_t)max && mloc != sorted_tags.end() );
|
147
|
+
|
148
|
+
for( size_t i = 0; i < sorted_tags.size(); ++i ) {
|
149
|
+
reduced_tags.push_back( sorted_tags[i].first );
|
150
|
+
}
|
151
|
+
|
152
|
+
return reduced_tags;
|
153
|
+
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#ifndef NWORD_TAGGER_H
|
2
|
+
#define NWORD_TAGGER_H
|
3
|
+
#include <set>
|
4
|
+
#include <map>
|
5
|
+
#include <string>
|
6
|
+
#include <vector>
|
7
|
+
|
8
|
+
// Tags text with the entries of a user-supplied tag list, matching on stemmed
// words and on phrases of several consecutive words.
struct NWordTagger {
  NWordTagger();
  ~NWordTagger();

  // Register the tags that execute() should look for.
  void loadTags( const std::set<std::string> &tags );

  // Number of following words that are appended to a word when phrases are
  // built in execute().
  short getNWords()const{ return nwords; }
  void setNWords( short words ){ nwords = words; }

  // Return the loaded tags found in `text`; `max` is the number of tags the
  // result is reduced towards.
  std::vector<std::string> execute( const char *text, short max = 10 )const;
private:
  // The object owns `stemmer` and releases it in the destructor, so a
  // member-wise copy would release it twice.  Copying is therefore disabled
  // (declared, never defined).
  NWordTagger( const NWordTagger & );
  NWordTagger &operator=( const NWordTagger & );

  short nwords;
  struct stemmer *stemmer;
  std::map<std::string,std::string> tags; // stemmed phrase -> original tag text
  std::vector<std::string> words;

  std::string stemWord( const std::string &word )const;
};
|
26
|
+
|
27
|
+
#endif
|
@@ -0,0 +1,22 @@
|
|
1
|
+
CFLAGS=-g -Wall #-fprofile-arcs -ftest-coverage

# none of these targets produce a file of the same name
.PHONY: all setup clean

# build the test driver and run it
all: porter_stemmer.o tagger.o test.o
	g++ $(CFLAGS) -o testit $^
	time ./testit doc.txt
#	gcov -b tagger.cc
#	valgrind --leak-check=full ./testit
test.o: test.cc tagger.h
	g++ $(CFLAGS) test.cc -c
tagger.o: tagger.cc tagger.h porter_stemmer.h
	g++ $(CFLAGS) tagger.cc -c
porter_stemmer.o: porter_stemmer.c porter_stemmer.h
	gcc $(CFLAGS) porter_stemmer.c -c

# link the sources from the parent directory; -f lets setup be run again
setup:
	ln -sf ../tagger.h
	ln -sf ../tagger.cc
	ln -sf ../porter_stemmer.h
	ln -sf ../porter_stemmer.c

# remove every object file built above, not just two of them
clean:
	rm -f porter_stemmer.o tagger.o test.o testit
|
@@ -0,0 +1,87 @@
|
|
1
|
+
Allergies are diseases of the immune system that cause an overreaction to
|
2
|
+
substances called "allergens." Allergies are grouped by the kind of trigger,
|
3
|
+
time of year or where symptoms appear on the body: indoor andoutdoor allergies
|
4
|
+
(also called "hay fever," "seasonal," "perennial" or "nasal" allergies), food
|
5
|
+
and drug allergies, latex allergies, insect allergies, skin allergies and eye
|
6
|
+
allergies. People who have allergies can live healthy and active lives.
|
7
|
+
|
8
|
+
movieMultimedia Allergy Library
|
9
|
+
GlossaryGlossary of Allergy Terms
|
10
|
+
|
11
|
+
What are Allergies | What Causes Allergies | Diagnosis | Treatment |
|
12
|
+
Prevention |
|
13
|
+
|
14
|
+
What Causes Allergies
|
15
|
+
|
16
|
+
The substances that cause allergic disease in people are known as allergens.
|
17
|
+
"Antigens," or protein particles like pollen, food or dander enter our bodies
|
18
|
+
through a variety of ways. If the antigen causes an allergic reaction, that
|
19
|
+
particle is considered an "allergen" – and antigen that triggers an allergic
|
20
|
+
reaction. These allergens can get into our body in several ways:
|
21
|
+
|
22
|
+
* Inhaled into the nose and the lungs. Examples are airborne pollens of
|
23
|
+
* certain trees, grasses and weeds; house dust that include dust mite
|
24
|
+
* particles, mold spores, cat and dog dander and latex dust.
|
25
|
+
* Ingested by mouth. Frequent culprits include shrimp, peanuts and
|
26
|
+
* other nuts.
|
27
|
+
* Injected. Such as medications delivered by needle like
|
28
|
+
* penicillin or other injectable drugs, and venom from insect
|
29
|
+
* stings and bites.
|
30
|
+
* Absorbed through the skin. Plants such as poison ivy, sumac
|
31
|
+
* and oak and latex are examples.
|
32
|
+
|
33
|
+
What Makes Some Pollen Cause Allergies, and Not Others?
|
34
|
+
|
35
|
+
Plant pollens that are carried by the wind cause most
|
36
|
+
allergies of the nose, eyes and lungs. These plants (including
|
37
|
+
certain weeds, trees and grasses) are natural pollutants
|
38
|
+
produced at various times of the year when their small,
|
39
|
+
inconspicuous flowers discharge literally billions of pollen
|
40
|
+
particles.
|
41
|
+
|
42
|
+
Because the particles can be carried significant distances, it
|
43
|
+
is important for you not only to understand local
|
44
|
+
environmental conditions, but also conditions over the broader
|
45
|
+
area of the state or region in which you live. Unlike the
|
46
|
+
wind-pollinated plants, conspicuous wild flowers or flowers
|
47
|
+
used in most residential gardens are pollinated by bees,
|
48
|
+
wasps, and other insects and therefore are not widely capable
|
49
|
+
of producing allergic disease.
|
50
|
+
|
51
|
+
What is the Role of Heredity in Allergy?
|
52
|
+
|
53
|
+
Like baldness, height and eye color, the capacity to become
|
54
|
+
allergic is an inherited characteristic. Yet, although you may
|
55
|
+
be born with the genetic capability to become allergic, you
|
56
|
+
are not automatically allergic to specific allergens. Several
|
57
|
+
factors must be present for allergic sensitivity to be
|
58
|
+
developed:
|
59
|
+
|
60
|
+
* The specific genes acquired from parents.
|
61
|
+
* The exposure to one or more allergens to which you
|
62
|
+
* have a genetically programmed response.
|
63
|
+
* The degree and length of exposure.
|
64
|
+
|
65
|
+
A baby born with the tendency to become allergic
|
66
|
+
to cow's milk, for example, may show allergic
|
67
|
+
symptoms several months after birth. A genetic
|
68
|
+
capability to become allergic to cat dander may
|
69
|
+
take three to four years of cat exposure before
|
70
|
+
the person shows symptoms. These people may also
|
71
|
+
become allergic to other environmental substances
|
72
|
+
with age.
|
73
|
+
|
74
|
+
On the other hand, poison ivy allergy (contact
|
75
|
+
dermatitis) is an example of an allergy in which
|
76
|
+
hereditary background does not play a part. The
|
77
|
+
person with poison ivy allergy first has to be
|
78
|
+
exposed to the oil from the plant. This usually
|
79
|
+
occurs during youth, when a rash does not always
|
80
|
+
appear. However, the first exposure may sensitize
|
81
|
+
or cause the person to become allergic and, when
|
82
|
+
subsequent exposure takes place, a contact
|
83
|
+
dermatitis rash appears and can be quite severe.
|
84
|
+
Many plants are capable of producing this type of
|
85
|
+
rash. Substances other than plants, such as dyes,
|
86
|
+
metals, and chemicals in deodorants and cosmetics,
|
87
|
+
can also cause a similar dermatitis.
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <ctype.h>
|
3
|
+
#include <sys/stat.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#include "tagger.h"
|
6
|
+
|
7
|
+
struct TestBase {
|
8
|
+
TestBase();
|
9
|
+
void overflow();
|
10
|
+
void small();
|
11
|
+
void large_file();
|
12
|
+
|
13
|
+
std::set<std::string> tags;
|
14
|
+
NWordTagger tagger;
|
15
|
+
};
|
16
|
+
|
17
|
+
|
18
|
+
// Build the fixture: collect the tag list, set a 4-word look-ahead window and
// load the tags into the tagger.
TestBase::TestBase()
{
  static const char *const fixture_tags[] = {
    "fitness",
    "delightful",
    "dreaming",
    "dreaming of their world",
    "names",
    "places",
    "diabetes",
    "sugars",
    "allergy",
    "dermatitis"
  };
  const size_t fixture_count = sizeof(fixture_tags) / sizeof(fixture_tags[0]);

  for( size_t k = 0; k < fixture_count; ++k ){
    tags.insert( std::string( fixture_tags[k] ) );
  }

  tagger.setNWords( 4 );

  tagger.loadTags( tags );
}
|
35
|
+
|
36
|
+
void TestBase::overflow()
|
37
|
+
{
|
38
|
+
// input passed to the filter should be processed to downcase, and remove all punctionation
|
39
|
+
std::string words( "hello fitness fitness fitness party dreaming dreaming of their world how are you all doing today so many times I ve seen or heard a delightful story or tales" );
|
40
|
+
std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
|
41
|
+
|
42
|
+
printf( "%ld tags\n", (long int)matched_tags.size() );
|
43
|
+
for( size_t i = 0; i < matched_tags.size(); ++i ){
|
44
|
+
printf( "tagged: %s\n", matched_tags[i].c_str() );
|
45
|
+
}
|
46
|
+
|
47
|
+
assert( matched_tags.size() == 2 );
|
48
|
+
assert( matched_tags[0] == "dreaming" );
|
49
|
+
assert( matched_tags[1] == "fitness" );
|
50
|
+
}
|
51
|
+
|
52
|
+
void TestBase::small()
|
53
|
+
{
|
54
|
+
// input passed to the filter should be processed to downcase, and remove all punctionation
|
55
|
+
std::string words( "nothing to see here" );
|
56
|
+
std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 2 );
|
57
|
+
|
58
|
+
assert( matched_tags.size() == 0 );
|
59
|
+
}
|
60
|
+
|
61
|
+
void TestBase::large_file()
|
62
|
+
{
|
63
|
+
// input passed to the filter should be processed to downcase, and remove all punctionation
|
64
|
+
std::string words;
|
65
|
+
FILE *in = fopen("doc.txt","r");
|
66
|
+
struct stat s;
|
67
|
+
char *buffer = NULL;
|
68
|
+
fstat( fileno(in), &s );
|
69
|
+
buffer = (char*)malloc(sizeof(char)*(s.st_size+1));
|
70
|
+
memset(buffer,'\0',s.st_size+1);
|
71
|
+
fread( buffer, sizeof(char), s.st_size, in );
|
72
|
+
words = buffer;
|
73
|
+
free(buffer);
|
74
|
+
fclose(in);
|
75
|
+
|
76
|
+
std::vector<std::string> matched_tags = tagger.execute( words.c_str(), 10 );
|
77
|
+
|
78
|
+
printf( "%ld tags\n", (long int)matched_tags.size() );
|
79
|
+
for( size_t i = 0; i < matched_tags.size(); ++i ){
|
80
|
+
printf( "tagged: %s\n", matched_tags[i].c_str() );
|
81
|
+
}
|
82
|
+
|
83
|
+
assert( matched_tags.size() == 2 );
|
84
|
+
assert( matched_tags[0] == "allergy" );
|
85
|
+
assert( matched_tags[1] == "dermatitis" );
|
86
|
+
}
|
87
|
+
|
88
|
+
static void test_run()
|
89
|
+
{
|
90
|
+
TestBase test;
|
91
|
+
|
92
|
+
for( int i = 0; i < 10; ++i ) {
|
93
|
+
test.overflow();
|
94
|
+
test.small();
|
95
|
+
test.large_file();
|
96
|
+
}
|
97
|
+
|
98
|
+
}
|
99
|
+
|
100
|
+
int main()
|
101
|
+
{
|
102
|
+
// running multiple iterations to test for memory leaks
|
103
|
+
for( int i = 0; i < 2; ++i ) {
|
104
|
+
test_run();
|
105
|
+
}
|
106
|
+
return 0;
|
107
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
if $0 == __FILE__
  require 'test/unit'
  require 'tagger'

  # Exercises the tagger through its Ruby interface.  One tagger is created
  # on first use, kept in $tagger and shared by all test methods.
  class NWordTest < Test::Unit::TestCase

    # Create and configure the shared tagger unless it already exists.
    def setup
      return if defined?($tagger)

      tag_list = File.read('../../tags.txt').split("\n")
      $tagger = Tagger::SimpleTagger.new
      $tagger.load_tags(tag_list)
      $tagger.set_words(4)
    end

    # Punctuation and digits inside the text must not prevent matches.
    def test_basic
      started_at = Time.now
      sample = "This is a sa'mple doc[]ument lets see how cancer ngrams 4 works out for this interesting text!"
      found = $tagger.execute(sample)
      assert_equal ['cancer','work'], found
      puts "Duration: #{Time.now - started_at} sec"
    end

    # A tag made of three words is matched as one phrase.
    def test_ngram_size3
      started_at = Time.now
      sample = "This body of text contains something like ventricular septal defect"
      found = $tagger.execute(sample)
      assert_equal ['ventricular septal defect'], found
      puts "Duration: #{Time.now - started_at} sec"
    end
  end
end
|