rbtagger 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/COPYING +21 -0
  2. data/History.txt +4 -0
  3. data/LICENSE +21 -0
  4. data/License.txt +20 -0
  5. data/Manifest.txt +75 -0
  6. data/PostInstall.txt +7 -0
  7. data/README +7 -0
  8. data/README.txt +53 -0
  9. data/Rakefile +33 -0
  10. data/config/hoe.rb +74 -0
  11. data/config/requirements.rb +15 -0
  12. data/ext/rule_tagger/bool.h +38 -0
  13. data/ext/rule_tagger/darray.c +292 -0
  14. data/ext/rule_tagger/darray.h +125 -0
  15. data/ext/rule_tagger/darrayP.h +50 -0
  16. data/ext/rule_tagger/extconf.rb +14 -0
  17. data/ext/rule_tagger/lex.c +170 -0
  18. data/ext/rule_tagger/lex.h +49 -0
  19. data/ext/rule_tagger/memory.c +127 -0
  20. data/ext/rule_tagger/memory.h +20 -0
  21. data/ext/rule_tagger/rbtagger.c +252 -0
  22. data/ext/rule_tagger/registry.c +326 -0
  23. data/ext/rule_tagger/registry.h +129 -0
  24. data/ext/rule_tagger/registryP.h +46 -0
  25. data/ext/rule_tagger/ruby-compat.h +20 -0
  26. data/ext/rule_tagger/rules.c +525 -0
  27. data/ext/rule_tagger/rules.h +42 -0
  28. data/ext/rule_tagger/sysdep.h +20 -0
  29. data/ext/rule_tagger/tagger.c +110 -0
  30. data/ext/rule_tagger/tagger.h +46 -0
  31. data/ext/rule_tagger/useful.c +44 -0
  32. data/ext/rule_tagger/useful.h +51 -0
  33. data/ext/word_tagger/extconf.rb +7 -0
  34. data/ext/word_tagger/porter_stemmer.c +430 -0
  35. data/ext/word_tagger/porter_stemmer.h +19 -0
  36. data/ext/word_tagger/rtagger.cc +83 -0
  37. data/ext/word_tagger/tagger.cc +153 -0
  38. data/ext/word_tagger/tagger.h +27 -0
  39. data/ext/word_tagger/tagger.rb +8 -0
  40. data/ext/word_tagger/test/Makefile +22 -0
  41. data/ext/word_tagger/test/doc.txt +87 -0
  42. data/ext/word_tagger/test/test.cc +107 -0
  43. data/ext/word_tagger/test.rb +31 -0
  44. data/lib/brill/tagger.rb +225 -0
  45. data/lib/rbtagger/version.rb +9 -0
  46. data/lib/rbtagger.rb +6 -0
  47. data/script/console +10 -0
  48. data/script/destroy +14 -0
  49. data/script/generate +14 -0
  50. data/script/txt2html +82 -0
  51. data/setup.rb +1585 -0
  52. data/tasks/deployment.rake +34 -0
  53. data/tasks/environment.rake +7 -0
  54. data/tasks/website.rake +17 -0
  55. data/test/CONTEXTUALRULEFILE +284 -0
  56. data/test/LEXICALRULEFILE +148 -0
  57. data/test/LEXICON +93696 -0
  58. data/test/docs/doc0.txt +20 -0
  59. data/test/docs/doc1.txt +11 -0
  60. data/test/docs/doc2.txt +52 -0
  61. data/test/docs/doc3.txt +128 -0
  62. data/test/docs/doc4.txt +337 -0
  63. data/test/docs/doc5.txt +497 -0
  64. data/test/docs/doc6.txt +116 -0
  65. data/test/docs/doc7.txt +101 -0
  66. data/test/docs/doc8.txt +25 -0
  67. data/test/docs/doc9.txt +84 -0
  68. data/test/tagger_test.rb +60 -0
  69. data/test/test_helper.rb +2 -0
  70. data/tools/rakehelp.rb +113 -0
  71. data/website/index.html +113 -0
  72. data/website/index.txt +53 -0
  73. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  74. data/website/stylesheets/screen.css +138 -0
  75. data/website/template.html.erb +48 -0
  76. metadata +155 -0
@@ -0,0 +1,110 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include "tagger.h"
4
+
5
+ TaggerContext *tagger_context_new()
6
+ {
7
+ return tagger_context_new_with_lexicon_size_hint( 94000 );
8
+ }
9
+ TaggerContext *tagger_context_new_with_lexicon_size_hint(int lexicon_size)
10
+ {
11
+ TaggerContext *tc = (TaggerContext*)malloc(sizeof(TaggerContext));
12
+
13
+ tc->lexicon_hash = Registry_create(Registry_strcmp, Registry_strhash);
14
+ tc->lexicon_tag_hash= Registry_create(Registry_strcmp, Registry_strhash);
15
+ tc->ntot_hash = Registry_create(Registry_strcmp, Registry_strhash);
16
+ tc->good_right_hash = Registry_create(Registry_strcmp, Registry_strhash);
17
+ tc->good_left_hash = Registry_create(Registry_strcmp, Registry_strhash);
18
+ tc->bigram_hash = Registry_create(Registry_strcmp, Registry_strhash);
19
+ tc->wordlist_hash = Registry_create(Registry_strcmp, Registry_strhash);
20
+
21
+ tc->rule_array = Darray_create();
22
+ tc->contextual_rule_array = Darray_create();
23
+
24
+ Registry_size_hint(tc->lexicon_hash, lexicon_size);
25
+ Registry_size_hint(tc->lexicon_tag_hash, lexicon_size);
26
+
27
+ return tc;
28
+ }
29
+
30
/*
 * Registry_traverse() callback used by free_registry(): releases one entry.
 * The key is always freed; the value is freed unless it is the (void *)1
 * marker that the add_* functions store in place of a real allocation.
 * The third argument is unused.
 */
static void free_registry_entry(void *key, void *value, void *other)
{
  const int value_is_marker = (value == (void *)1);

  free(key);
  if (!value_is_marker) {
    free(value);
  }
}
37
+
38
+ /* Destroy a Registry whose keys & values have been allocated */
39
+ static void free_registry(Registry r)
40
+ {
41
+ Registry_traverse(r, free_registry_entry, NULL);
42
+ Registry_destroy(r);
43
+ }
44
+
45
+ /* Destroy the memory allocated to one of the rule arrays */
46
+ static void free_rule_array(Darray a)
47
+ {
48
+ int i;
49
+ int t = Darray_len(a);
50
+ for (i=0; i<t; i++) {
51
+ rule_destroy(Darray_get(a, i));
52
+ }
53
+ Darray_destroy(a);
54
+ }
55
+
56
+ void tagger_context_free( TaggerContext *tc )
57
+ {
58
+ free_registry(tc->lexicon_hash);
59
+ free_registry(tc->lexicon_tag_hash);
60
+ free_registry(tc->ntot_hash);
61
+ free_registry(tc->good_right_hash);
62
+ free_registry(tc->good_left_hash);
63
+ free_registry(tc->bigram_hash);
64
+ free_registry(tc->wordlist_hash);
65
+
66
+ free_rule_array(tc->rule_array);
67
+ free_rule_array(tc->contextual_rule_array);
68
+ }
69
+
70
+ int tagger_context_add_to_lexicon( TaggerContext *tc, const char *word, const char *tag )
71
+ {
72
+ VOIDP wp = (VOIDP)strdup(word);
73
+ VOIDP tp = (VOIDP)strdup(tag); // XXX: separate lines for valgrind memory leak reporting
74
+ return Registry_add(tc->lexicon_hash, wp, tp);
75
+ }
76
+
77
+ int tagger_context_add_to_lexicon_tags( TaggerContext *tc, const char *bigram )
78
+ {
79
+ return Registry_add(tc->lexicon_tag_hash, (VOIDP)strdup(bigram), (VOIDP)1);
80
+ }
81
+
82
+ void tagger_context_add_lexical_rule( TaggerContext *tc, const char *rule )
83
+ {
84
+ trans_rule *r = parse_lexical_rule(rule);
85
+ Darray_addh(tc->rule_array, r);
86
+ }
87
+
88
+ void tagger_context_add_contextual_rule( TaggerContext *tc, const char *rule )
89
+ {
90
+ trans_rule *r = parse_contextual_rule(rule);
91
+ Darray_addh(tc->contextual_rule_array, r);
92
+ }
93
+
94
+ int tagger_context_add_word_to_wordlist( TaggerContext *tc, const char *word )
95
+ {
96
+ VOIDP pw = (VOIDP)strdup(word);
97
+ return Registry_add(tc->wordlist_hash, pw, (VOIDP)1);
98
+ }
99
+
100
+ int tagger_context_add_goodleft( TaggerContext *tc, const char *word )
101
+ {
102
+ VOIDP pw = (VOIDP)strdup(word);
103
+ return Registry_add(tc->good_left_hash, pw, (VOIDP)1);
104
+ }
105
+
106
+ int tagger_context_add_goodright( TaggerContext *tc, const char *word )
107
+ {
108
+ VOIDP pw = (VOIDP)strdup(word);
109
+ return Registry_add(tc->good_right_hash, pw, (VOIDP)1);
110
+ }
@@ -0,0 +1,46 @@
1
+ #ifndef C_BRILL_TAGGER_H
2
+ #define C_BRILL_TAGGER_H
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ #include "lex.h"
9
+ #include "darray.h"
10
+ #include "registry.h"
11
+ #include "memory.h"
12
+ #include "useful.h"
13
+ #include "rules.h"
14
+
15
+ typedef struct _TaggerContext{
16
+ Registry lexicon_hash;
17
+ Registry lexicon_tag_hash;
18
+ Registry good_right_hash;
19
+ Registry good_left_hash;
20
+ Registry ntot_hash;
21
+ Registry bigram_hash;
22
+ Registry wordlist_hash;
23
+
24
+ Darray rule_array;
25
+ Darray contextual_rule_array;
26
+ } TaggerContext;
27
+
28
+ TaggerContext *tagger_context_new();
29
+ TaggerContext *tagger_context_new_with_lexicon_size_hint(int lexicon_size);
30
+ void tagger_context_free( TaggerContext *tc );
31
+
32
+ int tagger_context_add_to_lexicon( TaggerContext *tc, const char *word, const char *tag );
33
+ int tagger_context_add_to_lexicon_tags( TaggerContext *tc, const char *bigram );
34
+ void tagger_context_add_lexical_rule( TaggerContext *tc, const char *rule );
35
+ void tagger_context_add_contextual_rule( TaggerContext *tc, const char *rule );
36
+ int tagger_context_add_word_to_wordlist( TaggerContext *tc, const char *word );
37
+ int tagger_context_add_goodleft( TaggerContext *tc, const char *word );
38
+ int tagger_context_add_goodright( TaggerContext *tc, const char *word );
39
+
40
+ //void tagger_context_apply_lexical_rules( TaggerContext *tc, const char *word );
41
+
42
+ #ifdef __cplusplus
43
+ }
44
+ #endif
45
+
46
+ #endif
@@ -0,0 +1,44 @@
1
+ #include <string.h>
2
+ #include <stdlib.h>
3
+ #include "useful.h"
4
+
5
/*
 * Return a freshly malloc()ed copy of the NUL-terminated string thestr,
 * or NULL when the allocation fails.  The caller owns the copy and must
 * free() it.
 */
char *mystrdup(const char *thestr)
{
  size_t size = strlen(thestr) + 1;   /* include the terminator */
  char *copy = malloc(size);

  if (copy == NULL)
    return NULL;
  return memcpy(copy, thestr, size);
}
11
+
12
/*
 * Make sure we are not processing a no-character line.
 * Returns 1 when thestr contains at least one character other than space,
 * tab or newline, and 0 otherwise (including for the empty string).
 */
int not_just_blank(char *thestr)
{
  /* strspn() gives the length of the leading run of blank characters;
     if that run reaches the terminator the whole line is blank. */
  return thestr[strspn(thestr, " \t\n")] != '\0';
}
25
+
26
/*
 * Count the gaps between words in thestr, where words are separated by runs
 * of spaces and/or tabs (newline is NOT a separator).  Leading and trailing
 * blanks are ignored.
 *
 * Note that the result is one less than the number of words: "a b c" gives
 * 2, while "a", "" and an all-blank string all give 0.  This matches the
 * historical behaviour of this function.
 */
int num_words(char *thestr)
{
  const char *p = thestr + strspn(thestr, " \t");   /* skip leading blanks */
  int gaps = 0;

  while (*p != '\0') {
    p += strcspn(p, " \t");   /* step over the current word */
    p += strspn(p, " \t");    /* step over the blanks that follow it */
    if (*p != '\0')
      ++gaps;                 /* only a gap if another word follows */
  }
  return gaps;
}
@@ -0,0 +1,51 @@
1
+ #ifndef _USEFUL_H
2
+ #define _USEFUL_H
3
+
4
+ #include "sysdep.h"
5
+
6
/* Fallback definitions, each used only if the name is not already defined. */
#ifndef NULL
#define NULL 0
#endif

/* Boolean constants are plain ints; FALSE must not expand to NULL, which may
   be a pointer constant and then cannot be assigned to an int. */
#ifndef TRUE
#define TRUE 1
#endif

#ifndef FALSE
#define FALSE 0
#endif

#ifndef USHORT
#define USHORT unsigned short
#endif

#ifndef ULONG
#define ULONG unsigned long
#endif
25
+
26
/*
 * Generic comparison helpers.  Every argument is parenthesized so that
 * expressions such as ABS(a - b) or MIN(x | y, z) group as written.
 * Arguments may be evaluated more than once: do not pass expressions with
 * side effects.
 */
#ifndef MAX
#define MAX(x, y) ((x) >= (y) ? (x) : (y))
#endif

#ifndef MIN
#define MIN(x, y) ((x) <= (y) ? (x) : (y))
#endif

#ifndef ABS
#define ABS(x) ((x) < 0 ? (x) * (-1) : (x))
#endif
37
+
38
+ #ifdef __STDC__
39
+ extern char *mystrdup(const char *);
40
+ /* Just a hack around the fact that strdup isn't standard */
41
+ extern int not_just_blank(char *);
42
+ /* make sure we aren't processing a blank line */
43
+ extern int num_words(char *);
44
+ /* returns the number of blank-separated gaps between words, i.e. the word count minus one (0 for an empty or single-word string) */
45
+ #else
46
+ extern char *mystrdup();
47
+ extern int not_just_blank();
48
+ extern int num_words();
49
+ #endif /* __STDC__ */
50
+
51
+ #endif
@@ -0,0 +1,7 @@
1
# Build configuration for the word_tagger native extension.
require 'mkmf'

dir_config "word_tagger"

# Probe for the C and C++ runtime libraries before generating the Makefile.
have_library "c", "main"
have_library "stdc++"

create_makefile "word_tagger"
@@ -0,0 +1,430 @@
1
+
2
+ /* This is the Porter stemming algorithm, coded up as thread-safe ANSI C
3
+ by the author.
4
+
5
+ It may be regarded as canonical, in that it follows the algorithm
6
+ presented in
7
+
8
+ Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
9
+ no. 3, pp 130-137,
10
+
11
+ only differing from it at the points marked --DEPARTURE-- below.
12
+
13
+ See also http://www.tartarus.org/~martin/PorterStemmer
14
+
15
+ The algorithm as described in the paper could be exactly replicated
16
+ by adjusting the points of DEPARTURE, but this is barely necessary,
17
+ because (a) the points of DEPARTURE are definitely improvements, and
18
+ (b) no encoding of the Porter stemmer I have seen is anything like
19
+ as exact as this version, even with the points of DEPARTURE!
20
+
21
+ You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
22
+ 'stem' takes a list of inputs and sends the stemmed equivalent to
23
+ stdout.
24
+
25
+ The algorithm as encoded here is particularly fast.
26
+
27
+ Release 2 (the more old-fashioned, non-thread-safe version may be
28
+ regarded as release 1.)
29
+ */
30
+
31
+ #include <stdlib.h> /* for malloc, free */
32
+ #include <string.h> /* for memcmp, memmove */
33
+ #include "porter_stemmer.h"
34
+
35
+ /* The main part of the stemming algorithm starts here.
36
+ */
37
+
38
+ #define TRUE 1
39
+ #define FALSE 0
40
+
41
+ /* stemmer is a structure for a few local bits of data,
42
+ */
43
+
44
/* Working state for one stemming operation. */
struct stemmer {
   char *b;   /* buffer holding the word to be stemmed */
   int k;     /* offset to the end of the string */
   int j;     /* a general offset into the string */
};
49
+
50
+
51
+ /* Member b is a buffer holding a word to be stemmed. The letters are in
52
+ b[0], b[1] ... ending at b[z->k]. Member k is readjusted downwards as
53
+ the stemming progresses. Zero termination is not in fact used in the
54
+ algorithm.
55
+
56
+ Note that only lower case sequences are stemmed. Forcing to lower case
57
+ should be done before porter_stem(...) is called.
58
+
59
+
60
+ Typical usage is:
61
+
62
+ struct stemmer * z = porter_stemmer_new();
63
+ char b[] = "pencils";
64
+ int res = porter_stem(z, b, 6);
65
+ /- stem the 7 characters of b[0] to b[6]. The result, res,
66
+ will be 5 (the 's' is removed). -/
67
+ porter_stemmer_free(z);
68
+ */
69
+
70
+
71
+ extern struct stemmer * porter_stemmer_new(void)
72
+ {
73
+ return (struct stemmer *) malloc(sizeof(struct stemmer));
74
+ /* assume malloc succeeds */
75
+ }
76
+
77
/* Release a stemmer obtained from porter_stemmer_new(). */
extern void porter_stemmer_free(struct stemmer * z)
{
   free(z);
}
81
+
82
+
83
+ /* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here
84
+ and below we drop 'z->' in comments.
85
+ */
86
+
87
+ static int cons(struct stemmer * z, int i)
88
+ { switch (z->b[i])
89
+ { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
90
+ case 'y': return (i == 0) ? TRUE : !cons(z, i - 1);
91
+ default: return TRUE;
92
+ }
93
+ }
94
+
95
+ /* m(z) measures the number of consonant sequences between 0 and j. if c is
96
+ a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
97
+ presence,
98
+
99
+ <c><v> gives 0
100
+ <c>vc<v> gives 1
101
+ <c>vcvc<v> gives 2
102
+ <c>vcvcvc<v> gives 3
103
+ ....
104
+ */
105
+
106
+ static int m(struct stemmer * z)
107
+ { int n = 0;
108
+ int i = 0;
109
+ int j = z->j;
110
+ while(TRUE)
111
+ { if (i > j) return n;
112
+ if (! cons(z, i)) break; i++;
113
+ }
114
+ i++;
115
+ while(TRUE)
116
+ { while(TRUE)
117
+ { if (i > j) return n;
118
+ if (cons(z, i)) break;
119
+ i++;
120
+ }
121
+ i++;
122
+ n++;
123
+ while(TRUE)
124
+ { if (i > j) return n;
125
+ if (! cons(z, i)) break;
126
+ i++;
127
+ }
128
+ i++;
129
+ }
130
+ }
131
+
132
+ /* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */
133
+
134
+ static int vowelinstem(struct stemmer * z)
135
+ {
136
+ int j = z->j;
137
+ int i; for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE;
138
+ return FALSE;
139
+ }
140
+
141
+ /* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */
142
+
143
+ static int doublec(struct stemmer * z, int j)
144
+ {
145
+ char * b = z->b;
146
+ if (j < 1) return FALSE;
147
+ if (b[j] != b[j - 1]) return FALSE;
148
+ return cons(z, j);
149
+ }
150
+
151
+ /* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
152
+ and also if the second c is not w,x or y. this is used when trying to
153
+ restore an e at the end of a short word. e.g.
154
+
155
+ cav(e), lov(e), hop(e), crim(e), but
156
+ snow, box, tray.
157
+
158
+ */
159
+
160
+ static int cvc(struct stemmer * z, int i)
161
+ { if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE;
162
+ { int ch = z->b[i];
163
+ if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE;
164
+ }
165
+ return TRUE;
166
+ }
167
+
168
+ /* ends(z, s) is TRUE <=> 0,...k ends with the string s. */
169
+
170
+ static int ends(struct stemmer * z, char * s)
171
+ { int length = s[0];
172
+ char * b = z->b;
173
+ int k = z->k;
174
+ if (s[length] != b[k]) return FALSE; /* tiny speed-up */
175
+ if (length > k + 1) return FALSE;
176
+ if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE;
177
+ z->j = k-length;
178
+ return TRUE;
179
+ }
180
+
181
+ /* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting
182
+ k. */
183
+
184
+ static void setto(struct stemmer * z, char * s)
185
+ { int length = s[0];
186
+ int j = z->j;
187
+ memmove(z->b + j + 1, s + 1, length);
188
+ z->k = j+length;
189
+ }
190
+
191
+ /* r(z, s) is used further down. */
192
+
193
/* Apply setto(z, s) only when the stem b[0..j] has measure greater than 0. */
static void r(struct stemmer * z, char * s)
{
   if (m(z) > 0) {
      setto(z, s);
   }
}
194
+
195
+ /* step1ab(z) gets rid of plurals and -ed or -ing. e.g.
196
+
197
+ caresses -> caress
198
+ ponies -> poni
199
+ ties -> ti
200
+ caress -> caress
201
+ cats -> cat
202
+
203
+ feed -> feed
204
+ agreed -> agree
205
+ disabled -> disable
206
+
207
+ matting -> mat
208
+ mating -> mate
209
+ meeting -> meet
210
+ milling -> mill
211
+ messing -> mess
212
+
213
+ meetings -> meet
214
+
215
+ */
216
+
217
+ static void step1ab(struct stemmer * z)
218
+ {
219
+ char * b = z->b;
220
+ if (b[z->k] == 's')
221
+ { if (ends(z, "\04" "sses")) z->k -= 2; else
222
+ if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else
223
+ if (b[z->k - 1] != 's') z->k--;
224
+ }
225
+ if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else
226
+ if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
227
+ { z->k = z->j;
228
+ if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
229
+ if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
230
+ if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
231
+ if (doublec(z, z->k))
232
+ { z->k--;
233
+ { int ch = b[z->k];
234
+ if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
235
+ }
236
+ }
237
+ else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e");
238
+ }
239
+ }
240
+
241
+ /* step1c(z) turns terminal y to i when there is another vowel in the stem. */
242
+
243
+ static void step1c(struct stemmer * z)
244
+ {
245
+ if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i';
246
+ }
247
+
248
+
249
+ /* step2(z) maps double suffices to single ones. so -ization ( = -ize plus
250
+ -ation) maps to -ize etc. note that the string before the suffix must give
251
+ m(z) > 0. */
252
+
253
+ static void step2(struct stemmer * z) { switch (z->b[z->k-1])
254
+ {
255
+ case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; }
256
+ if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; }
257
+ break;
258
+ case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; }
259
+ if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; }
260
+ break;
261
+ case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; }
262
+ break;
263
+ case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/
264
+
265
+ /* To match the published algorithm, replace this line with
266
+ case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */
267
+
268
+ if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; }
269
+ if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; }
270
+ if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; }
271
+ if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; }
272
+ break;
273
+ case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; }
274
+ if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; }
275
+ if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; }
276
+ break;
277
+ case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; }
278
+ if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; }
279
+ if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; }
280
+ if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; }
281
+ break;
282
+ case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; }
283
+ if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; }
284
+ if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; }
285
+ break;
286
+ case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/
287
+
288
+ /* To match the published algorithm, delete this line */
289
+
290
+ } }
291
+
292
+ /* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. */
293
+
294
+ static void step3(struct stemmer * z) { switch (z->b[z->k])
295
+ {
296
+ case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; }
297
+ if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; }
298
+ if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; }
299
+ break;
300
+ case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; }
301
+ break;
302
+ case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; }
303
+ if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; }
304
+ break;
305
+ case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; }
306
+ break;
307
+ } }
308
+
309
+ /* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */
310
+
311
+ static void step4(struct stemmer * z)
312
+ { switch (z->b[z->k-1])
313
+ { case 'a': if (ends(z, "\02" "al")) break; return;
314
+ case 'c': if (ends(z, "\04" "ance")) break;
315
+ if (ends(z, "\04" "ence")) break; return;
316
+ case 'e': if (ends(z, "\02" "er")) break; return;
317
+ case 'i': if (ends(z, "\02" "ic")) break; return;
318
+ case 'l': if (ends(z, "\04" "able")) break;
319
+ if (ends(z, "\04" "ible")) break; return;
320
+ case 'n': if (ends(z, "\03" "ant")) break;
321
+ if (ends(z, "\05" "ement")) break;
322
+ if (ends(z, "\04" "ment")) break;
323
+ if (ends(z, "\03" "ent")) break; return;
324
+ case 'o': if (ends(z, "\03" "ion") && (z->b[z->j] == 's' || z->b[z->j] == 't')) break;
325
+ if (ends(z, "\02" "ou")) break; return;
326
+ /* takes care of -ous */
327
+ case 's': if (ends(z, "\03" "ism")) break; return;
328
+ case 't': if (ends(z, "\03" "ate")) break;
329
+ if (ends(z, "\03" "iti")) break; return;
330
+ case 'u': if (ends(z, "\03" "ous")) break; return;
331
+ case 'v': if (ends(z, "\03" "ive")) break; return;
332
+ case 'z': if (ends(z, "\03" "ize")) break; return;
333
+ default: return;
334
+ }
335
+ if (m(z) > 1) z->k = z->j;
336
+ }
337
+
338
+ /* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
339
+ m(z) > 1. */
340
+
341
+ static void step5(struct stemmer * z)
342
+ {
343
+ char * b = z->b;
344
+ z->j = z->k;
345
+ if (b[z->k] == 'e')
346
+ { int a = m(z);
347
+ if ( (a > 1 || a == 1) && !cvc(z, z->k - 1)) z->k--;
348
+ }
349
+ if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
350
+ }
351
+
352
+ /* In porter_stem(z, b, k), b is a char pointer, and the string to be stemmed is
353
+ from b[0] to b[k] inclusive. Possibly b[k+1] == '\0', but it is not
354
+ important. The stemmer adjusts the characters b[0] ... b[k] and returns
355
+ the new end-point of the string, k'. Stemming never increases word
356
+ length, so 0 <= k' <= k.
357
+ */
358
+
359
+ extern int porter_stem(struct stemmer * z, const char * b, int k)
360
+ {
361
+ if (k <= 1) return k; /*-DEPARTURE-*/
362
+ z->b = (char*)b; z->k = k; /* copy the parameters into z */
363
+
364
+ /* With this line, strings of length 1 or 2 don't go through the
365
+ stemming process, although no mention is made of this in the
366
+ published algorithm. Remove the line to match the published
367
+ algorithm. */
368
+
369
+ step1ab(z); step1c(z); step2(z); step3(z); step4(z); step5(z);
370
+ return z->k;
371
+ }
372
+ #if 0
373
+
374
+ /*--------------------stemmer definition ends here------------------------*/
375
+
376
+ #include <stdio.h>
377
+ #include <stdlib.h> /* for malloc, free */
378
+ #include <ctype.h> /* for isupper, islower, tolower */
379
+
380
+ static char * s; /* buffer for words tobe stemmed */
381
+
382
+ #define INC 50 /* size units in which s is increased */
383
+ static int i_max = INC; /* maximum offset in s */
384
+
385
+ #define LETTER(ch) (isupper(ch) || islower(ch))
386
+
387
+ void stemfile(struct stemmer * z, FILE * f)
388
+ { while(TRUE)
389
+ { int ch = getc(f);
390
+ if (ch == EOF) return;
391
+ if (LETTER(ch))
392
+ { int i = 0;
393
+ while(TRUE)
394
+ { if (i == i_max)
395
+ { i_max += INC;
396
+ s = realloc(s, i_max + 1);
397
+ }
398
+ ch = tolower(ch); /* forces lower case */
399
+
400
+ s[i] = ch; i++;
401
+ ch = getc(f);
402
+ if (!LETTER(ch)) { ungetc(ch,f); break; }
403
+ }
404
+ s[porter_stem(z, s, i - 1) + 1] = 0;
405
+ /* the previous line calls the stemmer and uses its result to
406
+ zero-terminate the string in s */
407
+ printf("%s",s);
408
+ }
409
+ else putchar(ch);
410
+ }
411
+ }
412
+
413
+ int main(int argc, char * argv[])
414
+ { int i;
415
+
416
+ struct stemmer * z = create_stemmer();
417
+
418
+ s = (char *) malloc(i_max + 1);
419
+ for (i = 1; i < argc; i++)
420
+ { FILE * f = fopen(argv[i],"r");
421
+ if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
422
+ stemfile(z, f);
423
+ }
424
+ free(s);
425
+
426
+ free_stemmer(z);
427
+
428
+ return 0;
429
+ }
430
+ #endif