rbtagger 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/COPYING +21 -0
  2. data/History.txt +4 -0
  3. data/LICENSE +21 -0
  4. data/License.txt +20 -0
  5. data/Manifest.txt +75 -0
  6. data/PostInstall.txt +7 -0
  7. data/README +7 -0
  8. data/README.txt +53 -0
  9. data/Rakefile +33 -0
  10. data/config/hoe.rb +74 -0
  11. data/config/requirements.rb +15 -0
  12. data/ext/rule_tagger/bool.h +38 -0
  13. data/ext/rule_tagger/darray.c +292 -0
  14. data/ext/rule_tagger/darray.h +125 -0
  15. data/ext/rule_tagger/darrayP.h +50 -0
  16. data/ext/rule_tagger/extconf.rb +14 -0
  17. data/ext/rule_tagger/lex.c +170 -0
  18. data/ext/rule_tagger/lex.h +49 -0
  19. data/ext/rule_tagger/memory.c +127 -0
  20. data/ext/rule_tagger/memory.h +20 -0
  21. data/ext/rule_tagger/rbtagger.c +252 -0
  22. data/ext/rule_tagger/registry.c +326 -0
  23. data/ext/rule_tagger/registry.h +129 -0
  24. data/ext/rule_tagger/registryP.h +46 -0
  25. data/ext/rule_tagger/ruby-compat.h +20 -0
  26. data/ext/rule_tagger/rules.c +525 -0
  27. data/ext/rule_tagger/rules.h +42 -0
  28. data/ext/rule_tagger/sysdep.h +20 -0
  29. data/ext/rule_tagger/tagger.c +110 -0
  30. data/ext/rule_tagger/tagger.h +46 -0
  31. data/ext/rule_tagger/useful.c +44 -0
  32. data/ext/rule_tagger/useful.h +51 -0
  33. data/ext/word_tagger/extconf.rb +7 -0
  34. data/ext/word_tagger/porter_stemmer.c +430 -0
  35. data/ext/word_tagger/porter_stemmer.h +19 -0
  36. data/ext/word_tagger/rtagger.cc +83 -0
  37. data/ext/word_tagger/tagger.cc +153 -0
  38. data/ext/word_tagger/tagger.h +27 -0
  39. data/ext/word_tagger/tagger.rb +8 -0
  40. data/ext/word_tagger/test/Makefile +22 -0
  41. data/ext/word_tagger/test/doc.txt +87 -0
  42. data/ext/word_tagger/test/test.cc +107 -0
  43. data/ext/word_tagger/test.rb +31 -0
  44. data/lib/brill/tagger.rb +225 -0
  45. data/lib/rbtagger/version.rb +9 -0
  46. data/lib/rbtagger.rb +6 -0
  47. data/script/console +10 -0
  48. data/script/destroy +14 -0
  49. data/script/generate +14 -0
  50. data/script/txt2html +82 -0
  51. data/setup.rb +1585 -0
  52. data/tasks/deployment.rake +34 -0
  53. data/tasks/environment.rake +7 -0
  54. data/tasks/website.rake +17 -0
  55. data/test/CONTEXTUALRULEFILE +284 -0
  56. data/test/LEXICALRULEFILE +148 -0
  57. data/test/LEXICON +93696 -0
  58. data/test/docs/doc0.txt +20 -0
  59. data/test/docs/doc1.txt +11 -0
  60. data/test/docs/doc2.txt +52 -0
  61. data/test/docs/doc3.txt +128 -0
  62. data/test/docs/doc4.txt +337 -0
  63. data/test/docs/doc5.txt +497 -0
  64. data/test/docs/doc6.txt +116 -0
  65. data/test/docs/doc7.txt +101 -0
  66. data/test/docs/doc8.txt +25 -0
  67. data/test/docs/doc9.txt +84 -0
  68. data/test/tagger_test.rb +60 -0
  69. data/test/test_helper.rb +2 -0
  70. data/tools/rakehelp.rb +113 -0
  71. data/website/index.html +113 -0
  72. data/website/index.txt +53 -0
  73. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  74. data/website/stylesheets/screen.css +138 -0
  75. data/website/template.html.erb +48 -0
  76. metadata +155 -0
@@ -0,0 +1,46 @@
1
+ #ifndef _regyP_h_
2
+ #define _regyP_h_
3
+
4
+ #include "memory.h"
5
+ #include "sysdep.h"
6
+ #include "darray.h"
7
+
8
+ #include "registry.h"
9
+
10
+ /* private internal representation of the table */
11
+ typedef struct RegistryRecord_st {
12
+ VOIDP name;
13
+ VOIDP obj;
14
+ struct RegistryRecord_st *next;
15
+ } RegistryRecord;
16
+
17
+ #define DEFAULT_HT_SIZE 97 /* This should be prime */
18
+
19
+ /* The Registry representation */
20
+ typedef struct Registry_st {
21
+ unsigned int ht_size;
22
+ RegistryRecord **hash_table; /* First record of directory */
23
+ Registry_CompareFunc comp_fun; /* Comparison function */
24
+ Registry_HashFunc hash_fun; /* Hash function */
25
+ unsigned int record_count; /* Number of records in the registry */
26
+ } Registry_rep;
27
+
28
+ /* private traversal routine used to implement Registry_fetch_contents() */
29
+ #ifdef __STDC__
30
+ static NORET add_to_darrays(VOIDP, VOIDP, VOIDP);
31
+ #else
32
+ static NORET add_to_darrays();
33
+ #endif
34
+
35
+ /* used when calling add_to_darrays() within Registry_fetch_contents() */
36
+ struct darray_pair {
37
+ Darray key_darray;
38
+ Darray value_darray;
39
+ };
40
+
41
/*
 * Conversions between the public Registry handle and its private
 * representation, plus allocation helpers.
 *
 * Every macro argument is parenthesized so that the cast applies to the
 * whole argument expression, not just to its first operand.
 *
 * NOTE(review): `raise` has the same name as the <signal.h> function; do
 * not include <signal.h> after this header in the same translation unit.
 */
#define raise(p_to_rep)    ((Registry)(p_to_rep))
#define lower(obj)         ((Registry_rep *)(obj))
#define create()           ((Registry_rep *) Memory_allocate(sizeof (Registry_rep)))
#define destroy(p_to_rep)  (Memory_free((VOIDP)(p_to_rep)))
45
+
46
+ #endif
@@ -0,0 +1,20 @@
1
+ #ifndef _RUBY_COMPAT_HEADER_H
2
+ #define _RUBY_COMPAT_HEADER_H
3
+
4
/*
 * Tracing support.  DEBUG is still forced on by this header, but the
 * definition is guarded so that building with -DDEBUG does not trigger a
 * macro redefinition.
 *
 * TRACE() writes "> file:line:function" to stderr.  It uses the standard
 * C99 identifier __func__ instead of the compiler-specific __FUNCTION__,
 * and the disabled form expands to a no-op expression so that `TRACE();`
 * remains a well-formed statement in every context.
 *
 * The including file must provide <stdio.h>.
 */
#ifndef DEBUG
#define DEBUG
#endif

#ifdef DEBUG
#define TRACE() fprintf(stderr, "> %s:%d:%s\n", __FILE__, __LINE__, __func__)
#else
#define TRACE() ((void)0)
#endif
10
+
11
+ /* ruby 1.9 compat */
12
/*
 * Fallback accessors for Ruby headers that do not define RSTRING_PTR /
 * RSTRING_LEN themselves: read the `ptr` and `len` members through
 * RSTRING().
 */
#if !defined(RSTRING_PTR)
#  define RSTRING_PTR(str) RSTRING(str)->ptr
#endif

#if !defined(RSTRING_LEN)
#  define RSTRING_LEN(str) RSTRING(str)->len
#endif
19
+
20
+ #endif
@@ -0,0 +1,525 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include <stdlib.h>
4
+ #include "useful.h"
5
+ #include "rules.h"
6
+ #include "lex.h"
7
+ #include "darray.h"
8
+ #include "registry.h"
9
+ #include "memory.h"
10
+
11
+ #define MAXTAGLEN 256 /* max char length of pos tags */
12
+ #define MAXWORDLEN 256 /* max char length of words */
13
+ #define MAXAFFIXLEN 5 /* max length of affixes being considered */
14
+
15
/*
 * Replace the tag stored at theentry[theposition] with a private copy of
 * thetag.
 *
 *   theentry     array of heap-allocated tag strings (each slot is freed
 *                with free() when it is replaced)
 *   thetag       NUL-terminated tag to store; it is copied, not adopted
 *   theposition  index of the slot to overwrite
 *
 * The copy is built BEFORE the old string is released.  This keeps the
 * call safe when thetag points at the very string being replaced, and it
 * leaves the old tag in place (instead of a NULL slot) if the allocation
 * fails.
 */
void change_the_tag(char **theentry, char *thetag, int theposition)
{
    size_t bytes = strlen(thetag) + 1;
    char *copy = malloc(bytes);

    if (copy == NULL) {
        fprintf(stderr, "ERROR: out of memory while changing a tag\n");
        return;
    }
    memcpy(copy, thetag, bytes);

    free(theentry[theposition]);
    theentry[theposition] = copy;
}
22
+
23
+ void change_the_tag_darray(tag_array,theposition,thetag)
24
+ Darray tag_array;
25
+ int theposition;
26
+ char *thetag;
27
+ {
28
+ free(Darray_get(tag_array, theposition));
29
+ Darray_set(tag_array, theposition, strdup(thetag));
30
+ }
31
+
32
+ void rule_destroy(trans_rule *r) {
33
+ free(r->old);
34
+ free(r->new);
35
+ free(r->when);
36
+ free(r->arg1);
37
+ free(r->arg2);
38
+ free(r);
39
+ }
40
+
41
+ trans_rule *parse_lexical_rule (const char *rule_text) {
42
+ trans_rule *rule = (trans_rule*) malloc(sizeof(trans_rule));
43
+ char **split_ptr = perl_split(rule_text);
44
+
45
+ /* The general rule-pattern is:
46
+ * [old] arg1 when [arg2] new
47
+ * 'old' is only present when 'when' starts with 'f'.
48
+ * 'arg2' is only present for a few 'when' types.
49
+ */
50
+
51
+
52
+ int offset = 0;
53
+
54
+ /* Rule types starting with 'f' have an extra 'old' arg at the beginning */
55
+ if (*split_ptr[2] == 'f') {
56
+ rule->old = strdup(split_ptr[0]);
57
+ offset = 1;
58
+ } else {
59
+ rule->old = NULL;
60
+ }
61
+
62
+ rule->arg1 = strdup(split_ptr[0 + offset]);
63
+ rule->when = strdup(split_ptr[1 + offset]);
64
+
65
+ /* A few rules have a string-length argument too */
66
+ if (strstr(rule->when, "hassuf") ||
67
+ strstr(rule->when, "haspref") ||
68
+ strstr(rule->when, "addpref") ||
69
+ strstr(rule->when, "addsuf") ||
70
+ strstr(rule->when, "deletesuf") ||
71
+ strstr(rule->when, "deletepref") ) {
72
+ rule->arg2 = strdup(split_ptr[2 + offset]);
73
+ offset++;
74
+ } else {
75
+ rule->arg2 = NULL;
76
+ }
77
+
78
+ rule->new = strdup(split_ptr[2 + offset]);
79
+
80
+ perl_split_free( split_ptr );
81
+
82
+ return rule;
83
+ }
84
+
85
+ trans_rule *parse_contextual_rule (const char *rule_text) {
86
+ trans_rule *rule = (trans_rule*) malloc(sizeof(trans_rule));
87
+ char **split_ptr = perl_split(rule_text);
88
+
89
+ rule->old = strdup(split_ptr[0]);
90
+ rule->new = strdup(split_ptr[1]);
91
+ rule->when = strdup(split_ptr[2]);
92
+ rule->arg1 = strdup(split_ptr[3]);
93
+
94
+ /* The following rule-types take an additional argument */
95
+ if (strcmp(rule->when, "SURROUNDTAG") == 0 ||
96
+ strcmp(rule->when, "PREVBIGRAM") == 0 ||
97
+ strcmp(rule->when, "NEXTBIGRAM") == 0 ||
98
+ strcmp(rule->when, "LBIGRAM") == 0 ||
99
+ strcmp(rule->when, "WDPREVTAG") == 0 ||
100
+ strcmp(rule->when, "RBIGRAM") == 0 ||
101
+ strcmp(rule->when, "WDNEXTTAG") == 0 ||
102
+ strcmp(rule->when, "WDAND2BFR") == 0 ||
103
+ strcmp(rule->when, "WDAND2TAGBFR") == 0 ||
104
+ strcmp(rule->when, "WDAND2AFT") == 0 ||
105
+ strcmp(rule->when, "WDAND2TAGAFT") == 0 )
106
+
107
+ rule->arg2 = strdup(split_ptr[4]);
108
+ else
109
+ rule->arg2 = NULL;
110
+
111
+ perl_split_free( split_ptr );
112
+
113
+ return rule;
114
+ }
115
+
116
+
117
+ void apply_contextual_rule(const trans_rule *r,
118
+ char **word_corpus_array,
119
+ char **tag_corpus_array,
120
+ int corpus_size,
121
+ int RESTRICT_MOVE,
122
+ Registry WORDS,
123
+ Registry SEENTAGGING
124
+ ) {
125
+
126
+ char atempstr2[256];
127
+
128
+ int count, tempcount1, tempcount2;
129
+
130
+ corpus_size--; /* Is used below as the index of the last element (dunno why...) */
131
+
132
+ /* fprintf(stderr,"R: OLD: %s NEW: %s WHEN: %s (%s).\n", r->old, r->new, r->when, r->arg1); */
133
+
134
+ for (count = 0; count <= corpus_size; ++count) {
135
+ if (strcmp(tag_corpus_array[count], r->old) == 0) {
136
+
137
+ sprintf(atempstr2,"%s %s", word_corpus_array[count], r->new);
138
+
139
+ if (! RESTRICT_MOVE ||
140
+ ! Registry_get(WORDS, word_corpus_array[count]) ||
141
+ Registry_get(SEENTAGGING,atempstr2)) {
142
+
143
+ if (strcmp(r->when, "SURROUNDTAG") == 0) {
144
+ if (count < corpus_size && count > 0) {
145
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 &&
146
+ strcmp(r->arg2, tag_corpus_array[count + 1]) == 0)
147
+ change_the_tag(tag_corpus_array, r->new, count);
148
+ }
149
+ } else if (strcmp(r->when, "NEXTTAG") == 0) {
150
+ if (count < corpus_size) {
151
+ if (strcmp(r->arg1,tag_corpus_array[count + 1]) == 0)
152
+ change_the_tag(tag_corpus_array, r->new, count);
153
+ }
154
+ }
155
+ else if (strcmp(r->when, "CURWD") == 0) {
156
+ if (strcmp(r->arg1, word_corpus_array[count]) == 0)
157
+ change_the_tag(tag_corpus_array, r->new, count);
158
+ }
159
+ else if (strcmp(r->when, "NEXTWD") == 0) {
160
+ if (count < corpus_size) {
161
+ if (strcmp(r->arg1, word_corpus_array[count + 1]) == 0)
162
+ change_the_tag(tag_corpus_array, r->new, count);
163
+ }
164
+ }
165
+ else if (strcmp(r->when, "RBIGRAM") == 0) {
166
+ if (count < corpus_size) {
167
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
168
+ 0 &&
169
+ strcmp(r->arg2, word_corpus_array[count+1]) ==
170
+ 0)
171
+ change_the_tag(tag_corpus_array, r->new, count);
172
+ }
173
+ }
174
+ else if (strcmp(r->when, "WDNEXTTAG") == 0) {
175
+ if (count < corpus_size) {
176
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
177
+ 0 &&
178
+ strcmp(r->arg2, tag_corpus_array[count+1]) ==
179
+ 0)
180
+ change_the_tag(tag_corpus_array, r->new, count);
181
+ }
182
+ }
183
+
184
+ else if (strcmp(r->when, "WDAND2AFT") == 0) {
185
+ if (count < corpus_size-1) {
186
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
187
+ 0 &&
188
+ strcmp(r->arg2, word_corpus_array[count+2]) ==
189
+ 0)
190
+ change_the_tag(tag_corpus_array, r->new, count);
191
+ }
192
+ }
193
+ else if (strcmp(r->when, "WDAND2TAGAFT") == 0) {
194
+ if (count < corpus_size-1) {
195
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
196
+ 0 &&
197
+ strcmp(r->arg2, tag_corpus_array[count+2]) ==
198
+ 0)
199
+ change_the_tag(tag_corpus_array, r->new, count);
200
+ }
201
+ }
202
+
203
+ else if (strcmp(r->when, "NEXT2TAG") == 0) {
204
+ if (count < corpus_size - 1) {
205
+ if (strcmp(r->arg1, tag_corpus_array[count + 2]) == 0)
206
+ change_the_tag(tag_corpus_array, r->new, count);
207
+ }
208
+ } else if (strcmp(r->when, "NEXT2WD") == 0) {
209
+ if (count < corpus_size - 1) {
210
+ if (strcmp(r->arg1, word_corpus_array[count + 2]) == 0)
211
+ change_the_tag(tag_corpus_array, r->new, count);
212
+ }
213
+ } else if (strcmp(r->when, "NEXTBIGRAM") == 0) {
214
+ if (count < corpus_size - 1) {
215
+ if
216
+ (strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 &&
217
+ strcmp(r->arg2, tag_corpus_array[count + 2]) == 0)
218
+ change_the_tag(tag_corpus_array, r->new, count);
219
+ }
220
+ } else if (strcmp(r->when, "NEXT1OR2TAG") == 0) {
221
+ if (count < corpus_size) {
222
+ if (count < corpus_size-1)
223
+ tempcount1 = count+2;
224
+ else
225
+ tempcount1 = count+1;
226
+ if
227
+ (strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 ||
228
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0)
229
+ change_the_tag(tag_corpus_array, r->new, count);
230
+ }
231
+ } else if (strcmp(r->when, "NEXT1OR2WD") == 0) {
232
+ if (count < corpus_size) {
233
+ if (count < corpus_size-1)
234
+ tempcount1 = count+2;
235
+ else
236
+ tempcount1 = count+1;
237
+ if
238
+ (strcmp(r->arg1, word_corpus_array[count + 1]) == 0 ||
239
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0)
240
+ change_the_tag(tag_corpus_array, r->new, count);
241
+ }
242
+ } else if (strcmp(r->when, "NEXT1OR2OR3TAG") == 0) {
243
+ if (count < corpus_size) {
244
+ if (count < corpus_size -1)
245
+ tempcount1 = count+2;
246
+ else
247
+ tempcount1 = count+1;
248
+ if (count < corpus_size-2)
249
+ tempcount2 = count+3;
250
+ else
251
+ tempcount2 =count+1;
252
+ if
253
+ (strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 ||
254
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0 ||
255
+ strcmp(r->arg1, tag_corpus_array[tempcount2]) == 0)
256
+ change_the_tag(tag_corpus_array, r->new, count);
257
+ }
258
+ } else if (strcmp(r->when, "NEXT1OR2OR3WD") == 0) {
259
+ if (count < corpus_size) {
260
+ if (count < corpus_size -1)
261
+ tempcount1 = count+2;
262
+ else
263
+ tempcount1 = count+1;
264
+ if (count < corpus_size-2)
265
+ tempcount2 = count+3;
266
+ else
267
+ tempcount2 =count+1;
268
+ if
269
+ (strcmp(r->arg1, word_corpus_array[count + 1]) == 0 ||
270
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0 ||
271
+ strcmp(r->arg1, word_corpus_array[tempcount2]) == 0)
272
+ change_the_tag(tag_corpus_array, r->new, count);
273
+ }
274
+ } else if (strcmp(r->when, "PREVTAG") == 0) {
275
+ if (count > 0) {
276
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0) {
277
+ change_the_tag(tag_corpus_array, r->new, count);
278
+ }
279
+ }
280
+ } else if (strcmp(r->when, "PREVWD") == 0) {
281
+ if (count > 0) {
282
+ if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0) {
283
+ change_the_tag(tag_corpus_array, r->new, count);
284
+ }
285
+ }
286
+ }
287
+ else if (strcmp(r->when, "LBIGRAM") == 0) {
288
+ if (count > 0) {
289
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
290
+ 0 &&
291
+ strcmp(r->arg1, word_corpus_array[count-1]) ==
292
+ 0)
293
+ change_the_tag(tag_corpus_array, r->new, count);
294
+ }
295
+ }
296
+ else if (strcmp(r->when, "WDPREVTAG") == 0) {
297
+ if (count > 0) {
298
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
299
+ 0 &&
300
+ strcmp(r->arg1, tag_corpus_array[count-1]) ==
301
+ 0)
302
+ change_the_tag(tag_corpus_array, r->new, count);
303
+ }
304
+ }
305
+ else if (strcmp(r->when, "WDAND2BFR") == 0) {
306
+ if (count > 1) {
307
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
308
+ 0 &&
309
+ strcmp(r->arg1, word_corpus_array[count-2]) ==
310
+ 0)
311
+ change_the_tag(tag_corpus_array, r->new, count);
312
+ }
313
+ }
314
+ else if (strcmp(r->when, "WDAND2TAGBFR") == 0) {
315
+ if (count > 1) {
316
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
317
+ 0 &&
318
+ strcmp(r->arg1, tag_corpus_array[count-2]) ==
319
+ 0)
320
+ change_the_tag(tag_corpus_array, r->new, count);
321
+ }
322
+ }
323
+
324
+ else if (strcmp(r->when, "PREV2TAG") == 0) {
325
+ if (count > 1) {
326
+ if (strcmp(r->arg1, tag_corpus_array[count - 2]) == 0)
327
+ change_the_tag(tag_corpus_array, r->new, count);
328
+ }
329
+ } else if (strcmp(r->when, "PREV2WD") == 0) {
330
+ if (count > 1) {
331
+ if (strcmp(r->arg1, word_corpus_array[count - 2]) == 0)
332
+ change_the_tag(tag_corpus_array, r->new, count);
333
+ }
334
+ } else if (strcmp(r->when, "PREV1OR2TAG") == 0) {
335
+ if (count > 0) {
336
+ if (count > 1)
337
+ tempcount1 = count-2;
338
+ else
339
+ tempcount1 = count-1;
340
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 ||
341
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0)
342
+ change_the_tag(tag_corpus_array, r->new, count);
343
+ }
344
+ } else if (strcmp(r->when, "PREV1OR2WD") == 0) {
345
+ if (count > 0) {
346
+ if (count > 1)
347
+ tempcount1 = count-2;
348
+ else
349
+ tempcount1 = count-1;
350
+ if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0 ||
351
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0)
352
+ change_the_tag(tag_corpus_array, r->new, count);
353
+ }
354
+ } else if (strcmp(r->when, "PREV1OR2OR3TAG") == 0) {
355
+ if (count > 0) {
356
+ if (count>1)
357
+ tempcount1 = count-2;
358
+ else
359
+ tempcount1 = count-1;
360
+ if (count >2)
361
+ tempcount2 = count-3;
362
+ else
363
+ tempcount2 = count-1;
364
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 ||
365
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0 ||
366
+ strcmp(r->arg1, tag_corpus_array[tempcount2]) == 0)
367
+ change_the_tag(tag_corpus_array, r->new, count);
368
+ }
369
+ } else if (strcmp(r->when, "PREV1OR2OR3WD") == 0) {
370
+ if (count > 0) {
371
+ if (count>1)
372
+ tempcount1 = count-2;
373
+ else
374
+ tempcount1 = count-1;
375
+ if (count >2)
376
+ tempcount2 = count-3;
377
+ else
378
+ tempcount2 = count-1;
379
+ if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0 ||
380
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0 ||
381
+ strcmp(r->arg1, word_corpus_array[tempcount2]) == 0)
382
+ change_the_tag(tag_corpus_array, r->new, count);
383
+ }
384
+ } else if (strcmp(r->when, "PREVBIGRAM") == 0) {
385
+ if (count > 1) {
386
+ if (strcmp(r->arg2, tag_corpus_array[count - 1]) == 0 &&
387
+ strcmp(r->arg1, tag_corpus_array[count - 2]) == 0)
388
+ change_the_tag(tag_corpus_array, r->new, count);
389
+ }
390
+ }
391
+ else
392
+ fprintf(stderr,
393
+ "ERROR: %s is not an allowable transform type\n",
394
+ r->when);
395
+ }
396
+ }
397
+ }
398
+ }
399
+
400
+
401
+ void apply_lexical_rule(const trans_rule *r,
402
+ Darray tag_array_key,
403
+ Darray tag_array_val,
404
+ Registry lexicon_hash,
405
+ Registry wordlist_hash,
406
+ Registry bigram_hash,
407
+ int EXTRAWDS
408
+ ) {
409
+
410
+ int count2, count3, tempcount;
411
+ char *tempstr2;
412
+ char *rule_text;
413
+
414
+ char tempstr_space[MAXWORDLEN+MAXAFFIXLEN], bigram_space[MAXWORDLEN*2];
415
+
416
+ int check_current_tag = (r->when[0] == 'f');
417
+ char *name = strdup( check_current_tag ? &r->when[1] : r->when );
418
+
419
+ for (count2=0;count2<Darray_len(tag_array_key);++count2) {
420
+
421
+ if (check_current_tag
422
+ ? (strcmp(Darray_get(tag_array_val, count2), r->old) != 0)
423
+ : (strcmp(Darray_get(tag_array_val, count2), r->new) == 0))
424
+ continue;
425
+
426
+ if (strcmp(name, "char") == 0) {
427
+ if(strpbrk(Darray_get(tag_array_key,count2), r->arg1)) {
428
+ change_the_tag_darray(tag_array_val,count2,r->new);
429
+ }
430
+ }
431
+ else if (strcmp(name, "deletepref") == 0) {
432
+ int arg1_len = atoi(r->arg2);
433
+
434
+ rule_text = Darray_get(tag_array_key,count2);
435
+ for (count3=0;count3<arg1_len;++count3) {
436
+ if (rule_text[count3] != r->arg1[count3])
437
+ break;}
438
+ if (count3 == arg1_len) {
439
+ rule_text += arg1_len;
440
+ if (Registry_get(lexicon_hash,(char *)rule_text) != NULL ||
441
+ (EXTRAWDS &&
442
+ Registry_get(wordlist_hash,(char *)rule_text) != NULL)){
443
+ change_the_tag_darray(tag_array_val,count2,r->new);}
444
+ }
445
+ }
446
+ else if (strcmp(name,"haspref") == 0) {
447
+ int arg1_len = atoi(r->arg2);
448
+
449
+ rule_text = Darray_get(tag_array_key,count2);
450
+ for (count3=0;count3<arg1_len;++count3) {
451
+ if (rule_text[count3] != r->arg1[count3])
452
+ break;}
453
+ if (count3 == arg1_len) {
454
+ change_the_tag_darray(tag_array_val,count2,r->new);}
455
+ }
456
+ else if (strcmp(name,"deletesuf") == 0) {
457
+ int arg1_len = atoi(r->arg2);
458
+
459
+ rule_text = Darray_get(tag_array_key,count2);
460
+ tempcount=strlen(rule_text)-arg1_len;
461
+ for (count3=tempcount;
462
+ count3<strlen(rule_text); ++count3) {
463
+ if (rule_text[count3] != r->arg1[count3-tempcount])
464
+ break;}
465
+ if (count3 == strlen(rule_text)) {
466
+ tempstr2 = strdup(rule_text);
467
+ tempstr2[tempcount] = '\0';
468
+ if (Registry_get(lexicon_hash,(char *)tempstr2) != NULL ||
469
+ (EXTRAWDS &&
470
+ Registry_get(wordlist_hash,(char *)tempstr2) != NULL)) {
471
+
472
+ change_the_tag_darray(tag_array_val,count2,r->new);}
473
+ free(tempstr2);
474
+ }
475
+ }
476
+ else if (strcmp(name,"hassuf") == 0) {
477
+ int arg1_len = atoi(r->arg2);
478
+
479
+ rule_text = Darray_get(tag_array_key,count2);
480
+ tempcount=strlen(rule_text)-arg1_len;
481
+ for (count3=tempcount;
482
+ count3<strlen(rule_text); ++count3) {
483
+ if (rule_text[count3] != r->arg1[count3-tempcount])
484
+ break;}
485
+ if (count3 == strlen(rule_text)) {
486
+
487
+ change_the_tag_darray(tag_array_val,count2,r->new);}
488
+ }
489
+ else if (strcmp(name,"addpref") == 0) {
490
+ snprintf(tempstr_space,MAXWORDLEN+MAXAFFIXLEN,"%s%s",
491
+ (char*)r->arg1,(char*)Darray_get(tag_array_key,count2));
492
+ if (Registry_get(lexicon_hash,(char *)tempstr_space) != NULL
493
+ ||
494
+ (EXTRAWDS &&
495
+ Registry_get(wordlist_hash,(char *)tempstr_space) != NULL)) {
496
+
497
+ change_the_tag_darray(tag_array_val,count2,r->new);}
498
+ }
499
+ else if (strcmp(name,"addsuf") == 0) {
500
+ snprintf(tempstr_space,MAXWORDLEN+MAXAFFIXLEN,"%s%s",
501
+ (char*)Darray_get(tag_array_key,count2),
502
+ (char*)r->arg1);
503
+ if (Registry_get(lexicon_hash,(char *)tempstr_space) != NULL
504
+ ||
505
+ (EXTRAWDS &&
506
+ Registry_get(wordlist_hash,(char *)tempstr_space) != NULL)){
507
+
508
+ change_the_tag_darray(tag_array_val,count2,r->new);}
509
+ }
510
+ else if (strcmp(name,"goodleft") == 0) {
511
+ snprintf(bigram_space,MAXWORDLEN*2,"%s %s",
512
+ (char*)Darray_get(tag_array_key,count2),(char*)r->arg1);
513
+ if (Registry_get(bigram_hash,(char *)bigram_space) != NULL) {
514
+
515
+ change_the_tag_darray(tag_array_val,count2,r->new);}
516
+ }
517
+ else if (strcmp(name,"goodright") == 0) {
518
+ snprintf(bigram_space,MAXWORDLEN*2,"%s %s",(char*)r->arg1,(char*)Darray_get(tag_array_key,count2));
519
+ if (Registry_get(bigram_hash,(char *)bigram_space) != NULL) {
520
+
521
+ change_the_tag_darray(tag_array_val,count2,r->new);}
522
+ }
523
+ }
524
+ free( name );
525
+ }
@@ -0,0 +1,42 @@
1
+
2
+ #ifndef _RULES_H_
3
+ #define _RULES_H_
4
+
5
+ #include "darray.h"
6
+ #include "registry.h"
7
+
8
/*
 * A parsed transformation rule (lexical or contextual).  Every non-NULL
 * member is an individually heap-allocated string that rule_destroy()
 * frees.
 */
typedef struct {
    char *old;     /* tag a token must currently carry; NULL for lexical
                      rules whose type does not start with 'f' */
    char *new;     /* tag assigned when the rule fires */
    char *when;    /* rule type, e.g. "PREVTAG" or "hassuf" */
    char *arg1;    /* first argument of the rule type */
    char *arg2;    /* second argument, or NULL when the type takes one */
} trans_rule;
15
+
16
+ trans_rule *parse_lexical_rule (const char *rule_text);
17
+
18
+ trans_rule *parse_contextual_rule (const char *rule_text);
19
+
20
+ void rule_destroy(trans_rule *r);
21
+
22
+ void change_the_tag(char **theentry, char *thetag, int theposition);
23
+
24
+ void apply_lexical_rule(const trans_rule *r,
25
+ Darray tag_array_key,
26
+ Darray tag_array_val,
27
+ Registry lexicon_hash,
28
+ Registry wordlist_hash,
29
+ Registry bigram_hash,
30
+ int EXTRAWDS
31
+ );
32
+
33
+ void apply_contextual_rule(const trans_rule *r,
34
+ char **word_corpus_array,
35
+ char **tag_corpus_array,
36
+ int corpus_size,
37
+ int RESTRICT_MOVE,
38
+ Registry WORDS,
39
+ Registry SEENTAGGING
40
+ );
41
+
42
+ #endif /* _RULES_H_ */
@@ -0,0 +1,20 @@
1
+ #ifndef _SYSDEP_H_
2
+ #define _SYSDEP_H_
3
+
4
/* NORET is the return type of functions that return nothing. */
#define NORET void

/*
 * Generic object pointers and prototype helpers.
 *
 *   VOIDP         pointer to an object of unspecified type
 *   CONSTVOIDP    the same, for pointers to non-modifiable objects
 *   NOARGS        parameter list of a function that takes no arguments
 *   PROTOTYPE(x)  expands to the parenthesized parameter list x on ISO C
 *                 compilers and to an empty list () otherwise
 *
 * Compilers that do not define __STDC__ get char * in place of void *.
 */
#if defined(__STDC__)
typedef const void *CONSTVOIDP;
typedef void *VOIDP;
#  define NOARGS void
#  define PROTOTYPE(x) x
#else
typedef char *VOIDP;
typedef char *CONSTVOIDP;
#  define NOARGS
#  define PROTOTYPE(x) ()
#endif
19
+
20
+ #endif /* ifndef _SYSDEP_H_ */