rbtagger 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/COPYING +21 -0
  2. data/History.txt +4 -0
  3. data/LICENSE +21 -0
  4. data/License.txt +20 -0
  5. data/Manifest.txt +75 -0
  6. data/PostInstall.txt +7 -0
  7. data/README +7 -0
  8. data/README.txt +53 -0
  9. data/Rakefile +33 -0
  10. data/config/hoe.rb +74 -0
  11. data/config/requirements.rb +15 -0
  12. data/ext/rule_tagger/bool.h +38 -0
  13. data/ext/rule_tagger/darray.c +292 -0
  14. data/ext/rule_tagger/darray.h +125 -0
  15. data/ext/rule_tagger/darrayP.h +50 -0
  16. data/ext/rule_tagger/extconf.rb +14 -0
  17. data/ext/rule_tagger/lex.c +170 -0
  18. data/ext/rule_tagger/lex.h +49 -0
  19. data/ext/rule_tagger/memory.c +127 -0
  20. data/ext/rule_tagger/memory.h +20 -0
  21. data/ext/rule_tagger/rbtagger.c +252 -0
  22. data/ext/rule_tagger/registry.c +326 -0
  23. data/ext/rule_tagger/registry.h +129 -0
  24. data/ext/rule_tagger/registryP.h +46 -0
  25. data/ext/rule_tagger/ruby-compat.h +20 -0
  26. data/ext/rule_tagger/rules.c +525 -0
  27. data/ext/rule_tagger/rules.h +42 -0
  28. data/ext/rule_tagger/sysdep.h +20 -0
  29. data/ext/rule_tagger/tagger.c +110 -0
  30. data/ext/rule_tagger/tagger.h +46 -0
  31. data/ext/rule_tagger/useful.c +44 -0
  32. data/ext/rule_tagger/useful.h +51 -0
  33. data/ext/word_tagger/extconf.rb +7 -0
  34. data/ext/word_tagger/porter_stemmer.c +430 -0
  35. data/ext/word_tagger/porter_stemmer.h +19 -0
  36. data/ext/word_tagger/rtagger.cc +83 -0
  37. data/ext/word_tagger/tagger.cc +153 -0
  38. data/ext/word_tagger/tagger.h +27 -0
  39. data/ext/word_tagger/tagger.rb +8 -0
  40. data/ext/word_tagger/test/Makefile +22 -0
  41. data/ext/word_tagger/test/doc.txt +87 -0
  42. data/ext/word_tagger/test/test.cc +107 -0
  43. data/ext/word_tagger/test.rb +31 -0
  44. data/lib/brill/tagger.rb +225 -0
  45. data/lib/rbtagger/version.rb +9 -0
  46. data/lib/rbtagger.rb +6 -0
  47. data/script/console +10 -0
  48. data/script/destroy +14 -0
  49. data/script/generate +14 -0
  50. data/script/txt2html +82 -0
  51. data/setup.rb +1585 -0
  52. data/tasks/deployment.rake +34 -0
  53. data/tasks/environment.rake +7 -0
  54. data/tasks/website.rake +17 -0
  55. data/test/CONTEXTUALRULEFILE +284 -0
  56. data/test/LEXICALRULEFILE +148 -0
  57. data/test/LEXICON +93696 -0
  58. data/test/docs/doc0.txt +20 -0
  59. data/test/docs/doc1.txt +11 -0
  60. data/test/docs/doc2.txt +52 -0
  61. data/test/docs/doc3.txt +128 -0
  62. data/test/docs/doc4.txt +337 -0
  63. data/test/docs/doc5.txt +497 -0
  64. data/test/docs/doc6.txt +116 -0
  65. data/test/docs/doc7.txt +101 -0
  66. data/test/docs/doc8.txt +25 -0
  67. data/test/docs/doc9.txt +84 -0
  68. data/test/tagger_test.rb +60 -0
  69. data/test/test_helper.rb +2 -0
  70. data/tools/rakehelp.rb +113 -0
  71. data/website/index.html +113 -0
  72. data/website/index.txt +53 -0
  73. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  74. data/website/stylesheets/screen.css +138 -0
  75. data/website/template.html.erb +48 -0
  76. metadata +155 -0
@@ -0,0 +1,46 @@
1
+ #ifndef _regyP_h_
2
+ #define _regyP_h_
3
+
4
+ #include "memory.h"
5
+ #include "sysdep.h"
6
+ #include "darray.h"
7
+
8
+ #include "registry.h"
9
+
10
+ /* private internal representation of the table */
11
+ typedef struct RegistryRecord_st {
12
+ VOIDP name;
13
+ VOIDP obj;
14
+ struct RegistryRecord_st *next;
15
+ } RegistryRecord;
16
+
17
+ #define DEFAULT_HT_SIZE 97 /* This should be prime */
18
+
19
+ /* The Registry representation */
20
+ typedef struct Registry_st {
21
+ unsigned int ht_size;
22
+ RegistryRecord **hash_table; /* First record of directory */
23
+ Registry_CompareFunc comp_fun; /* Comparison function */
24
+ Registry_HashFunc hash_fun; /* Hash function */
25
+ unsigned int record_count; /* Number of records in the registry */
26
+ } Registry_rep;
27
+
28
+ /* private traversal routine used to implement Registry_fetch_contents() */
29
+ #ifdef __STDC__
30
+ static NORET add_to_darrays(VOIDP, VOIDP, VOIDP);
31
+ #else
32
+ static NORET add_to_darrays();
33
+ #endif
34
+
35
+ /* used when calling add_to_darrays() within Registry_fetch_contents() */
36
+ struct darray_pair {
37
+ Darray key_darray;
38
+ Darray value_darray;
39
+ };
40
+
41
+ #define raise(p_to_rep) ((Registry)p_to_rep)
42
+ #define lower(obj) ((Registry_rep *)obj)
43
+ #define create() ((Registry_rep *) Memory_allocate(sizeof (Registry_rep)))
44
+ #define destroy(p_to_rep) (Memory_free((VOIDP)p_to_rep))
45
+
46
+ #endif
@@ -0,0 +1,20 @@
1
#ifndef _RUBY_COMPAT_HEADER_H
#define _RUBY_COMPAT_HEADER_H

/*
 * Small compatibility layer for the extension: a TRACE() debugging macro
 * and fallbacks for the RSTRING_PTR / RSTRING_LEN accessors.
 */

#include <stdio.h> /* fprintf, stderr: needed by TRACE() */

/*
 * NOTE(review): tracing is forced on for every build that includes this
 * header.  Remove this definition to compile TRACE() out.
 */
#ifndef DEBUG
#define DEBUG
#endif

#ifdef DEBUG
/* Print "> file:line:function" to stderr. */
#define TRACE() fprintf(stderr, "> %s:%d:%s\n", __FILE__, __LINE__, __func__)
#else
#define TRACE() ((void)0)
#endif

/*
 * ruby 1.9 compat: only defined when the Ruby headers seen so far do not
 * already provide these accessors, so this header must come after them.
 */
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif

#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif

#endif
@@ -0,0 +1,525 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include <stdlib.h>
4
+ #include "useful.h"
5
+ #include "rules.h"
6
+ #include "lex.h"
7
+ #include "darray.h"
8
+ #include "registry.h"
9
+ #include "memory.h"
10
+
11
+ #define MAXTAGLEN 256 /* max char length of pos tags */
12
+ #define MAXWORDLEN 256 /* max char length of words */
13
+ #define MAXAFFIXLEN 5 /* max length of affixes being considered */
14
+
15
/*
 * Replace the tag stored at theentry[theposition] with a private copy of
 * thetag, releasing the string that was there before.
 *
 * The copy is made before the old string is freed, so the call is safe
 * even when thetag points at the very string being replaced.  If the
 * copy cannot be allocated the existing tag is left untouched.
 */
void change_the_tag(char **theentry, char *thetag, int theposition)
{
  char *replacement = strdup(thetag);

  if (replacement == NULL)
    return;

  free(theentry[theposition]);
  theentry[theposition] = replacement;
}
22
+
23
+ void change_the_tag_darray(tag_array,theposition,thetag)
24
+ Darray tag_array;
25
+ int theposition;
26
+ char *thetag;
27
+ {
28
+ free(Darray_get(tag_array, theposition));
29
+ Darray_set(tag_array, theposition, strdup(thetag));
30
+ }
31
+
32
+ void rule_destroy(trans_rule *r) {
33
+ free(r->old);
34
+ free(r->new);
35
+ free(r->when);
36
+ free(r->arg1);
37
+ free(r->arg2);
38
+ free(r);
39
+ }
40
+
41
+ trans_rule *parse_lexical_rule (const char *rule_text) {
42
+ trans_rule *rule = (trans_rule*) malloc(sizeof(trans_rule));
43
+ char **split_ptr = perl_split(rule_text);
44
+
45
+ /* The general rule-pattern is:
46
+ * [old] arg1 when [arg2] new
47
+ * 'old' is only present when 'when' starts with 'f'.
48
+ * 'arg2' is only present for a few 'when' types.
49
+ */
50
+
51
+
52
+ int offset = 0;
53
+
54
+ /* Rule types starting with 'f' have an extra 'old' arg at the beginning */
55
+ if (*split_ptr[2] == 'f') {
56
+ rule->old = strdup(split_ptr[0]);
57
+ offset = 1;
58
+ } else {
59
+ rule->old = NULL;
60
+ }
61
+
62
+ rule->arg1 = strdup(split_ptr[0 + offset]);
63
+ rule->when = strdup(split_ptr[1 + offset]);
64
+
65
+ /* A few rules have a string-length argument too */
66
+ if (strstr(rule->when, "hassuf") ||
67
+ strstr(rule->when, "haspref") ||
68
+ strstr(rule->when, "addpref") ||
69
+ strstr(rule->when, "addsuf") ||
70
+ strstr(rule->when, "deletesuf") ||
71
+ strstr(rule->when, "deletepref") ) {
72
+ rule->arg2 = strdup(split_ptr[2 + offset]);
73
+ offset++;
74
+ } else {
75
+ rule->arg2 = NULL;
76
+ }
77
+
78
+ rule->new = strdup(split_ptr[2 + offset]);
79
+
80
+ perl_split_free( split_ptr );
81
+
82
+ return rule;
83
+ }
84
+
85
+ trans_rule *parse_contextual_rule (const char *rule_text) {
86
+ trans_rule *rule = (trans_rule*) malloc(sizeof(trans_rule));
87
+ char **split_ptr = perl_split(rule_text);
88
+
89
+ rule->old = strdup(split_ptr[0]);
90
+ rule->new = strdup(split_ptr[1]);
91
+ rule->when = strdup(split_ptr[2]);
92
+ rule->arg1 = strdup(split_ptr[3]);
93
+
94
+ /* The following rule-types take an additional argument */
95
+ if (strcmp(rule->when, "SURROUNDTAG") == 0 ||
96
+ strcmp(rule->when, "PREVBIGRAM") == 0 ||
97
+ strcmp(rule->when, "NEXTBIGRAM") == 0 ||
98
+ strcmp(rule->when, "LBIGRAM") == 0 ||
99
+ strcmp(rule->when, "WDPREVTAG") == 0 ||
100
+ strcmp(rule->when, "RBIGRAM") == 0 ||
101
+ strcmp(rule->when, "WDNEXTTAG") == 0 ||
102
+ strcmp(rule->when, "WDAND2BFR") == 0 ||
103
+ strcmp(rule->when, "WDAND2TAGBFR") == 0 ||
104
+ strcmp(rule->when, "WDAND2AFT") == 0 ||
105
+ strcmp(rule->when, "WDAND2TAGAFT") == 0 )
106
+
107
+ rule->arg2 = strdup(split_ptr[4]);
108
+ else
109
+ rule->arg2 = NULL;
110
+
111
+ perl_split_free( split_ptr );
112
+
113
+ return rule;
114
+ }
115
+
116
+
117
+ void apply_contextual_rule(const trans_rule *r,
118
+ char **word_corpus_array,
119
+ char **tag_corpus_array,
120
+ int corpus_size,
121
+ int RESTRICT_MOVE,
122
+ Registry WORDS,
123
+ Registry SEENTAGGING
124
+ ) {
125
+
126
+ char atempstr2[256];
127
+
128
+ int count, tempcount1, tempcount2;
129
+
130
+ corpus_size--; /* Is used below as the index of the last element (dunno why...) */
131
+
132
+ /* fprintf(stderr,"R: OLD: %s NEW: %s WHEN: %s (%s).\n", r->old, r->new, r->when, r->arg1); */
133
+
134
+ for (count = 0; count <= corpus_size; ++count) {
135
+ if (strcmp(tag_corpus_array[count], r->old) == 0) {
136
+
137
+ sprintf(atempstr2,"%s %s", word_corpus_array[count], r->new);
138
+
139
+ if (! RESTRICT_MOVE ||
140
+ ! Registry_get(WORDS, word_corpus_array[count]) ||
141
+ Registry_get(SEENTAGGING,atempstr2)) {
142
+
143
+ if (strcmp(r->when, "SURROUNDTAG") == 0) {
144
+ if (count < corpus_size && count > 0) {
145
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 &&
146
+ strcmp(r->arg2, tag_corpus_array[count + 1]) == 0)
147
+ change_the_tag(tag_corpus_array, r->new, count);
148
+ }
149
+ } else if (strcmp(r->when, "NEXTTAG") == 0) {
150
+ if (count < corpus_size) {
151
+ if (strcmp(r->arg1,tag_corpus_array[count + 1]) == 0)
152
+ change_the_tag(tag_corpus_array, r->new, count);
153
+ }
154
+ }
155
+ else if (strcmp(r->when, "CURWD") == 0) {
156
+ if (strcmp(r->arg1, word_corpus_array[count]) == 0)
157
+ change_the_tag(tag_corpus_array, r->new, count);
158
+ }
159
+ else if (strcmp(r->when, "NEXTWD") == 0) {
160
+ if (count < corpus_size) {
161
+ if (strcmp(r->arg1, word_corpus_array[count + 1]) == 0)
162
+ change_the_tag(tag_corpus_array, r->new, count);
163
+ }
164
+ }
165
+ else if (strcmp(r->when, "RBIGRAM") == 0) {
166
+ if (count < corpus_size) {
167
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
168
+ 0 &&
169
+ strcmp(r->arg2, word_corpus_array[count+1]) ==
170
+ 0)
171
+ change_the_tag(tag_corpus_array, r->new, count);
172
+ }
173
+ }
174
+ else if (strcmp(r->when, "WDNEXTTAG") == 0) {
175
+ if (count < corpus_size) {
176
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
177
+ 0 &&
178
+ strcmp(r->arg2, tag_corpus_array[count+1]) ==
179
+ 0)
180
+ change_the_tag(tag_corpus_array, r->new, count);
181
+ }
182
+ }
183
+
184
+ else if (strcmp(r->when, "WDAND2AFT") == 0) {
185
+ if (count < corpus_size-1) {
186
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
187
+ 0 &&
188
+ strcmp(r->arg2, word_corpus_array[count+2]) ==
189
+ 0)
190
+ change_the_tag(tag_corpus_array, r->new, count);
191
+ }
192
+ }
193
+ else if (strcmp(r->when, "WDAND2TAGAFT") == 0) {
194
+ if (count < corpus_size-1) {
195
+ if (strcmp(r->arg1, word_corpus_array[count]) ==
196
+ 0 &&
197
+ strcmp(r->arg2, tag_corpus_array[count+2]) ==
198
+ 0)
199
+ change_the_tag(tag_corpus_array, r->new, count);
200
+ }
201
+ }
202
+
203
+ else if (strcmp(r->when, "NEXT2TAG") == 0) {
204
+ if (count < corpus_size - 1) {
205
+ if (strcmp(r->arg1, tag_corpus_array[count + 2]) == 0)
206
+ change_the_tag(tag_corpus_array, r->new, count);
207
+ }
208
+ } else if (strcmp(r->when, "NEXT2WD") == 0) {
209
+ if (count < corpus_size - 1) {
210
+ if (strcmp(r->arg1, word_corpus_array[count + 2]) == 0)
211
+ change_the_tag(tag_corpus_array, r->new, count);
212
+ }
213
+ } else if (strcmp(r->when, "NEXTBIGRAM") == 0) {
214
+ if (count < corpus_size - 1) {
215
+ if
216
+ (strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 &&
217
+ strcmp(r->arg2, tag_corpus_array[count + 2]) == 0)
218
+ change_the_tag(tag_corpus_array, r->new, count);
219
+ }
220
+ } else if (strcmp(r->when, "NEXT1OR2TAG") == 0) {
221
+ if (count < corpus_size) {
222
+ if (count < corpus_size-1)
223
+ tempcount1 = count+2;
224
+ else
225
+ tempcount1 = count+1;
226
+ if
227
+ (strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 ||
228
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0)
229
+ change_the_tag(tag_corpus_array, r->new, count);
230
+ }
231
+ } else if (strcmp(r->when, "NEXT1OR2WD") == 0) {
232
+ if (count < corpus_size) {
233
+ if (count < corpus_size-1)
234
+ tempcount1 = count+2;
235
+ else
236
+ tempcount1 = count+1;
237
+ if
238
+ (strcmp(r->arg1, word_corpus_array[count + 1]) == 0 ||
239
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0)
240
+ change_the_tag(tag_corpus_array, r->new, count);
241
+ }
242
+ } else if (strcmp(r->when, "NEXT1OR2OR3TAG") == 0) {
243
+ if (count < corpus_size) {
244
+ if (count < corpus_size -1)
245
+ tempcount1 = count+2;
246
+ else
247
+ tempcount1 = count+1;
248
+ if (count < corpus_size-2)
249
+ tempcount2 = count+3;
250
+ else
251
+ tempcount2 =count+1;
252
+ if
253
+ (strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 ||
254
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0 ||
255
+ strcmp(r->arg1, tag_corpus_array[tempcount2]) == 0)
256
+ change_the_tag(tag_corpus_array, r->new, count);
257
+ }
258
+ } else if (strcmp(r->when, "NEXT1OR2OR3WD") == 0) {
259
+ if (count < corpus_size) {
260
+ if (count < corpus_size -1)
261
+ tempcount1 = count+2;
262
+ else
263
+ tempcount1 = count+1;
264
+ if (count < corpus_size-2)
265
+ tempcount2 = count+3;
266
+ else
267
+ tempcount2 =count+1;
268
+ if
269
+ (strcmp(r->arg1, word_corpus_array[count + 1]) == 0 ||
270
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0 ||
271
+ strcmp(r->arg1, word_corpus_array[tempcount2]) == 0)
272
+ change_the_tag(tag_corpus_array, r->new, count);
273
+ }
274
+ } else if (strcmp(r->when, "PREVTAG") == 0) {
275
+ if (count > 0) {
276
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0) {
277
+ change_the_tag(tag_corpus_array, r->new, count);
278
+ }
279
+ }
280
+ } else if (strcmp(r->when, "PREVWD") == 0) {
281
+ if (count > 0) {
282
+ if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0) {
283
+ change_the_tag(tag_corpus_array, r->new, count);
284
+ }
285
+ }
286
+ }
287
+ else if (strcmp(r->when, "LBIGRAM") == 0) {
288
+ if (count > 0) {
289
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
290
+ 0 &&
291
+ strcmp(r->arg1, word_corpus_array[count-1]) ==
292
+ 0)
293
+ change_the_tag(tag_corpus_array, r->new, count);
294
+ }
295
+ }
296
+ else if (strcmp(r->when, "WDPREVTAG") == 0) {
297
+ if (count > 0) {
298
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
299
+ 0 &&
300
+ strcmp(r->arg1, tag_corpus_array[count-1]) ==
301
+ 0)
302
+ change_the_tag(tag_corpus_array, r->new, count);
303
+ }
304
+ }
305
+ else if (strcmp(r->when, "WDAND2BFR") == 0) {
306
+ if (count > 1) {
307
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
308
+ 0 &&
309
+ strcmp(r->arg1, word_corpus_array[count-2]) ==
310
+ 0)
311
+ change_the_tag(tag_corpus_array, r->new, count);
312
+ }
313
+ }
314
+ else if (strcmp(r->when, "WDAND2TAGBFR") == 0) {
315
+ if (count > 1) {
316
+ if (strcmp(r->arg2, word_corpus_array[count]) ==
317
+ 0 &&
318
+ strcmp(r->arg1, tag_corpus_array[count-2]) ==
319
+ 0)
320
+ change_the_tag(tag_corpus_array, r->new, count);
321
+ }
322
+ }
323
+
324
+ else if (strcmp(r->when, "PREV2TAG") == 0) {
325
+ if (count > 1) {
326
+ if (strcmp(r->arg1, tag_corpus_array[count - 2]) == 0)
327
+ change_the_tag(tag_corpus_array, r->new, count);
328
+ }
329
+ } else if (strcmp(r->when, "PREV2WD") == 0) {
330
+ if (count > 1) {
331
+ if (strcmp(r->arg1, word_corpus_array[count - 2]) == 0)
332
+ change_the_tag(tag_corpus_array, r->new, count);
333
+ }
334
+ } else if (strcmp(r->when, "PREV1OR2TAG") == 0) {
335
+ if (count > 0) {
336
+ if (count > 1)
337
+ tempcount1 = count-2;
338
+ else
339
+ tempcount1 = count-1;
340
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 ||
341
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0)
342
+ change_the_tag(tag_corpus_array, r->new, count);
343
+ }
344
+ } else if (strcmp(r->when, "PREV1OR2WD") == 0) {
345
+ if (count > 0) {
346
+ if (count > 1)
347
+ tempcount1 = count-2;
348
+ else
349
+ tempcount1 = count-1;
350
+ if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0 ||
351
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0)
352
+ change_the_tag(tag_corpus_array, r->new, count);
353
+ }
354
+ } else if (strcmp(r->when, "PREV1OR2OR3TAG") == 0) {
355
+ if (count > 0) {
356
+ if (count>1)
357
+ tempcount1 = count-2;
358
+ else
359
+ tempcount1 = count-1;
360
+ if (count >2)
361
+ tempcount2 = count-3;
362
+ else
363
+ tempcount2 = count-1;
364
+ if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 ||
365
+ strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0 ||
366
+ strcmp(r->arg1, tag_corpus_array[tempcount2]) == 0)
367
+ change_the_tag(tag_corpus_array, r->new, count);
368
+ }
369
+ } else if (strcmp(r->when, "PREV1OR2OR3WD") == 0) {
370
+ if (count > 0) {
371
+ if (count>1)
372
+ tempcount1 = count-2;
373
+ else
374
+ tempcount1 = count-1;
375
+ if (count >2)
376
+ tempcount2 = count-3;
377
+ else
378
+ tempcount2 = count-1;
379
+ if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0 ||
380
+ strcmp(r->arg1, word_corpus_array[tempcount1]) == 0 ||
381
+ strcmp(r->arg1, word_corpus_array[tempcount2]) == 0)
382
+ change_the_tag(tag_corpus_array, r->new, count);
383
+ }
384
+ } else if (strcmp(r->when, "PREVBIGRAM") == 0) {
385
+ if (count > 1) {
386
+ if (strcmp(r->arg2, tag_corpus_array[count - 1]) == 0 &&
387
+ strcmp(r->arg1, tag_corpus_array[count - 2]) == 0)
388
+ change_the_tag(tag_corpus_array, r->new, count);
389
+ }
390
+ }
391
+ else
392
+ fprintf(stderr,
393
+ "ERROR: %s is not an allowable transform type\n",
394
+ r->when);
395
+ }
396
+ }
397
+ }
398
+ }
399
+
400
+
401
+ void apply_lexical_rule(const trans_rule *r,
402
+ Darray tag_array_key,
403
+ Darray tag_array_val,
404
+ Registry lexicon_hash,
405
+ Registry wordlist_hash,
406
+ Registry bigram_hash,
407
+ int EXTRAWDS
408
+ ) {
409
+
410
+ int count2, count3, tempcount;
411
+ char *tempstr2;
412
+ char *rule_text;
413
+
414
+ char tempstr_space[MAXWORDLEN+MAXAFFIXLEN], bigram_space[MAXWORDLEN*2];
415
+
416
+ int check_current_tag = (r->when[0] == 'f');
417
+ char *name = strdup( check_current_tag ? &r->when[1] : r->when );
418
+
419
+ for (count2=0;count2<Darray_len(tag_array_key);++count2) {
420
+
421
+ if (check_current_tag
422
+ ? (strcmp(Darray_get(tag_array_val, count2), r->old) != 0)
423
+ : (strcmp(Darray_get(tag_array_val, count2), r->new) == 0))
424
+ continue;
425
+
426
+ if (strcmp(name, "char") == 0) {
427
+ if(strpbrk(Darray_get(tag_array_key,count2), r->arg1)) {
428
+ change_the_tag_darray(tag_array_val,count2,r->new);
429
+ }
430
+ }
431
+ else if (strcmp(name, "deletepref") == 0) {
432
+ int arg1_len = atoi(r->arg2);
433
+
434
+ rule_text = Darray_get(tag_array_key,count2);
435
+ for (count3=0;count3<arg1_len;++count3) {
436
+ if (rule_text[count3] != r->arg1[count3])
437
+ break;}
438
+ if (count3 == arg1_len) {
439
+ rule_text += arg1_len;
440
+ if (Registry_get(lexicon_hash,(char *)rule_text) != NULL ||
441
+ (EXTRAWDS &&
442
+ Registry_get(wordlist_hash,(char *)rule_text) != NULL)){
443
+ change_the_tag_darray(tag_array_val,count2,r->new);}
444
+ }
445
+ }
446
+ else if (strcmp(name,"haspref") == 0) {
447
+ int arg1_len = atoi(r->arg2);
448
+
449
+ rule_text = Darray_get(tag_array_key,count2);
450
+ for (count3=0;count3<arg1_len;++count3) {
451
+ if (rule_text[count3] != r->arg1[count3])
452
+ break;}
453
+ if (count3 == arg1_len) {
454
+ change_the_tag_darray(tag_array_val,count2,r->new);}
455
+ }
456
+ else if (strcmp(name,"deletesuf") == 0) {
457
+ int arg1_len = atoi(r->arg2);
458
+
459
+ rule_text = Darray_get(tag_array_key,count2);
460
+ tempcount=strlen(rule_text)-arg1_len;
461
+ for (count3=tempcount;
462
+ count3<strlen(rule_text); ++count3) {
463
+ if (rule_text[count3] != r->arg1[count3-tempcount])
464
+ break;}
465
+ if (count3 == strlen(rule_text)) {
466
+ tempstr2 = strdup(rule_text);
467
+ tempstr2[tempcount] = '\0';
468
+ if (Registry_get(lexicon_hash,(char *)tempstr2) != NULL ||
469
+ (EXTRAWDS &&
470
+ Registry_get(wordlist_hash,(char *)tempstr2) != NULL)) {
471
+
472
+ change_the_tag_darray(tag_array_val,count2,r->new);}
473
+ free(tempstr2);
474
+ }
475
+ }
476
+ else if (strcmp(name,"hassuf") == 0) {
477
+ int arg1_len = atoi(r->arg2);
478
+
479
+ rule_text = Darray_get(tag_array_key,count2);
480
+ tempcount=strlen(rule_text)-arg1_len;
481
+ for (count3=tempcount;
482
+ count3<strlen(rule_text); ++count3) {
483
+ if (rule_text[count3] != r->arg1[count3-tempcount])
484
+ break;}
485
+ if (count3 == strlen(rule_text)) {
486
+
487
+ change_the_tag_darray(tag_array_val,count2,r->new);}
488
+ }
489
+ else if (strcmp(name,"addpref") == 0) {
490
+ snprintf(tempstr_space,MAXWORDLEN+MAXAFFIXLEN,"%s%s",
491
+ (char*)r->arg1,(char*)Darray_get(tag_array_key,count2));
492
+ if (Registry_get(lexicon_hash,(char *)tempstr_space) != NULL
493
+ ||
494
+ (EXTRAWDS &&
495
+ Registry_get(wordlist_hash,(char *)tempstr_space) != NULL)) {
496
+
497
+ change_the_tag_darray(tag_array_val,count2,r->new);}
498
+ }
499
+ else if (strcmp(name,"addsuf") == 0) {
500
+ snprintf(tempstr_space,MAXWORDLEN+MAXAFFIXLEN,"%s%s",
501
+ (char*)Darray_get(tag_array_key,count2),
502
+ (char*)r->arg1);
503
+ if (Registry_get(lexicon_hash,(char *)tempstr_space) != NULL
504
+ ||
505
+ (EXTRAWDS &&
506
+ Registry_get(wordlist_hash,(char *)tempstr_space) != NULL)){
507
+
508
+ change_the_tag_darray(tag_array_val,count2,r->new);}
509
+ }
510
+ else if (strcmp(name,"goodleft") == 0) {
511
+ snprintf(bigram_space,MAXWORDLEN*2,"%s %s",
512
+ (char*)Darray_get(tag_array_key,count2),(char*)r->arg1);
513
+ if (Registry_get(bigram_hash,(char *)bigram_space) != NULL) {
514
+
515
+ change_the_tag_darray(tag_array_val,count2,r->new);}
516
+ }
517
+ else if (strcmp(name,"goodright") == 0) {
518
+ snprintf(bigram_space,MAXWORDLEN*2,"%s %s",(char*)r->arg1,(char*)Darray_get(tag_array_key,count2));
519
+ if (Registry_get(bigram_hash,(char *)bigram_space) != NULL) {
520
+
521
+ change_the_tag_darray(tag_array_val,count2,r->new);}
522
+ }
523
+ }
524
+ free( name );
525
+ }
@@ -0,0 +1,42 @@
1
+
2
+ #ifndef _RULES_H_
3
+ #define _RULES_H_
4
+
5
+ #include "darray.h"
6
+ #include "registry.h"
7
+
8
+ typedef struct {
9
+ char *old;
10
+ char *new;
11
+ char *when;
12
+ char *arg1;
13
+ char *arg2;
14
+ } trans_rule;
15
+
16
+ trans_rule *parse_lexical_rule (const char *rule_text);
17
+
18
+ trans_rule *parse_contextual_rule (const char *rule_text);
19
+
20
+ void rule_destroy(trans_rule *r);
21
+
22
+ void change_the_tag(char **theentry, char *thetag, int theposition);
23
+
24
+ void apply_lexical_rule(const trans_rule *r,
25
+ Darray tag_array_key,
26
+ Darray tag_array_val,
27
+ Registry lexicon_hash,
28
+ Registry wordlist_hash,
29
+ Registry bigram_hash,
30
+ int EXTRAWDS
31
+ );
32
+
33
+ void apply_contextual_rule(const trans_rule *r,
34
+ char **word_corpus_array,
35
+ char **tag_corpus_array,
36
+ int corpus_size,
37
+ int RESTRICT_MOVE,
38
+ Registry WORDS,
39
+ Registry SEENTAGGING
40
+ );
41
+
42
+ #endif /* _RULES_H_ */
@@ -0,0 +1,20 @@
1
#ifndef _SYSDEP_H_
#define _SYSDEP_H_

/*
 * Portability shims so the same sources build with both standard C
 * compilers and compilers that do not define __STDC__.
 */

/* return type of functions that return nothing */
#define NORET void

#ifndef __STDC__

/* No __STDC__: use char * as the generic pointer and drop prototypes. */
typedef char * VOIDP;
typedef char * CONSTVOIDP;
#define NOARGS
#define PROTOTYPE(x) ()

#else

/* Standard C: real void pointers and full prototypes. */
typedef const void * CONSTVOIDP;  /* pointer to a non-modifiable object */
typedef void * VOIDP;
#define NOARGS void
#define PROTOTYPE(x) x

#endif

#endif /* ifndef _SYSDEP_H_ */