rbtagger 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +21 -0
- data/History.txt +4 -0
- data/LICENSE +21 -0
- data/License.txt +20 -0
- data/Manifest.txt +75 -0
- data/PostInstall.txt +7 -0
- data/README +7 -0
- data/README.txt +53 -0
- data/Rakefile +33 -0
- data/config/hoe.rb +74 -0
- data/config/requirements.rb +15 -0
- data/ext/rule_tagger/bool.h +38 -0
- data/ext/rule_tagger/darray.c +292 -0
- data/ext/rule_tagger/darray.h +125 -0
- data/ext/rule_tagger/darrayP.h +50 -0
- data/ext/rule_tagger/extconf.rb +14 -0
- data/ext/rule_tagger/lex.c +170 -0
- data/ext/rule_tagger/lex.h +49 -0
- data/ext/rule_tagger/memory.c +127 -0
- data/ext/rule_tagger/memory.h +20 -0
- data/ext/rule_tagger/rbtagger.c +252 -0
- data/ext/rule_tagger/registry.c +326 -0
- data/ext/rule_tagger/registry.h +129 -0
- data/ext/rule_tagger/registryP.h +46 -0
- data/ext/rule_tagger/ruby-compat.h +20 -0
- data/ext/rule_tagger/rules.c +525 -0
- data/ext/rule_tagger/rules.h +42 -0
- data/ext/rule_tagger/sysdep.h +20 -0
- data/ext/rule_tagger/tagger.c +110 -0
- data/ext/rule_tagger/tagger.h +46 -0
- data/ext/rule_tagger/useful.c +44 -0
- data/ext/rule_tagger/useful.h +51 -0
- data/ext/word_tagger/extconf.rb +7 -0
- data/ext/word_tagger/porter_stemmer.c +430 -0
- data/ext/word_tagger/porter_stemmer.h +19 -0
- data/ext/word_tagger/rtagger.cc +83 -0
- data/ext/word_tagger/tagger.cc +153 -0
- data/ext/word_tagger/tagger.h +27 -0
- data/ext/word_tagger/tagger.rb +8 -0
- data/ext/word_tagger/test/Makefile +22 -0
- data/ext/word_tagger/test/doc.txt +87 -0
- data/ext/word_tagger/test/test.cc +107 -0
- data/ext/word_tagger/test.rb +31 -0
- data/lib/brill/tagger.rb +225 -0
- data/lib/rbtagger/version.rb +9 -0
- data/lib/rbtagger.rb +6 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/script/txt2html +82 -0
- data/setup.rb +1585 -0
- data/tasks/deployment.rake +34 -0
- data/tasks/environment.rake +7 -0
- data/tasks/website.rake +17 -0
- data/test/CONTEXTUALRULEFILE +284 -0
- data/test/LEXICALRULEFILE +148 -0
- data/test/LEXICON +93696 -0
- data/test/docs/doc0.txt +20 -0
- data/test/docs/doc1.txt +11 -0
- data/test/docs/doc2.txt +52 -0
- data/test/docs/doc3.txt +128 -0
- data/test/docs/doc4.txt +337 -0
- data/test/docs/doc5.txt +497 -0
- data/test/docs/doc6.txt +116 -0
- data/test/docs/doc7.txt +101 -0
- data/test/docs/doc8.txt +25 -0
- data/test/docs/doc9.txt +84 -0
- data/test/tagger_test.rb +60 -0
- data/test/test_helper.rb +2 -0
- data/tools/rakehelp.rb +113 -0
- data/website/index.html +113 -0
- data/website/index.txt +53 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +138 -0
- data/website/template.html.erb +48 -0
- metadata +155 -0
@@ -0,0 +1,46 @@
|
|
1
|
+
#ifndef _regyP_h_
#define _regyP_h_

/*
 * Private declarations for the Registry implementation: the in-memory
 * layout of a registry and the helper macros used to move between the
 * opaque public handle (Registry) and that layout.
 */

#include "memory.h"
#include "sysdep.h"
#include "darray.h"

#include "registry.h"

/* private internal representation of the table */
typedef struct RegistryRecord_st {
    VOIDP name;                      /* presumably the lookup key -- confirm in registry.c */
    VOIDP obj;                       /* presumably the value stored under name -- confirm in registry.c */
    struct RegistryRecord_st *next;  /* next record in the same list */
} RegistryRecord;

#define DEFAULT_HT_SIZE 97 /* This should be prime */

/* The Registry representation */
typedef struct Registry_st {
    unsigned int ht_size;           /* presumably the number of hash_table slots -- confirm in registry.c */
    RegistryRecord **hash_table;    /* First record of directory */
    Registry_CompareFunc comp_fun;  /* Comparison function */
    Registry_HashFunc hash_fun;     /* Hash function */
    unsigned int record_count;      /* Number of records in the registry */
} Registry_rep;

/* private traversal routine used to implement Registry_fetch_contents() */
#ifdef __STDC__
static NORET add_to_darrays(VOIDP, VOIDP, VOIDP);
#else
static NORET add_to_darrays();
#endif

/* used when calling add_to_darrays() within Registry_fetch_contents() */
struct darray_pair {
    Darray key_darray;
    Darray value_darray;
};

/*
 * Handle <-> representation conversions and allocation helpers.
 * Every macro parameter is parenthesized so that an expression argument
 * (for example `base + 1`) is cast as a whole rather than only its first
 * operand.
 *
 * NOTE(review): `raise` has the same name as the standard raise() declared
 * in <signal.h>; a translation unit that includes both will have this
 * macro hide that function.
 */
#define raise(p_to_rep) ((Registry)(p_to_rep))
#define lower(obj) ((Registry_rep *)(obj))
#define create() ((Registry_rep *) Memory_allocate(sizeof (Registry_rep)))
#define destroy(p_to_rep) (Memory_free((VOIDP)(p_to_rep)))

#endif
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#ifndef _RUBY_COMPAT_HEADER_H
#define _RUBY_COMPAT_HEADER_H

/*
 * Small compatibility layer for the Ruby extension sources: a TRACE()
 * debugging macro and fallbacks for RSTRING_PTR / RSTRING_LEN when the
 * Ruby headers in use do not define them.
 */

#include <stdio.h> /* TRACE() expands to fprintf(stderr, ...) */

/*
 * NOTE(review): tracing is switched on for every build, exactly as before.
 * The guard only avoids a macro redefinition when the build already passes
 * -DDEBUG on the command line.
 */
#ifndef DEBUG
#define DEBUG
#endif

#ifdef DEBUG
/* Print "> file:line:function" to stderr. __func__ is the standard spelling of __FUNCTION__. */
#define TRACE() fprintf(stderr, "> %s:%d:%s\n", __FILE__, __LINE__, __func__)
#else
/* Expands to a harmless expression so `TRACE();` stays a valid statement. */
#define TRACE() ((void)0)
#endif

/* ruby 1.9 compat */
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif

#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif

#endif
|
@@ -0,0 +1,525 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include "useful.h"
|
5
|
+
#include "rules.h"
|
6
|
+
#include "lex.h"
|
7
|
+
#include "darray.h"
|
8
|
+
#include "registry.h"
|
9
|
+
#include "memory.h"
|
10
|
+
|
11
|
+
#define MAXTAGLEN 256 /* max char length of pos tags */
|
12
|
+
#define MAXWORDLEN 256 /* max char length of words */
|
13
|
+
#define MAXAFFIXLEN 5 /* max length of affixes being considered */
|
14
|
+
|
15
|
+
/*
 * Replace the tag stored at theentry[theposition] with a newly allocated
 * copy of thetag and release the previous string.
 *
 * theentry     array of heap-allocated tag strings
 * thetag       NUL-terminated tag to store (it is copied, not adopted)
 * theposition  index of the slot to overwrite
 *
 * The copy is made before the old string is freed, so the call is safe
 * even when thetag points at the string currently held in the slot. If
 * the copy cannot be allocated the slot keeps its previous tag instead of
 * being left NULL.
 */
void change_the_tag(char **theentry, char *thetag, int theposition)
{
    size_t size = strlen(thetag) + 1;
    char *copy = malloc(size);

    if (copy == NULL)
        return;

    memcpy(copy, thetag, size);
    free(theentry[theposition]);
    theentry[theposition] = copy;
}
|
22
|
+
|
23
|
+
void change_the_tag_darray(tag_array,theposition,thetag)
|
24
|
+
Darray tag_array;
|
25
|
+
int theposition;
|
26
|
+
char *thetag;
|
27
|
+
{
|
28
|
+
free(Darray_get(tag_array, theposition));
|
29
|
+
Darray_set(tag_array, theposition, strdup(thetag));
|
30
|
+
}
|
31
|
+
|
32
|
+
/*
 * Release a rule together with every string it holds.
 * Passing NULL is allowed and does nothing, mirroring free(NULL); fields
 * that are NULL are skipped by free() itself.
 */
void rule_destroy(trans_rule *r) {
    if (r == NULL)
        return;

    free(r->old);
    free(r->new);
    free(r->when);
    free(r->arg1);
    free(r->arg2);
    free(r);
}
|
40
|
+
|
41
|
+
trans_rule *parse_lexical_rule (const char *rule_text) {
|
42
|
+
trans_rule *rule = (trans_rule*) malloc(sizeof(trans_rule));
|
43
|
+
char **split_ptr = perl_split(rule_text);
|
44
|
+
|
45
|
+
/* The general rule-pattern is:
|
46
|
+
* [old] arg1 when [arg2] new
|
47
|
+
* 'old' is only present when 'when' starts with 'f'.
|
48
|
+
* 'arg2' is only present for a few 'when' types.
|
49
|
+
*/
|
50
|
+
|
51
|
+
|
52
|
+
int offset = 0;
|
53
|
+
|
54
|
+
/* Rule types starting with 'f' have an extra 'old' arg at the beginning */
|
55
|
+
if (*split_ptr[2] == 'f') {
|
56
|
+
rule->old = strdup(split_ptr[0]);
|
57
|
+
offset = 1;
|
58
|
+
} else {
|
59
|
+
rule->old = NULL;
|
60
|
+
}
|
61
|
+
|
62
|
+
rule->arg1 = strdup(split_ptr[0 + offset]);
|
63
|
+
rule->when = strdup(split_ptr[1 + offset]);
|
64
|
+
|
65
|
+
/* A few rules have a string-length argument too */
|
66
|
+
if (strstr(rule->when, "hassuf") ||
|
67
|
+
strstr(rule->when, "haspref") ||
|
68
|
+
strstr(rule->when, "addpref") ||
|
69
|
+
strstr(rule->when, "addsuf") ||
|
70
|
+
strstr(rule->when, "deletesuf") ||
|
71
|
+
strstr(rule->when, "deletepref") ) {
|
72
|
+
rule->arg2 = strdup(split_ptr[2 + offset]);
|
73
|
+
offset++;
|
74
|
+
} else {
|
75
|
+
rule->arg2 = NULL;
|
76
|
+
}
|
77
|
+
|
78
|
+
rule->new = strdup(split_ptr[2 + offset]);
|
79
|
+
|
80
|
+
perl_split_free( split_ptr );
|
81
|
+
|
82
|
+
return rule;
|
83
|
+
}
|
84
|
+
|
85
|
+
trans_rule *parse_contextual_rule (const char *rule_text) {
|
86
|
+
trans_rule *rule = (trans_rule*) malloc(sizeof(trans_rule));
|
87
|
+
char **split_ptr = perl_split(rule_text);
|
88
|
+
|
89
|
+
rule->old = strdup(split_ptr[0]);
|
90
|
+
rule->new = strdup(split_ptr[1]);
|
91
|
+
rule->when = strdup(split_ptr[2]);
|
92
|
+
rule->arg1 = strdup(split_ptr[3]);
|
93
|
+
|
94
|
+
/* The following rule-types take an additional argument */
|
95
|
+
if (strcmp(rule->when, "SURROUNDTAG") == 0 ||
|
96
|
+
strcmp(rule->when, "PREVBIGRAM") == 0 ||
|
97
|
+
strcmp(rule->when, "NEXTBIGRAM") == 0 ||
|
98
|
+
strcmp(rule->when, "LBIGRAM") == 0 ||
|
99
|
+
strcmp(rule->when, "WDPREVTAG") == 0 ||
|
100
|
+
strcmp(rule->when, "RBIGRAM") == 0 ||
|
101
|
+
strcmp(rule->when, "WDNEXTTAG") == 0 ||
|
102
|
+
strcmp(rule->when, "WDAND2BFR") == 0 ||
|
103
|
+
strcmp(rule->when, "WDAND2TAGBFR") == 0 ||
|
104
|
+
strcmp(rule->when, "WDAND2AFT") == 0 ||
|
105
|
+
strcmp(rule->when, "WDAND2TAGAFT") == 0 )
|
106
|
+
|
107
|
+
rule->arg2 = strdup(split_ptr[4]);
|
108
|
+
else
|
109
|
+
rule->arg2 = NULL;
|
110
|
+
|
111
|
+
perl_split_free( split_ptr );
|
112
|
+
|
113
|
+
return rule;
|
114
|
+
}
|
115
|
+
|
116
|
+
|
117
|
+
void apply_contextual_rule(const trans_rule *r,
|
118
|
+
char **word_corpus_array,
|
119
|
+
char **tag_corpus_array,
|
120
|
+
int corpus_size,
|
121
|
+
int RESTRICT_MOVE,
|
122
|
+
Registry WORDS,
|
123
|
+
Registry SEENTAGGING
|
124
|
+
) {
|
125
|
+
|
126
|
+
char atempstr2[256];
|
127
|
+
|
128
|
+
int count, tempcount1, tempcount2;
|
129
|
+
|
130
|
+
corpus_size--; /* Is used below as the index of the last element (dunno why...) */
|
131
|
+
|
132
|
+
/* fprintf(stderr,"R: OLD: %s NEW: %s WHEN: %s (%s).\n", r->old, r->new, r->when, r->arg1); */
|
133
|
+
|
134
|
+
for (count = 0; count <= corpus_size; ++count) {
|
135
|
+
if (strcmp(tag_corpus_array[count], r->old) == 0) {
|
136
|
+
|
137
|
+
sprintf(atempstr2,"%s %s", word_corpus_array[count], r->new);
|
138
|
+
|
139
|
+
if (! RESTRICT_MOVE ||
|
140
|
+
! Registry_get(WORDS, word_corpus_array[count]) ||
|
141
|
+
Registry_get(SEENTAGGING,atempstr2)) {
|
142
|
+
|
143
|
+
if (strcmp(r->when, "SURROUNDTAG") == 0) {
|
144
|
+
if (count < corpus_size && count > 0) {
|
145
|
+
if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 &&
|
146
|
+
strcmp(r->arg2, tag_corpus_array[count + 1]) == 0)
|
147
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
148
|
+
}
|
149
|
+
} else if (strcmp(r->when, "NEXTTAG") == 0) {
|
150
|
+
if (count < corpus_size) {
|
151
|
+
if (strcmp(r->arg1,tag_corpus_array[count + 1]) == 0)
|
152
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
153
|
+
}
|
154
|
+
}
|
155
|
+
else if (strcmp(r->when, "CURWD") == 0) {
|
156
|
+
if (strcmp(r->arg1, word_corpus_array[count]) == 0)
|
157
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
158
|
+
}
|
159
|
+
else if (strcmp(r->when, "NEXTWD") == 0) {
|
160
|
+
if (count < corpus_size) {
|
161
|
+
if (strcmp(r->arg1, word_corpus_array[count + 1]) == 0)
|
162
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
163
|
+
}
|
164
|
+
}
|
165
|
+
else if (strcmp(r->when, "RBIGRAM") == 0) {
|
166
|
+
if (count < corpus_size) {
|
167
|
+
if (strcmp(r->arg1, word_corpus_array[count]) ==
|
168
|
+
0 &&
|
169
|
+
strcmp(r->arg2, word_corpus_array[count+1]) ==
|
170
|
+
0)
|
171
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
172
|
+
}
|
173
|
+
}
|
174
|
+
else if (strcmp(r->when, "WDNEXTTAG") == 0) {
|
175
|
+
if (count < corpus_size) {
|
176
|
+
if (strcmp(r->arg1, word_corpus_array[count]) ==
|
177
|
+
0 &&
|
178
|
+
strcmp(r->arg2, tag_corpus_array[count+1]) ==
|
179
|
+
0)
|
180
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
181
|
+
}
|
182
|
+
}
|
183
|
+
|
184
|
+
else if (strcmp(r->when, "WDAND2AFT") == 0) {
|
185
|
+
if (count < corpus_size-1) {
|
186
|
+
if (strcmp(r->arg1, word_corpus_array[count]) ==
|
187
|
+
0 &&
|
188
|
+
strcmp(r->arg2, word_corpus_array[count+2]) ==
|
189
|
+
0)
|
190
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
191
|
+
}
|
192
|
+
}
|
193
|
+
else if (strcmp(r->when, "WDAND2TAGAFT") == 0) {
|
194
|
+
if (count < corpus_size-1) {
|
195
|
+
if (strcmp(r->arg1, word_corpus_array[count]) ==
|
196
|
+
0 &&
|
197
|
+
strcmp(r->arg2, tag_corpus_array[count+2]) ==
|
198
|
+
0)
|
199
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
200
|
+
}
|
201
|
+
}
|
202
|
+
|
203
|
+
else if (strcmp(r->when, "NEXT2TAG") == 0) {
|
204
|
+
if (count < corpus_size - 1) {
|
205
|
+
if (strcmp(r->arg1, tag_corpus_array[count + 2]) == 0)
|
206
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
207
|
+
}
|
208
|
+
} else if (strcmp(r->when, "NEXT2WD") == 0) {
|
209
|
+
if (count < corpus_size - 1) {
|
210
|
+
if (strcmp(r->arg1, word_corpus_array[count + 2]) == 0)
|
211
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
212
|
+
}
|
213
|
+
} else if (strcmp(r->when, "NEXTBIGRAM") == 0) {
|
214
|
+
if (count < corpus_size - 1) {
|
215
|
+
if
|
216
|
+
(strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 &&
|
217
|
+
strcmp(r->arg2, tag_corpus_array[count + 2]) == 0)
|
218
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
219
|
+
}
|
220
|
+
} else if (strcmp(r->when, "NEXT1OR2TAG") == 0) {
|
221
|
+
if (count < corpus_size) {
|
222
|
+
if (count < corpus_size-1)
|
223
|
+
tempcount1 = count+2;
|
224
|
+
else
|
225
|
+
tempcount1 = count+1;
|
226
|
+
if
|
227
|
+
(strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 ||
|
228
|
+
strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0)
|
229
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
230
|
+
}
|
231
|
+
} else if (strcmp(r->when, "NEXT1OR2WD") == 0) {
|
232
|
+
if (count < corpus_size) {
|
233
|
+
if (count < corpus_size-1)
|
234
|
+
tempcount1 = count+2;
|
235
|
+
else
|
236
|
+
tempcount1 = count+1;
|
237
|
+
if
|
238
|
+
(strcmp(r->arg1, word_corpus_array[count + 1]) == 0 ||
|
239
|
+
strcmp(r->arg1, word_corpus_array[tempcount1]) == 0)
|
240
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
241
|
+
}
|
242
|
+
} else if (strcmp(r->when, "NEXT1OR2OR3TAG") == 0) {
|
243
|
+
if (count < corpus_size) {
|
244
|
+
if (count < corpus_size -1)
|
245
|
+
tempcount1 = count+2;
|
246
|
+
else
|
247
|
+
tempcount1 = count+1;
|
248
|
+
if (count < corpus_size-2)
|
249
|
+
tempcount2 = count+3;
|
250
|
+
else
|
251
|
+
tempcount2 =count+1;
|
252
|
+
if
|
253
|
+
(strcmp(r->arg1, tag_corpus_array[count + 1]) == 0 ||
|
254
|
+
strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0 ||
|
255
|
+
strcmp(r->arg1, tag_corpus_array[tempcount2]) == 0)
|
256
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
257
|
+
}
|
258
|
+
} else if (strcmp(r->when, "NEXT1OR2OR3WD") == 0) {
|
259
|
+
if (count < corpus_size) {
|
260
|
+
if (count < corpus_size -1)
|
261
|
+
tempcount1 = count+2;
|
262
|
+
else
|
263
|
+
tempcount1 = count+1;
|
264
|
+
if (count < corpus_size-2)
|
265
|
+
tempcount2 = count+3;
|
266
|
+
else
|
267
|
+
tempcount2 =count+1;
|
268
|
+
if
|
269
|
+
(strcmp(r->arg1, word_corpus_array[count + 1]) == 0 ||
|
270
|
+
strcmp(r->arg1, word_corpus_array[tempcount1]) == 0 ||
|
271
|
+
strcmp(r->arg1, word_corpus_array[tempcount2]) == 0)
|
272
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
273
|
+
}
|
274
|
+
} else if (strcmp(r->when, "PREVTAG") == 0) {
|
275
|
+
if (count > 0) {
|
276
|
+
if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0) {
|
277
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
278
|
+
}
|
279
|
+
}
|
280
|
+
} else if (strcmp(r->when, "PREVWD") == 0) {
|
281
|
+
if (count > 0) {
|
282
|
+
if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0) {
|
283
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
284
|
+
}
|
285
|
+
}
|
286
|
+
}
|
287
|
+
else if (strcmp(r->when, "LBIGRAM") == 0) {
|
288
|
+
if (count > 0) {
|
289
|
+
if (strcmp(r->arg2, word_corpus_array[count]) ==
|
290
|
+
0 &&
|
291
|
+
strcmp(r->arg1, word_corpus_array[count-1]) ==
|
292
|
+
0)
|
293
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
294
|
+
}
|
295
|
+
}
|
296
|
+
else if (strcmp(r->when, "WDPREVTAG") == 0) {
|
297
|
+
if (count > 0) {
|
298
|
+
if (strcmp(r->arg2, word_corpus_array[count]) ==
|
299
|
+
0 &&
|
300
|
+
strcmp(r->arg1, tag_corpus_array[count-1]) ==
|
301
|
+
0)
|
302
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
303
|
+
}
|
304
|
+
}
|
305
|
+
else if (strcmp(r->when, "WDAND2BFR") == 0) {
|
306
|
+
if (count > 1) {
|
307
|
+
if (strcmp(r->arg2, word_corpus_array[count]) ==
|
308
|
+
0 &&
|
309
|
+
strcmp(r->arg1, word_corpus_array[count-2]) ==
|
310
|
+
0)
|
311
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
312
|
+
}
|
313
|
+
}
|
314
|
+
else if (strcmp(r->when, "WDAND2TAGBFR") == 0) {
|
315
|
+
if (count > 1) {
|
316
|
+
if (strcmp(r->arg2, word_corpus_array[count]) ==
|
317
|
+
0 &&
|
318
|
+
strcmp(r->arg1, tag_corpus_array[count-2]) ==
|
319
|
+
0)
|
320
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
321
|
+
}
|
322
|
+
}
|
323
|
+
|
324
|
+
else if (strcmp(r->when, "PREV2TAG") == 0) {
|
325
|
+
if (count > 1) {
|
326
|
+
if (strcmp(r->arg1, tag_corpus_array[count - 2]) == 0)
|
327
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
328
|
+
}
|
329
|
+
} else if (strcmp(r->when, "PREV2WD") == 0) {
|
330
|
+
if (count > 1) {
|
331
|
+
if (strcmp(r->arg1, word_corpus_array[count - 2]) == 0)
|
332
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
333
|
+
}
|
334
|
+
} else if (strcmp(r->when, "PREV1OR2TAG") == 0) {
|
335
|
+
if (count > 0) {
|
336
|
+
if (count > 1)
|
337
|
+
tempcount1 = count-2;
|
338
|
+
else
|
339
|
+
tempcount1 = count-1;
|
340
|
+
if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 ||
|
341
|
+
strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0)
|
342
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
343
|
+
}
|
344
|
+
} else if (strcmp(r->when, "PREV1OR2WD") == 0) {
|
345
|
+
if (count > 0) {
|
346
|
+
if (count > 1)
|
347
|
+
tempcount1 = count-2;
|
348
|
+
else
|
349
|
+
tempcount1 = count-1;
|
350
|
+
if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0 ||
|
351
|
+
strcmp(r->arg1, word_corpus_array[tempcount1]) == 0)
|
352
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
353
|
+
}
|
354
|
+
} else if (strcmp(r->when, "PREV1OR2OR3TAG") == 0) {
|
355
|
+
if (count > 0) {
|
356
|
+
if (count>1)
|
357
|
+
tempcount1 = count-2;
|
358
|
+
else
|
359
|
+
tempcount1 = count-1;
|
360
|
+
if (count >2)
|
361
|
+
tempcount2 = count-3;
|
362
|
+
else
|
363
|
+
tempcount2 = count-1;
|
364
|
+
if (strcmp(r->arg1, tag_corpus_array[count - 1]) == 0 ||
|
365
|
+
strcmp(r->arg1, tag_corpus_array[tempcount1]) == 0 ||
|
366
|
+
strcmp(r->arg1, tag_corpus_array[tempcount2]) == 0)
|
367
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
368
|
+
}
|
369
|
+
} else if (strcmp(r->when, "PREV1OR2OR3WD") == 0) {
|
370
|
+
if (count > 0) {
|
371
|
+
if (count>1)
|
372
|
+
tempcount1 = count-2;
|
373
|
+
else
|
374
|
+
tempcount1 = count-1;
|
375
|
+
if (count >2)
|
376
|
+
tempcount2 = count-3;
|
377
|
+
else
|
378
|
+
tempcount2 = count-1;
|
379
|
+
if (strcmp(r->arg1, word_corpus_array[count - 1]) == 0 ||
|
380
|
+
strcmp(r->arg1, word_corpus_array[tempcount1]) == 0 ||
|
381
|
+
strcmp(r->arg1, word_corpus_array[tempcount2]) == 0)
|
382
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
383
|
+
}
|
384
|
+
} else if (strcmp(r->when, "PREVBIGRAM") == 0) {
|
385
|
+
if (count > 1) {
|
386
|
+
if (strcmp(r->arg2, tag_corpus_array[count - 1]) == 0 &&
|
387
|
+
strcmp(r->arg1, tag_corpus_array[count - 2]) == 0)
|
388
|
+
change_the_tag(tag_corpus_array, r->new, count);
|
389
|
+
}
|
390
|
+
}
|
391
|
+
else
|
392
|
+
fprintf(stderr,
|
393
|
+
"ERROR: %s is not an allowable transform type\n",
|
394
|
+
r->when);
|
395
|
+
}
|
396
|
+
}
|
397
|
+
}
|
398
|
+
}
|
399
|
+
|
400
|
+
|
401
|
+
void apply_lexical_rule(const trans_rule *r,
|
402
|
+
Darray tag_array_key,
|
403
|
+
Darray tag_array_val,
|
404
|
+
Registry lexicon_hash,
|
405
|
+
Registry wordlist_hash,
|
406
|
+
Registry bigram_hash,
|
407
|
+
int EXTRAWDS
|
408
|
+
) {
|
409
|
+
|
410
|
+
int count2, count3, tempcount;
|
411
|
+
char *tempstr2;
|
412
|
+
char *rule_text;
|
413
|
+
|
414
|
+
char tempstr_space[MAXWORDLEN+MAXAFFIXLEN], bigram_space[MAXWORDLEN*2];
|
415
|
+
|
416
|
+
int check_current_tag = (r->when[0] == 'f');
|
417
|
+
char *name = strdup( check_current_tag ? &r->when[1] : r->when );
|
418
|
+
|
419
|
+
for (count2=0;count2<Darray_len(tag_array_key);++count2) {
|
420
|
+
|
421
|
+
if (check_current_tag
|
422
|
+
? (strcmp(Darray_get(tag_array_val, count2), r->old) != 0)
|
423
|
+
: (strcmp(Darray_get(tag_array_val, count2), r->new) == 0))
|
424
|
+
continue;
|
425
|
+
|
426
|
+
if (strcmp(name, "char") == 0) {
|
427
|
+
if(strpbrk(Darray_get(tag_array_key,count2), r->arg1)) {
|
428
|
+
change_the_tag_darray(tag_array_val,count2,r->new);
|
429
|
+
}
|
430
|
+
}
|
431
|
+
else if (strcmp(name, "deletepref") == 0) {
|
432
|
+
int arg1_len = atoi(r->arg2);
|
433
|
+
|
434
|
+
rule_text = Darray_get(tag_array_key,count2);
|
435
|
+
for (count3=0;count3<arg1_len;++count3) {
|
436
|
+
if (rule_text[count3] != r->arg1[count3])
|
437
|
+
break;}
|
438
|
+
if (count3 == arg1_len) {
|
439
|
+
rule_text += arg1_len;
|
440
|
+
if (Registry_get(lexicon_hash,(char *)rule_text) != NULL ||
|
441
|
+
(EXTRAWDS &&
|
442
|
+
Registry_get(wordlist_hash,(char *)rule_text) != NULL)){
|
443
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
444
|
+
}
|
445
|
+
}
|
446
|
+
else if (strcmp(name,"haspref") == 0) {
|
447
|
+
int arg1_len = atoi(r->arg2);
|
448
|
+
|
449
|
+
rule_text = Darray_get(tag_array_key,count2);
|
450
|
+
for (count3=0;count3<arg1_len;++count3) {
|
451
|
+
if (rule_text[count3] != r->arg1[count3])
|
452
|
+
break;}
|
453
|
+
if (count3 == arg1_len) {
|
454
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
455
|
+
}
|
456
|
+
else if (strcmp(name,"deletesuf") == 0) {
|
457
|
+
int arg1_len = atoi(r->arg2);
|
458
|
+
|
459
|
+
rule_text = Darray_get(tag_array_key,count2);
|
460
|
+
tempcount=strlen(rule_text)-arg1_len;
|
461
|
+
for (count3=tempcount;
|
462
|
+
count3<strlen(rule_text); ++count3) {
|
463
|
+
if (rule_text[count3] != r->arg1[count3-tempcount])
|
464
|
+
break;}
|
465
|
+
if (count3 == strlen(rule_text)) {
|
466
|
+
tempstr2 = strdup(rule_text);
|
467
|
+
tempstr2[tempcount] = '\0';
|
468
|
+
if (Registry_get(lexicon_hash,(char *)tempstr2) != NULL ||
|
469
|
+
(EXTRAWDS &&
|
470
|
+
Registry_get(wordlist_hash,(char *)tempstr2) != NULL)) {
|
471
|
+
|
472
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
473
|
+
free(tempstr2);
|
474
|
+
}
|
475
|
+
}
|
476
|
+
else if (strcmp(name,"hassuf") == 0) {
|
477
|
+
int arg1_len = atoi(r->arg2);
|
478
|
+
|
479
|
+
rule_text = Darray_get(tag_array_key,count2);
|
480
|
+
tempcount=strlen(rule_text)-arg1_len;
|
481
|
+
for (count3=tempcount;
|
482
|
+
count3<strlen(rule_text); ++count3) {
|
483
|
+
if (rule_text[count3] != r->arg1[count3-tempcount])
|
484
|
+
break;}
|
485
|
+
if (count3 == strlen(rule_text)) {
|
486
|
+
|
487
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
488
|
+
}
|
489
|
+
else if (strcmp(name,"addpref") == 0) {
|
490
|
+
snprintf(tempstr_space,MAXWORDLEN+MAXAFFIXLEN,"%s%s",
|
491
|
+
(char*)r->arg1,(char*)Darray_get(tag_array_key,count2));
|
492
|
+
if (Registry_get(lexicon_hash,(char *)tempstr_space) != NULL
|
493
|
+
||
|
494
|
+
(EXTRAWDS &&
|
495
|
+
Registry_get(wordlist_hash,(char *)tempstr_space) != NULL)) {
|
496
|
+
|
497
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
498
|
+
}
|
499
|
+
else if (strcmp(name,"addsuf") == 0) {
|
500
|
+
snprintf(tempstr_space,MAXWORDLEN+MAXAFFIXLEN,"%s%s",
|
501
|
+
(char*)Darray_get(tag_array_key,count2),
|
502
|
+
(char*)r->arg1);
|
503
|
+
if (Registry_get(lexicon_hash,(char *)tempstr_space) != NULL
|
504
|
+
||
|
505
|
+
(EXTRAWDS &&
|
506
|
+
Registry_get(wordlist_hash,(char *)tempstr_space) != NULL)){
|
507
|
+
|
508
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
509
|
+
}
|
510
|
+
else if (strcmp(name,"goodleft") == 0) {
|
511
|
+
snprintf(bigram_space,MAXWORDLEN*2,"%s %s",
|
512
|
+
(char*)Darray_get(tag_array_key,count2),(char*)r->arg1);
|
513
|
+
if (Registry_get(bigram_hash,(char *)bigram_space) != NULL) {
|
514
|
+
|
515
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
516
|
+
}
|
517
|
+
else if (strcmp(name,"goodright") == 0) {
|
518
|
+
snprintf(bigram_space,MAXWORDLEN*2,"%s %s",(char*)r->arg1,(char*)Darray_get(tag_array_key,count2));
|
519
|
+
if (Registry_get(bigram_hash,(char *)bigram_space) != NULL) {
|
520
|
+
|
521
|
+
change_the_tag_darray(tag_array_val,count2,r->new);}
|
522
|
+
}
|
523
|
+
}
|
524
|
+
free( name );
|
525
|
+
}
|
@@ -0,0 +1,42 @@
|
|
1
|
+
|
2
|
+
#ifndef _RULES_H_
#define _RULES_H_

#include "darray.h"
#include "registry.h"

/*
 * One transformation rule: retag from `old` to `new` when the condition
 * named by `when` holds for the arguments arg1 / arg2. Every field is a
 * separately allocated string or NULL; rule_destroy() frees them all.
 */
typedef struct {
  char *old;   /* tag the rule applies to; NULL for lexical rules whose 'when' has no leading 'f' */
  char *new;   /* tag assigned when the rule fires */
  char *when;  /* name of the rule type */
  char *arg1;  /* first argument of the rule type */
  char *arg2;  /* second argument; NULL for rule types that take only one */
} trans_rule;

/* Build a rule from one line of a lexical rule file. */
trans_rule *parse_lexical_rule (const char *rule_text);

/* Build a rule from one line of a contextual rule file. */
trans_rule *parse_contextual_rule (const char *rule_text);

/* Free a rule and the strings it holds. */
void rule_destroy(trans_rule *r);

/* Free theentry[theposition] and store a copy of thetag in its place. */
void change_the_tag(char **theentry, char *thetag, int theposition);

/* Apply a lexical rule to each word in tag_array_key, updating tag_array_val. */
void apply_lexical_rule(const trans_rule *r,
                        Darray tag_array_key, Darray tag_array_val,
                        Registry lexicon_hash, Registry wordlist_hash,
                        Registry bigram_hash,
                        int EXTRAWDS);

/* Apply a contextual rule across the corpus, updating tag_corpus_array. */
void apply_contextual_rule(const trans_rule *r,
                           char **word_corpus_array, char **tag_corpus_array,
                           int corpus_size,
                           int RESTRICT_MOVE,
                           Registry WORDS, Registry SEENTAGGING);

#endif /* _RULES_H_ */
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#ifndef _SYSDEP_H_
#define _SYSDEP_H_

/* Spelling used for the return type of functions that return nothing. */
#define NORET void

/*
 * Portability shims selected by whether the compiler defines __STDC__:
 *   VOIDP         generic object pointer
 *   CONSTVOIDP    pointer to a non-modifiable void object
 *   NOARGS        parameter list of a function taking no arguments
 *   PROTOTYPE(x)  parameter list x where prototypes exist, () otherwise
 */
#ifndef __STDC__

/* No prototypes and no void pointers: fall back to char pointers. */
typedef char * VOIDP;
typedef char * CONSTVOIDP;
#define NOARGS
#define PROTOTYPE(x) ()

#else

typedef const void * CONSTVOIDP;
typedef void * VOIDP;
#define NOARGS void
#define PROTOTYPE(x) x

#endif

#endif /* ifndef _SYSDEP_H_ */
|