scws4r 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/ext/scws4r/rule.c ADDED
@@ -0,0 +1,407 @@
1
+ /**
2
+ * @file rule.c (auto surname & areaname & special group)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #ifdef WIN32
13
+ # include "config_win32.h"
14
+ #endif
15
+
16
+ #include "rule.h"
17
+ #include <stdio.h>
18
+ #include <stdlib.h>
19
+ #include <string.h>
20
+
21
+ static inline int _rule_index_get(rule_t r, const char *name)
22
+ {
23
+ int i;
24
+ for (i = 0; i < SCWS_RULE_MAX; i++)
25
+ {
26
+ if (r->items[i].name[0] == '\0')
27
+ break;
28
+
29
+ if (!strcasecmp(r->items[i].name, name))
30
+ return i;
31
+ }
32
+ return -1;
33
+ }
34
+
35
+ rule_t scws_rule_new(const char *fpath, unsigned char *mblen)
36
+ {
37
+ FILE *fp;
38
+ rule_t r;
39
+ rule_item_t cr;
40
+ int i, j, rbl, aflag;
41
+ rule_attr_t a, rtail;
42
+ unsigned char buf[512], *str, *ptr, *qtr;
43
+
44
+ /* loaded or open file failed */
45
+ if ((fp = fopen(fpath, "r")) == NULL)
46
+ return NULL;
47
+
48
+ /* alloc the memory */
49
+ r = (rule_t) malloc(sizeof(rule_st));
50
+ memset(r, 0, sizeof(rule_st));
51
+ r->ref = 1;
52
+
53
+ /* quick scan to add the name to list */
54
+ i = j = rbl = aflag = 0;
55
+ while (fgets(buf, sizeof(buf) - 1, fp))
56
+ {
57
+ if (buf[0] != '[' || !(ptr = strchr(buf, ']')))
58
+ continue;
59
+
60
+ str = buf + 1;
61
+ *ptr = '\0';
62
+ if (ptr == str || (ptr - str) > 15 || !strcasecmp(str, "attrs"))
63
+ continue;
64
+
65
+ if (_rule_index_get(r, str) >= 0)
66
+ continue;
67
+
68
+ strcpy(r->items[i].name, str);
69
+ r->items[i].tf = 5.0;
70
+ r->items[i].idf = 3.5;
71
+ strncpy(r->items[i].attr, "un", 2);
72
+ if (!strcasecmp(str, "special"))
73
+ r->items[i].bit = SCWS_RULE_SPECIAL;
74
+ else if (!strcasecmp(str, "nostats"))
75
+ r->items[i].bit = SCWS_RULE_NOSTATS;
76
+ else
77
+ {
78
+ r->items[i].bit = (1 << j);
79
+ j++;
80
+ }
81
+
82
+ if (++i >= SCWS_RULE_MAX)
83
+ break;
84
+ }
85
+ rewind(fp);
86
+
87
+ /* load the tree data */
88
+ if ((r->tree = xtree_new(0, 1)) == NULL)
89
+ {
90
+ free(r);
91
+ return NULL;
92
+ }
93
+ cr = NULL;
94
+ while (fgets(buf, sizeof(buf) - 1, fp))
95
+ {
96
+ if (buf[0] == ';')
97
+ continue;
98
+
99
+ if (buf[0] == '[')
100
+ {
101
+ cr = NULL;
102
+ str = buf + 1;
103
+ aflag = 0;
104
+ if ((ptr = strchr(str, ']')) != NULL)
105
+ {
106
+ *ptr = '\0';
107
+ if (!strcasecmp(str, "attrs"))
108
+ {
109
+ aflag = 1;
110
+ }
111
+ else if ((i = _rule_index_get(r, str)) >= 0)
112
+ {
113
+ rbl = 1; /* default read by line = yes */
114
+ cr = &r->items[i];
115
+ }
116
+ }
117
+ continue;
118
+ }
119
+
120
+ /* attr flag open? */
121
+ if (aflag == 1)
122
+ {
123
+ /* parse the attr line */
124
+ str = buf;
125
+ while (*str == ' ' || *str == '\t') str++;
126
+ if ((ptr = strchr(str, '+')) == NULL) continue;
127
+ *ptr++ = '\0';
128
+ if ((qtr = strchr(ptr, '=')) == NULL) continue;
129
+ *qtr++ = '\0';
130
+
131
+ /* create new memory */
132
+ a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr));
133
+ memset(a, 0, sizeof(struct scws_rule_attr));
134
+
135
+ /* get ratio */
136
+ while (*qtr == ' ' || *qtr == '\t') qtr++;
137
+ a->ratio = (short) atoi(qtr);
138
+ if (a->ratio < 1)
139
+ a->ratio = 1;
140
+ a->npath[0] = a->npath[1] = 0xff;
141
+
142
+ /* read attr1 & npath1? */
143
+ a->attr1[0] = *str++;
144
+ if (*str && *str != '(' && *str != ' ' && *str != '\t')
145
+ a->attr1[1] = *str++;
146
+ while (*str && *str != '(') str++;
147
+ if (*str == '(')
148
+ {
149
+ str++;
150
+ if ((qtr = strchr(str, ')')) != NULL)
151
+ {
152
+ *qtr = '\0';
153
+ a->npath[0] = (unsigned char) atoi(str);
154
+ if (a->npath[0] > 0)
155
+ a->npath[0]--;
156
+ else
157
+ a->npath[0] = 0xff;
158
+ }
159
+ }
160
+
161
+ /* read attr1 & npath2? */
162
+ str = ptr;
163
+ while (*str == ' ' || *str == '\t') str++;
164
+ a->attr2[0] = *str++;
165
+ if (*str && *str != '(' && *str != ' ' && *str != '\t')
166
+ a->attr2[1] = *str++;
167
+ while (*str && *str != '(') str++;
168
+ if (*str == '(')
169
+ {
170
+ str++;
171
+ if ((qtr = strchr(str, ')')) != NULL)
172
+ {
173
+ *qtr = '\0';
174
+ a->npath[1] = (unsigned char) atoi(str);
175
+ if (a->npath[1] > 0)
176
+ a->npath[1]--;
177
+ else
178
+ a->npath[1] = 0xff;
179
+ }
180
+ }
181
+
182
+ //printf("%c%c(%d)+%c%c(%d)=%d\n", a->attr1[0], a->attr1[1] ? a->attr1[1] : ' ', a->npath[0],
183
+ // a->attr2[0], a->attr2[1] ? a->attr2[1] : ' ', a->npath[1], a->ratio);
184
+
185
+ /* append to the chain list */
186
+ if (r->attr == NULL)
187
+ r->attr = rtail = a;
188
+ else
189
+ {
190
+ rtail->next = a;
191
+ rtail = a;
192
+ }
193
+
194
+ continue;
195
+ }
196
+
197
+ if (cr == NULL)
198
+ continue;
199
+
200
+ /* param set: line|znum|include|exclude|type|tf|idf|attr */
201
+ if (buf[0] == ':')
202
+ {
203
+ str = buf + 1;
204
+ if (!(ptr = strchr(str, '=')))
205
+ continue;
206
+ while (*str == ' ' || *str == '\t') str++;
207
+
208
+ qtr = ptr + 1;
209
+ while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--;
210
+ *ptr = '\0';
211
+ ptr = str;
212
+ str = qtr;
213
+ while (*str == ' ' || *str == '\t') str++;
214
+
215
+ if (!strcmp(ptr, "line"))
216
+ rbl = (*str == 'N' || *str == 'n') ? 0 : 1;
217
+ else if (!strcmp(ptr, "tf"))
218
+ cr->tf = (float) atof(str);
219
+ else if (!strcmp(ptr, "idf"))
220
+ cr->idf = (float) atof(str);
221
+ else if (!strcmp(ptr, "attr"))
222
+ strncpy(cr->attr, str, 2);
223
+ else if (!strcmp(ptr, "znum"))
224
+ {
225
+ if ((ptr = strchr(str, ',')) != NULL)
226
+ {
227
+ *ptr++ = '\0';
228
+ while (*ptr == ' ' || *ptr == '\t') ptr++;
229
+ cr->zmax = atoi(ptr);
230
+ cr->flag |= SCWS_ZRULE_RANGE;
231
+ }
232
+ cr->zmin = atoi(str);
233
+ }
234
+ else if (!strcmp(ptr, "type"))
235
+ {
236
+ if (!strncmp(str, "prefix", 6))
237
+ cr->flag |= SCWS_ZRULE_PREFIX;
238
+ else if (!strncmp(str, "suffix", 6))
239
+ cr->flag |= SCWS_ZRULE_SUFFIX;
240
+ }
241
+ else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude"))
242
+ {
243
+ unsigned int *clude;
244
+
245
+ if (!strcmp(ptr, "include"))
246
+ {
247
+ clude = &cr->inc;
248
+ cr->flag |= SCWS_ZRULE_INCLUDE;
249
+ }
250
+ else
251
+ {
252
+ clude = &cr->exc;
253
+ cr->flag |= SCWS_ZRULE_EXCLUDE;
254
+ }
255
+
256
+ while ((ptr = strchr(str, ',')) != NULL)
257
+ {
258
+ while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--;
259
+ *ptr = '\0';
260
+ if ((i = _rule_index_get(r, str)) >= 0)
261
+ *clude |= r->items[i].bit;
262
+
263
+ str = ptr + 1;
264
+ while (*str == ' ' || *str == '\t' || *str == ',') str++;
265
+ }
266
+
267
+ ptr = strlen(str) + str;
268
+ while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
269
+ *ptr = '\0';
270
+ if (ptr > str && (i = _rule_index_get(r, str)))
271
+ *clude |= r->items[i].bit;
272
+ }
273
+ continue;
274
+ }
275
+
276
+ /* read the entries */
277
+ str = buf;
278
+ while (*str == ' ' || *str == '\t') str++;
279
+ ptr = str + strlen(str);
280
+ while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
281
+ *ptr = '\0';
282
+
283
+ /* emptry line */
284
+ if (ptr == str)
285
+ continue;
286
+
287
+ if (rbl)
288
+ xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str);
289
+ else
290
+ {
291
+ while (str < ptr)
292
+ {
293
+ j = mblen[(*str)];
294
+
295
+ #ifdef DEBUG
296
+ /* try to check repeat */
297
+ if ((i = (int) xtree_nget(r->tree, str, j, NULL)) != 0)
298
+ fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, ((rule_item_t) i)->name, j, str);
299
+ #endif
300
+ xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j);
301
+ str += j;
302
+ }
303
+ }
304
+ }
305
+ fclose(fp);
306
+
307
+ /* optimize the tree */
308
+ xtree_optimize(r->tree);
309
+ return r;
310
+ }
311
+
312
+ /* fork rule */
313
+ rule_t scws_rule_fork(rule_t r)
314
+ {
315
+ if (r != NULL)
316
+ r->ref++;
317
+ return r;
318
+ }
319
+
320
+ /* free rule */
321
+ void scws_rule_free(rule_t r)
322
+ {
323
+ if (r)
324
+ {
325
+ r->ref--;
326
+ if (r->ref == 0)
327
+ {
328
+ rule_attr_t a, b;
329
+
330
+ xtree_free(r->tree);
331
+ a = r->attr;
332
+ while (a != NULL)
333
+ {
334
+ b = a;
335
+ a = b->next;
336
+ free(b);
337
+ }
338
+ free(r);
339
+ }
340
+ }
341
+ }
342
+
343
+ /* get the rule */
344
+ rule_item_t scws_rule_get(rule_t r, const char *str, int len)
345
+ {
346
+ if (!r)
347
+ return NULL;
348
+
349
+ return((rule_item_t) xtree_nget(r->tree, str, len, NULL));
350
+ }
351
+
352
+ /* check the bit with str */
353
+ int scws_rule_checkbit(rule_t r, const char *str, int len, unsigned int bit)
354
+ {
355
+ rule_item_t ri;
356
+
357
+ if (!r)
358
+ return 0;
359
+
360
+ ri = (rule_item_t) xtree_nget(r->tree, str, len, NULL);
361
+ if ((ri != NULL) && (ri->bit & bit))
362
+ return 1;
363
+
364
+ return 0;
365
+ }
366
+
367
+ /* get rule attr x */
368
+ #define EQUAL_RULE_ATTR(x,y) ((y[0]=='*'||y[0]==x[0])&&(y[1]=='\0'||y[1]==x[1]))
369
+ #define EQUAL_RULE_NPATH(x,y) ((y[0]==0xff||y[0]==x[0])&&(y[1]==0xff||y[1]==x[1]))
370
+
371
+ int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath)
372
+ {
373
+ rule_attr_t a;
374
+ int ret = 1;
375
+
376
+ if (!r || (a = r->attr) == NULL)
377
+ return ret;
378
+
379
+ while (a != NULL)
380
+ {
381
+ if (EQUAL_RULE_ATTR(attr1, a->attr1) && EQUAL_RULE_ATTR(attr2, a->attr2) && EQUAL_RULE_NPATH(npath, a->npath))
382
+ {
383
+ ret = (int) a->ratio;
384
+ break;
385
+ }
386
+ a = a->next;
387
+ }
388
+ return ret;
389
+ }
390
+
391
+ #undef EQUAL_RULE_ATTR
392
+ #undef EQUAL_RULE_NPATH
393
+
394
+ /* check the rule */
395
+ int scws_rule_check(rule_t r, rule_item_t cr, const char *str, int len)
396
+ {
397
+ if (!r)
398
+ return 0;
399
+
400
+ if ((cr->flag & SCWS_ZRULE_INCLUDE) && !scws_rule_checkbit(r, str, len, cr->inc))
401
+ return 0;
402
+
403
+ if ((cr->flag & SCWS_ZRULE_EXCLUDE) && scws_rule_checkbit(r, str, len, cr->exc))
404
+ return 0;
405
+
406
+ return 1;
407
+ }
data/ext/scws4r/rule.h ADDED
@@ -0,0 +1,83 @@
1
+ /**
2
+ * @file rule.h
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_RULE_20070525_H_
9
+ #define _SCWS_RULE_20070525_H_
10
+
11
+ /* xtree required */
12
+ #include "xtree.h"
13
+
14
/* number of slots in scws_rule.items */
#define SCWS_RULE_MAX		32
/* bit values given to the "special" and "nostats" sections */
#define SCWS_RULE_SPECIAL	0x80000000
#define SCWS_RULE_NOSTATS	0x40000000

/* bit flags kept in scws_rule_item.flag */
#define SCWS_ZRULE_NONE		0x00
#define SCWS_ZRULE_PREFIX	0x01
#define SCWS_ZRULE_SUFFIX	0x02
#define SCWS_ZRULE_INCLUDE	0x04	/* item has an include mask */
#define SCWS_ZRULE_EXCLUDE	0x08	/* item has an exclude mask */
#define SCWS_ZRULE_RANGE	0x10	/* item has a znum range (min,max) */
25
+
26
/* one rule section; rule_item_t is a pointer to it */
struct scws_rule_item
{
	short flag;			/* SCWS_ZRULE_* bit flags */
	char zmin;			/* znum: minimum */
	char zmax;			/* znum: maximum */
	char name[17];		/* section name, NUL terminated */
	char attr[3];		/* attr, NUL terminated */
	float tf;
	float idf;
	unsigned int bit;	/* bit of this item */
	unsigned int inc;	/* bits of the included items */
	unsigned int exc;	/* bits of the excluded items */
};
typedef struct scws_rule_item *rule_item_t;
40
+
41
/* node of the special attrs ratio list (singly linked chain) */
struct scws_rule_attr
{
	char attr1[2];				/* first attr, '*' in [0] matches any */
	char attr2[2];				/* second attr, '*' in [0] matches any */
	unsigned char npath[2];		/* 0xff matches any value */
	short ratio;
	struct scws_rule_attr *next;	/* next node, NULL at the tail */
};
typedef struct scws_rule_attr *rule_attr_t;
51
+
52
+ typedef struct scws_rule
53
+ {
54
+ xtree_t tree;
55
+ rule_attr_t attr;
56
+ struct scws_rule_item items[SCWS_RULE_MAX];
57
+ int ref; // hightman.20130110: refcount (zero to really free/close)
58
+ } rule_st, *rule_t;
59
+
60
+ /* scws ruleset: api */
61
+
62
+ /* create & load ruleset, by fpath & charset */
63
+ rule_t scws_rule_new(const char *fpath, unsigned char *mblen);
64
+
65
+ /* fork ruleset */
66
+ rule_t scws_rule_fork(rule_t r);
67
+
68
+ /* free the memory & resource for ruleset */
69
+ void scws_rule_free(rule_t r);
70
+
71
+ /* get the rule tree record by str */
72
+ rule_item_t scws_rule_get(rule_t r, const char *str, int len);
73
+
74
+ /* check bit */
75
+ int scws_rule_checkbit(rule_t r, const char *str, int len, unsigned int bit);
76
+
77
+ /* get rule attr x */
78
+ int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath);
79
+
80
+ /* check exclude or include */
81
+ int scws_rule_check(rule_t r, rule_item_t cr, const char *str, int len);
82
+
83
+ #endif