scws4r 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/scws4r/rule.c ADDED
@@ -0,0 +1,407 @@
1
+ /**
2
+ * @file rule.c (auto surname & areaname & special group)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #ifdef WIN32
13
+ # include "config_win32.h"
14
+ #endif
15
+
16
+ #include "rule.h"
17
+ #include <stdio.h>
18
+ #include <stdlib.h>
19
+ #include <string.h>
20
+
21
+ static inline int _rule_index_get(rule_t r, const char *name)
22
+ {
23
+ int i;
24
+ for (i = 0; i < SCWS_RULE_MAX; i++)
25
+ {
26
+ if (r->items[i].name[0] == '\0')
27
+ break;
28
+
29
+ if (!strcasecmp(r->items[i].name, name))
30
+ return i;
31
+ }
32
+ return -1;
33
+ }
34
+
35
+ rule_t scws_rule_new(const char *fpath, unsigned char *mblen)
36
+ {
37
+ FILE *fp;
38
+ rule_t r;
39
+ rule_item_t cr;
40
+ int i, j, rbl, aflag;
41
+ rule_attr_t a, rtail;
42
+ unsigned char buf[512], *str, *ptr, *qtr;
43
+
44
+ /* loaded or open file failed */
45
+ if ((fp = fopen(fpath, "r")) == NULL)
46
+ return NULL;
47
+
48
+ /* alloc the memory */
49
+ r = (rule_t) malloc(sizeof(rule_st));
50
+ memset(r, 0, sizeof(rule_st));
51
+ r->ref = 1;
52
+
53
+ /* quick scan to add the name to list */
54
+ i = j = rbl = aflag = 0;
55
+ while (fgets(buf, sizeof(buf) - 1, fp))
56
+ {
57
+ if (buf[0] != '[' || !(ptr = strchr(buf, ']')))
58
+ continue;
59
+
60
+ str = buf + 1;
61
+ *ptr = '\0';
62
+ if (ptr == str || (ptr - str) > 15 || !strcasecmp(str, "attrs"))
63
+ continue;
64
+
65
+ if (_rule_index_get(r, str) >= 0)
66
+ continue;
67
+
68
+ strcpy(r->items[i].name, str);
69
+ r->items[i].tf = 5.0;
70
+ r->items[i].idf = 3.5;
71
+ strncpy(r->items[i].attr, "un", 2);
72
+ if (!strcasecmp(str, "special"))
73
+ r->items[i].bit = SCWS_RULE_SPECIAL;
74
+ else if (!strcasecmp(str, "nostats"))
75
+ r->items[i].bit = SCWS_RULE_NOSTATS;
76
+ else
77
+ {
78
+ r->items[i].bit = (1 << j);
79
+ j++;
80
+ }
81
+
82
+ if (++i >= SCWS_RULE_MAX)
83
+ break;
84
+ }
85
+ rewind(fp);
86
+
87
+ /* load the tree data */
88
+ if ((r->tree = xtree_new(0, 1)) == NULL)
89
+ {
90
+ free(r);
91
+ return NULL;
92
+ }
93
+ cr = NULL;
94
+ while (fgets(buf, sizeof(buf) - 1, fp))
95
+ {
96
+ if (buf[0] == ';')
97
+ continue;
98
+
99
+ if (buf[0] == '[')
100
+ {
101
+ cr = NULL;
102
+ str = buf + 1;
103
+ aflag = 0;
104
+ if ((ptr = strchr(str, ']')) != NULL)
105
+ {
106
+ *ptr = '\0';
107
+ if (!strcasecmp(str, "attrs"))
108
+ {
109
+ aflag = 1;
110
+ }
111
+ else if ((i = _rule_index_get(r, str)) >= 0)
112
+ {
113
+ rbl = 1; /* default read by line = yes */
114
+ cr = &r->items[i];
115
+ }
116
+ }
117
+ continue;
118
+ }
119
+
120
+ /* attr flag open? */
121
+ if (aflag == 1)
122
+ {
123
+ /* parse the attr line */
124
+ str = buf;
125
+ while (*str == ' ' || *str == '\t') str++;
126
+ if ((ptr = strchr(str, '+')) == NULL) continue;
127
+ *ptr++ = '\0';
128
+ if ((qtr = strchr(ptr, '=')) == NULL) continue;
129
+ *qtr++ = '\0';
130
+
131
+ /* create new memory */
132
+ a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr));
133
+ memset(a, 0, sizeof(struct scws_rule_attr));
134
+
135
+ /* get ratio */
136
+ while (*qtr == ' ' || *qtr == '\t') qtr++;
137
+ a->ratio = (short) atoi(qtr);
138
+ if (a->ratio < 1)
139
+ a->ratio = 1;
140
+ a->npath[0] = a->npath[1] = 0xff;
141
+
142
+ /* read attr1 & npath1? */
143
+ a->attr1[0] = *str++;
144
+ if (*str && *str != '(' && *str != ' ' && *str != '\t')
145
+ a->attr1[1] = *str++;
146
+ while (*str && *str != '(') str++;
147
+ if (*str == '(')
148
+ {
149
+ str++;
150
+ if ((qtr = strchr(str, ')')) != NULL)
151
+ {
152
+ *qtr = '\0';
153
+ a->npath[0] = (unsigned char) atoi(str);
154
+ if (a->npath[0] > 0)
155
+ a->npath[0]--;
156
+ else
157
+ a->npath[0] = 0xff;
158
+ }
159
+ }
160
+
161
+ /* read attr1 & npath2? */
162
+ str = ptr;
163
+ while (*str == ' ' || *str == '\t') str++;
164
+ a->attr2[0] = *str++;
165
+ if (*str && *str != '(' && *str != ' ' && *str != '\t')
166
+ a->attr2[1] = *str++;
167
+ while (*str && *str != '(') str++;
168
+ if (*str == '(')
169
+ {
170
+ str++;
171
+ if ((qtr = strchr(str, ')')) != NULL)
172
+ {
173
+ *qtr = '\0';
174
+ a->npath[1] = (unsigned char) atoi(str);
175
+ if (a->npath[1] > 0)
176
+ a->npath[1]--;
177
+ else
178
+ a->npath[1] = 0xff;
179
+ }
180
+ }
181
+
182
+ //printf("%c%c(%d)+%c%c(%d)=%d\n", a->attr1[0], a->attr1[1] ? a->attr1[1] : ' ', a->npath[0],
183
+ // a->attr2[0], a->attr2[1] ? a->attr2[1] : ' ', a->npath[1], a->ratio);
184
+
185
+ /* append to the chain list */
186
+ if (r->attr == NULL)
187
+ r->attr = rtail = a;
188
+ else
189
+ {
190
+ rtail->next = a;
191
+ rtail = a;
192
+ }
193
+
194
+ continue;
195
+ }
196
+
197
+ if (cr == NULL)
198
+ continue;
199
+
200
+ /* param set: line|znum|include|exclude|type|tf|idf|attr */
201
+ if (buf[0] == ':')
202
+ {
203
+ str = buf + 1;
204
+ if (!(ptr = strchr(str, '=')))
205
+ continue;
206
+ while (*str == ' ' || *str == '\t') str++;
207
+
208
+ qtr = ptr + 1;
209
+ while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--;
210
+ *ptr = '\0';
211
+ ptr = str;
212
+ str = qtr;
213
+ while (*str == ' ' || *str == '\t') str++;
214
+
215
+ if (!strcmp(ptr, "line"))
216
+ rbl = (*str == 'N' || *str == 'n') ? 0 : 1;
217
+ else if (!strcmp(ptr, "tf"))
218
+ cr->tf = (float) atof(str);
219
+ else if (!strcmp(ptr, "idf"))
220
+ cr->idf = (float) atof(str);
221
+ else if (!strcmp(ptr, "attr"))
222
+ strncpy(cr->attr, str, 2);
223
+ else if (!strcmp(ptr, "znum"))
224
+ {
225
+ if ((ptr = strchr(str, ',')) != NULL)
226
+ {
227
+ *ptr++ = '\0';
228
+ while (*ptr == ' ' || *ptr == '\t') ptr++;
229
+ cr->zmax = atoi(ptr);
230
+ cr->flag |= SCWS_ZRULE_RANGE;
231
+ }
232
+ cr->zmin = atoi(str);
233
+ }
234
+ else if (!strcmp(ptr, "type"))
235
+ {
236
+ if (!strncmp(str, "prefix", 6))
237
+ cr->flag |= SCWS_ZRULE_PREFIX;
238
+ else if (!strncmp(str, "suffix", 6))
239
+ cr->flag |= SCWS_ZRULE_SUFFIX;
240
+ }
241
+ else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude"))
242
+ {
243
+ unsigned int *clude;
244
+
245
+ if (!strcmp(ptr, "include"))
246
+ {
247
+ clude = &cr->inc;
248
+ cr->flag |= SCWS_ZRULE_INCLUDE;
249
+ }
250
+ else
251
+ {
252
+ clude = &cr->exc;
253
+ cr->flag |= SCWS_ZRULE_EXCLUDE;
254
+ }
255
+
256
+ while ((ptr = strchr(str, ',')) != NULL)
257
+ {
258
+ while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--;
259
+ *ptr = '\0';
260
+ if ((i = _rule_index_get(r, str)) >= 0)
261
+ *clude |= r->items[i].bit;
262
+
263
+ str = ptr + 1;
264
+ while (*str == ' ' || *str == '\t' || *str == ',') str++;
265
+ }
266
+
267
+ ptr = strlen(str) + str;
268
+ while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
269
+ *ptr = '\0';
270
+ if (ptr > str && (i = _rule_index_get(r, str)))
271
+ *clude |= r->items[i].bit;
272
+ }
273
+ continue;
274
+ }
275
+
276
+ /* read the entries */
277
+ str = buf;
278
+ while (*str == ' ' || *str == '\t') str++;
279
+ ptr = str + strlen(str);
280
+ while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
281
+ *ptr = '\0';
282
+
283
+ /* emptry line */
284
+ if (ptr == str)
285
+ continue;
286
+
287
+ if (rbl)
288
+ xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str);
289
+ else
290
+ {
291
+ while (str < ptr)
292
+ {
293
+ j = mblen[(*str)];
294
+
295
+ #ifdef DEBUG
296
+ /* try to check repeat */
297
+ if ((i = (int) xtree_nget(r->tree, str, j, NULL)) != 0)
298
+ fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, ((rule_item_t) i)->name, j, str);
299
+ #endif
300
+ xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j);
301
+ str += j;
302
+ }
303
+ }
304
+ }
305
+ fclose(fp);
306
+
307
+ /* optimize the tree */
308
+ xtree_optimize(r->tree);
309
+ return r;
310
+ }
311
+
312
+ /* fork rule */
313
+ rule_t scws_rule_fork(rule_t r)
314
+ {
315
+ if (r != NULL)
316
+ r->ref++;
317
+ return r;
318
+ }
319
+
320
+ /* free rule */
321
+ void scws_rule_free(rule_t r)
322
+ {
323
+ if (r)
324
+ {
325
+ r->ref--;
326
+ if (r->ref == 0)
327
+ {
328
+ rule_attr_t a, b;
329
+
330
+ xtree_free(r->tree);
331
+ a = r->attr;
332
+ while (a != NULL)
333
+ {
334
+ b = a;
335
+ a = b->next;
336
+ free(b);
337
+ }
338
+ free(r);
339
+ }
340
+ }
341
+ }
342
+
343
+ /* get the rule */
344
+ rule_item_t scws_rule_get(rule_t r, const char *str, int len)
345
+ {
346
+ if (!r)
347
+ return NULL;
348
+
349
+ return((rule_item_t) xtree_nget(r->tree, str, len, NULL));
350
+ }
351
+
352
+ /* check the bit with str */
353
+ int scws_rule_checkbit(rule_t r, const char *str, int len, unsigned int bit)
354
+ {
355
+ rule_item_t ri;
356
+
357
+ if (!r)
358
+ return 0;
359
+
360
+ ri = (rule_item_t) xtree_nget(r->tree, str, len, NULL);
361
+ if ((ri != NULL) && (ri->bit & bit))
362
+ return 1;
363
+
364
+ return 0;
365
+ }
366
+
367
+ /* get rule attr x */
368
+ #define EQUAL_RULE_ATTR(x,y) ((y[0]=='*'||y[0]==x[0])&&(y[1]=='\0'||y[1]==x[1]))
369
+ #define EQUAL_RULE_NPATH(x,y) ((y[0]==0xff||y[0]==x[0])&&(y[1]==0xff||y[1]==x[1]))
370
+
371
+ int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath)
372
+ {
373
+ rule_attr_t a;
374
+ int ret = 1;
375
+
376
+ if (!r || (a = r->attr) == NULL)
377
+ return ret;
378
+
379
+ while (a != NULL)
380
+ {
381
+ if (EQUAL_RULE_ATTR(attr1, a->attr1) && EQUAL_RULE_ATTR(attr2, a->attr2) && EQUAL_RULE_NPATH(npath, a->npath))
382
+ {
383
+ ret = (int) a->ratio;
384
+ break;
385
+ }
386
+ a = a->next;
387
+ }
388
+ return ret;
389
+ }
390
+
391
+ #undef EQUAL_RULE_ATTR
392
+ #undef EQUAL_RULE_NPATH
393
+
394
+ /* check the rule */
395
+ int scws_rule_check(rule_t r, rule_item_t cr, const char *str, int len)
396
+ {
397
+ if (!r)
398
+ return 0;
399
+
400
+ if ((cr->flag & SCWS_ZRULE_INCLUDE) && !scws_rule_checkbit(r, str, len, cr->inc))
401
+ return 0;
402
+
403
+ if ((cr->flag & SCWS_ZRULE_EXCLUDE) && scws_rule_checkbit(r, str, len, cr->exc))
404
+ return 0;
405
+
406
+ return 1;
407
+ }
data/ext/scws4r/rule.h ADDED
@@ -0,0 +1,83 @@
1
+ /**
2
+ * @file rule.h
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_RULE_20070525_H_
9
+ #define _SCWS_RULE_20070525_H_
10
+
11
+ /* xtree required */
12
+ #include "xtree.h"
13
+
14
+ #define SCWS_RULE_MAX 32 /* number of slots in scws_rule.items[] */
15
+ #define SCWS_RULE_SPECIAL 0x80000000 /* bit given to the section named "special" */
16
+ #define SCWS_RULE_NOSTATS 0x40000000 /* bit given to the section named "nostats" */
17
+
18
+ /* flag bits kept in scws_rule_item.flag: 0x00 ~ 0x4000 */
19
+ #define SCWS_ZRULE_NONE 0x00 /* no flag set */
20
+ #define SCWS_ZRULE_PREFIX 0x01 /* set by ":type = prefix" */
21
+ #define SCWS_ZRULE_SUFFIX 0x02 /* set by ":type = suffix" */
22
+ #define SCWS_ZRULE_INCLUDE 0x04 /* section has an ":include" list (see scws_rule_item.inc) */
23
+ #define SCWS_ZRULE_EXCLUDE 0x08 /* section has an ":exclude" list (see scws_rule_item.exc) */
24
+ #define SCWS_ZRULE_RANGE 0x10 /* ":znum" was given as "min,max" */
25
+
26
/*
 * One rule section of the rule file ("[name]" plus its ":key = value"
 * parameters). Entries of the section are stored in the rule tree with
 * this record as their value.
 */
struct scws_rule_item
{
    short flag;         /* SCWS_ZRULE_* bits */
    char zmin;          /* first number of ":znum" */
    char zmax;          /* second number of ":znum" (with SCWS_ZRULE_RANGE) */
    char name[17];      /* section name, NUL terminated */
    char attr[3];       /* ":attr" value, "un" unless overridden */
    float tf;           /* ":tf" value, 5.0 unless overridden */
    float idf;          /* ":idf" value, 3.5 unless overridden */
    unsigned int bit;   /* my bit */
    unsigned int inc;   /* bits of the sections named in ":include" */
    unsigned int exc;   /* bits of the sections named in ":exclude" */
};

typedef struct scws_rule_item *rule_item_t;
40
+
41
/*
 * Special attrs ratio list: one node per "[attrs]" line of the rule file,
 * linked into a single chain in file order.
 */
struct scws_rule_attr
{
    char attr1[2];               /* first attribute; '*' matches any first char */
    char attr2[2];               /* second attribute, same conventions */
    unsigned char npath[2];      /* npath constraint per side, 0xff = none */
    short ratio;                 /* ratio returned on a match, at least 1 */
    struct scws_rule_attr *next; /* next node, NULL at the end of the chain */
};

typedef struct scws_rule_attr *rule_attr_t;
51
+
52
+ typedef struct scws_rule
53
+ {
54
+ xtree_t tree;
55
+ rule_attr_t attr;
56
+ struct scws_rule_item items[SCWS_RULE_MAX];
57
+ int ref; // hightman.20130110: refcount (zero to really free/close)
58
+ } rule_st, *rule_t;
59
+
60
+ /* scws ruleset: api */
61
+
62
+ /* create & load ruleset from the rule file at fpath.
+  * mblen is a table indexed by the first byte of a character that gives
+  * the character's byte length (used for sections with ":line = no").
+  * Returns a ruleset with ref == 1, or NULL when the file cannot be
+  * opened or the tree cannot be created. */
63
+ rule_t scws_rule_new(const char *fpath, unsigned char *mblen);
64
+
65
+ /* fork ruleset: increments the reference count and returns r itself
+  * (NULL is returned unchanged) */
66
+ rule_t scws_rule_fork(rule_t r);
67
+
68
+ /* free the memory & resource for ruleset: decrements the reference count
+  * and releases the tree, the attr chain and r once it reaches zero;
+  * NULL is ignored */
69
+ void scws_rule_free(rule_t r);
70
+
71
+ /* get the rule tree record stored for the first len bytes of str;
+  * NULL when r is NULL (otherwise the tree lookup result) */
72
+ rule_item_t scws_rule_get(rule_t r, const char *str, int len);
73
+
74
+ /* check bit: 1 when the record stored for str has any bit of `bit` set,
+  * 0 otherwise (including r == NULL or no record) */
75
+ int scws_rule_checkbit(rule_t r, const char *str, int len, unsigned int bit);
76
+
77
+ /* get rule attr x: ratio of the first attr rule matching attr1, attr2 and
+  * the two npath bytes; 1 when r is NULL or nothing matches */
78
+ int scws_rule_attr_ratio(rule_t r, const char *attr1, const char *attr2, const unsigned char *npath);
79
+
80
+ /* check exclude or include: 0 when r is NULL, when cr has an include list
+  * that str does not satisfy, or when cr has an exclude list that str
+  * hits; 1 otherwise. cr is dereferenced without a NULL check. */
81
+ int scws_rule_check(rule_t r, rule_item_t cr, const char *str, int len);
82
+
83
+ #endif