scws4r 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,394 @@
1
+ /**
2
+ * @file xdict.c (dictionary query)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #ifdef WIN32
13
+ # include "config_win32.h"
14
+ #endif
15
+
16
+ #include "xdict.h"
17
+ #include "xtree.h"
18
+ #include "xdb.h"
19
+ #include "crc32.h"
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <unistd.h>
24
+ #ifndef WIN32
25
+ # include <sys/param.h>
26
+ #endif
27
+ #include <sys/types.h>
28
+ #include <sys/stat.h>
29
+
30
+ /* temp file format for TEXT xdb */
31
+ #if !defined(PATH_MAX) || (PATH_MAX < 1024)
32
+ # define XDICT_PATH_MAX 1024
33
+ #else
34
+ # define XDICT_PATH_MAX PATH_MAX
35
+ #endif
36
+
37
+ #ifdef HAVE_STRTOK_R
38
+ # define _strtok_r strtok_r
39
+ #else
40
+
41
/**
 * Re-entrant tokenizer used when the platform has no strtok_r().
 *
 * Splits a string into tokens separated by any of the bytes in delim,
 * overwriting the delimiter that ends each token with a NUL byte.
 *
 * @param s      string to tokenize on the first call, NULL to continue
 *               from the position saved in *lasts
 * @param delim  NUL-terminated set of delimiter bytes
 * @param lasts  caller-owned cursor; set to the byte after the returned
 *               token, or to NULL once the input is exhausted
 * @return pointer to the next token, or NULL when no token is left
 */
static char *_strtok_r(char *s, const char *delim, char **lasts)
{
	char *tok;

	if (s == NULL && (s = *lasts) == NULL)
		return NULL;

	/* skip leading delimiters */
	s += strspn(s, delim);
	if (*s == '\0')
	{
		/* no non-delimiter characters left */
		*lasts = NULL;
		return NULL;
	}
	tok = s;

	/* find the end of the token */
	s += strcspn(s, delim);
	if (*s == '\0')
		*lasts = NULL;
	else
	{
		*s = '\0';
		*lasts = s + 1;
	}
	return tok;
}
88
+ #endif
89
+
90
+ #ifdef WIN32
91
+ # include <direct.h>
92
+
93
+ static void _realpath(const char *src, char *dst)
94
+ {
95
+ int len = strlen(src);
96
+ if (strchr(src, ':') != NULL)
97
+ memcpy(dst, src, len + 1);
98
+ else
99
+ {
100
+ char *ptr;
101
+ getcwd(dst, XDICT_PATH_MAX - len - 2);
102
+ ptr = dst + strlen(dst);
103
+ *ptr++ = '/';
104
+ memcpy(ptr, src, len + 1);
105
+ }
106
+ }
107
+ #else
108
+ # define _realpath realpath
109
+ #endif
110
+
111
+ /* open the text dict */
112
+ static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
113
+ {
114
+ xdict_t xd;
115
+ xtree_t xt;
116
+ char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
117
+ struct stat st1, st2;
118
+
119
+ // check the input filepath
120
+ _realpath(fpath, buf);
121
+ if (stat(buf, &st1) < 0)
122
+ return NULL;
123
+
124
+ // check dest file & orginal file, compare there mtime
125
+ #ifdef WIN32
126
+ {
127
+ char *tmp_ptr;
128
+ GetTempPath(sizeof(tmpfile) - 20, tmpfile);
129
+ tmp_ptr = tmpfile + strlen(tmpfile);
130
+ if (tmp_ptr[-1] == '\\') tmp_ptr--;
131
+ sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
132
+ }
133
+ #else
134
+ sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
135
+ #endif
136
+ if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
137
+ {
138
+ xdb_t x;
139
+ if ((x = xdb_open(tmpfile, 'r')) != NULL)
140
+ {
141
+ xd = (xdict_t) malloc(sizeof(xdict_st));
142
+ memset(xd, 0, sizeof(xdict_st));
143
+ xd->ref = 1;
144
+
145
+ if (mode & SCWS_XDICT_MEM)
146
+ {
147
+ /* convert the xdb(disk) -> xtree(memory) */
148
+ if ((xt = xdb_to_xtree(x, NULL)) != NULL)
149
+ {
150
+ xdb_close(x);
151
+ xd->xdict = (void *) xt;
152
+ xd->xmode = SCWS_XDICT_MEM;
153
+ return xd;
154
+ }
155
+ }
156
+ xd->xmode = SCWS_XDICT_XDB;
157
+ xd->xdict = (void *) x;
158
+ return xd;
159
+ }
160
+ }
161
+
162
+ // create xtree
163
+ if ((xt = xtree_new(0, 0)) == NULL)
164
+ return NULL;
165
+ else
166
+ {
167
+ int cl, kl;
168
+ FILE *fp;
169
+ word_st word, *w;
170
+ char *key, *part, *last, *delim = " \t\r\n";
171
+
172
+ // re-build the xdb file from text file
173
+ if ((fp = fopen(buf, "r")) == NULL)
174
+ return NULL;
175
+
176
+ // parse every line
177
+ word.attr[2] = '\0';
178
+ while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
179
+ {
180
+ // <word>[\t<tf>[\t<idf>[\t<attr>]]]
181
+ if (buf[0] == ';' || buf[0] == '#') continue;
182
+
183
+ key = _strtok_r(buf, delim, &last);
184
+ if (key == NULL) continue;
185
+ kl = strlen(key);
186
+
187
+ // init the word
188
+ do
189
+ {
190
+ word.tf = word.idf = 1.0;
191
+ word.flag = SCWS_WORD_FULL;
192
+ word.attr[0] = '@';
193
+ word.attr[1] = '\0';
194
+
195
+ if (!(part = _strtok_r(NULL, delim, &last))) break;
196
+ word.tf = (float) atof(part);
197
+
198
+ if (!(part = _strtok_r(NULL, delim, &last))) break;
199
+ word.idf = (float) atof(part);
200
+
201
+ if (part = _strtok_r(NULL, delim, &last))
202
+ {
203
+ word.attr[0] = part[0];
204
+ if (part[1]) word.attr[1] = part[1];
205
+ }
206
+ }
207
+ while (0);
208
+
209
+ // save into xtree
210
+ if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
211
+ {
212
+ w = (word_st *) pmalloc(xt->p, sizeof(word_st));
213
+ memcpy(w, &word, sizeof(word));
214
+ xtree_nput(xt, w, sizeof(word), key, kl);
215
+ }
216
+ else
217
+ {
218
+ w->tf = word.tf;
219
+ w->idf = word.idf;
220
+ w->flag |= word.flag;
221
+ strcpy(w->attr, word.attr);
222
+ }
223
+
224
+ // parse the part
225
+ cl = ml[(unsigned char) (key[0])];
226
+ while (1)
227
+ {
228
+ cl += ml[(unsigned char) (key[cl])];
229
+ if (cl >= kl) break;
230
+
231
+ if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
232
+ w->flag |= SCWS_WORD_PART;
233
+ else
234
+ {
235
+ w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
236
+ w->flag = SCWS_WORD_PART;
237
+ xtree_nput(xt, w, sizeof(word), key, cl);
238
+ }
239
+ }
240
+ }
241
+ fclose(fp);
242
+
243
+ // optimize the xtree & save to xdb
244
+ xtree_optimize(xt);
245
+ unlink(tmpfile);
246
+ xtree_to_xdb(xt, tmpfile);
247
+ chmod(tmpfile, 0777);
248
+
249
+ // return xtree
250
+ xd = (xdict_t) malloc(sizeof(xdict_st));
251
+ memset(xd, 0, sizeof(xdict_st));
252
+ xd->ref = 1;
253
+ xd->xdict = (void *) xt;
254
+ xd->xmode = SCWS_XDICT_MEM;
255
+ return xd;
256
+ }
257
+ }
258
+
259
+ /* setup & open the dict */
260
+ xdict_t xdict_open(const char *fpath, int mode)
261
+ {
262
+ xdict_t xd;
263
+ xdb_t x;
264
+
265
+ if (!(x = xdb_open(fpath, 'r')))
266
+ return NULL;
267
+
268
+ xd = (xdict_t) malloc(sizeof(xdict_st));
269
+ memset(xd, 0, sizeof(xdict_st));
270
+ xd->ref = 1;
271
+ if (mode & SCWS_XDICT_MEM)
272
+ {
273
+ xtree_t xt;
274
+
275
+ /* convert the xdb(disk) -> xtree(memory) */
276
+ if ((xt = xdb_to_xtree(x, NULL)) != NULL)
277
+ {
278
+ xdb_close(x);
279
+ xd->xdict = (void *) xt;
280
+ xd->xmode = SCWS_XDICT_MEM;
281
+ return xd;
282
+ }
283
+ }
284
+
285
+ xd->xmode = SCWS_XDICT_XDB;
286
+ xd->xdict = (void *) x;
287
+ return xd;
288
+ }
289
+
290
+ /* add a dict */
291
+ xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml)
292
+ {
293
+ xdict_t xx;
294
+
295
+ xx = (mode & SCWS_XDICT_TXT ? _xdict_open_txt(fpath, mode, ml) : xdict_open(fpath, mode));
296
+ if (xx != NULL)
297
+ {
298
+ xx->next = xd;
299
+ return xx;
300
+ }
301
+ return xd;
302
+ }
303
+
304
+ /* fork the dict */
305
+ xdict_t xdict_fork(xdict_t xd)
306
+ {
307
+ xdict_t xx;
308
+ for (xx = xd; xx != NULL; xx = xx->next)
309
+ {
310
+ xx->ref++;
311
+ }
312
+ return xd;
313
+ }
314
+
315
+ /* close the dict */
316
+ void xdict_close(xdict_t xd)
317
+ {
318
+ xdict_t xx;
319
+
320
+ while ((xx = xd) != NULL)
321
+ {
322
+ xd = xx->next;
323
+ xx->ref--;
324
+ if (xx->ref == 0)
325
+ {
326
+ if (xx->xmode == SCWS_XDICT_MEM)
327
+ xtree_free((xtree_t) xx->xdict);
328
+ else
329
+ {
330
+ xdb_close((xdb_t) xx->xdict);
331
+ }
332
+ free(xx);
333
+ }
334
+ }
335
+ }
336
+
337
+ /* query the word */
338
+ #define _FLAG_BOTH(x) (((x)->flag & (SCWS_WORD_PART|SCWS_WORD_FULL)) == (SCWS_WORD_PART|SCWS_WORD_FULL))
339
+ #define _FLAG_FULL(x) ((x)->flag & SCWS_WORD_FULL)
340
+ #define _FLAG_PART(x) ((x)->flag & SCWS_WORD_PART)
341
+ #define _FLAG_MALLOC(x) ((x)->flag & SCWS_WORD_MALLOCED)
342
+
343
+ word_t xdict_query(xdict_t xd, const char *key, int len)
344
+ {
345
+ word_t value, value2;
346
+
347
+ value = value2 = NULL;
348
+ while (xd != NULL)
349
+ {
350
+ if (xd->xmode == SCWS_XDICT_MEM)
351
+ {
352
+ /* this is ThreadSafe, recommend. */
353
+ value = (word_t) xtree_nget((xtree_t) xd->xdict, key, len, NULL);
354
+ }
355
+ else
356
+ {
357
+ /* the value malloced in lib-XDB. free required */
358
+ value = (word_t) xdb_nget((xdb_t) xd->xdict, key, len, NULL);
359
+ if (value != NULL) value->flag |= SCWS_WORD_MALLOCED;
360
+ }
361
+ xd = xd->next;
362
+
363
+ // check value2
364
+ if (value != NULL)
365
+ {
366
+ if (value2 == NULL)
367
+ {
368
+ if (_FLAG_BOTH(value))
369
+ return value;
370
+ value2 = value;
371
+ }
372
+ else
373
+ {
374
+ if (_FLAG_FULL(value2) && _FLAG_PART(value))
375
+ {
376
+ value2->flag |= SCWS_WORD_PART;
377
+ if (_FLAG_MALLOC(value))
378
+ free(value);
379
+ return value2;
380
+ }
381
+ if (_FLAG_FULL(value) && _FLAG_PART(value2))
382
+ {
383
+ value->flag |= SCWS_WORD_PART;
384
+ if (_FLAG_MALLOC(value2))
385
+ free(value2);
386
+ return value;
387
+ }
388
+ if (_FLAG_MALLOC(value))
389
+ free(value);
390
+ }
391
+ }
392
+ }
393
+ return value2;
394
+ }
@@ -0,0 +1,73 @@
1
+ /**
2
+ * @file xdict (dictionary)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_XDICT_20070528_H_
9
+ #define _SCWS_XDICT_20070528_H_
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
/* word flags (multi-character words) */
#define SCWS_WORD_FULL		0x01	/* multi-char: complete word */
#define SCWS_WORD_PART		0x02	/* multi-char: leading part of a longer word */
#define SCWS_WORD_USED		0x04	/* multi-char: already used */
#define SCWS_WORD_RULE		0x08	/* multi-char: recognized automatically */
#define SCWS_WORD_LONG		0x10	/* multi-char: long word composed of short words */

#define SCWS_WORD_MALLOCED	0x80	/* xdict_query result must be released with free */

/* single-character flags */
#define SCWS_ZFLAG_PUT		0x02	/* single char: already used */
#define SCWS_ZFLAG_N2		0x04	/* single char: head of a two-char noun */
#define SCWS_ZFLAG_NR2		0x08	/* single char: word head and two-char person name */
#define SCWS_ZFLAG_WHEAD	0x10	/* single char: word head */
#define SCWS_ZFLAG_WPART	0x20	/* single char: word tail or word middle */
#define SCWS_ZFLAG_ENGLISH	0x40	/* single char: English embedded in the middle */
#define SCWS_ZFLAG_SYMBOL	0x80	/* single char: symbol series */
#define SCWS_XDICT_PRIME	0x3ffd	/* number of dictionary structure trees: 16381 */

/* xdict open mode */
#define SCWS_XDICT_XDB		1		/* backed by an xdb handle */
#define SCWS_XDICT_MEM		2		/* backed by an in-memory xtree */
#define SCWS_XDICT_TXT		4		/* source file is a plain-text dictionary */
#define SCWS_XDICT_SET		4096	/* set flag. */
38
+
39
/* dictionary word record (12 bytes) */
struct scws_word
{
	float tf;				/* tf column of the dictionary entry */
	float idf;				/* idf column of the dictionary entry */
	unsigned char flag;		/* SCWS_WORD_* bits */
	char attr[3];			/* attribute: up to two characters plus NUL */
};

typedef struct scws_word word_st;
typedef struct scws_word *word_t;
47
+
48
/* one node of a singly linked dictionary list */
struct scws_xdict
{
	void *xdict;				/* backing store: xtree_t or xdb_t, see xmode */
	int xmode;					/* SCWS_XDICT_MEM or SCWS_XDICT_XDB */
	int ref;					/* hightman.20130110: refcount (zero to really free/close) */
	struct scws_xdict *next;	/* next dictionary in the list, or NULL */
};

typedef struct scws_xdict xdict_st;
typedef struct scws_xdict *xdict_t;
55
+
56
+ /* pub function (api) */
57
+ xdict_t xdict_open(const char *fpath, int mode);
58
+ void xdict_close(xdict_t xd);
59
+
60
+ /* fork xdict */
61
+ xdict_t xdict_fork(xdict_t xd);
62
+
63
+ /* add a new dict file into xd, succ: 0, error: -1, Mblen only used for XDICT_TXT */
64
+ xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml);
65
+
66
+ /* NOW this is ThreadSafe function */
67
+ word_t xdict_query(xdict_t xd, const char *key, int len);
68
+
69
+ #ifdef __cplusplus
70
+ }
71
+ #endif
72
+
73
+ #endif