scws4r 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ /**
2
+ * @file xdict.c (dictionary query)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #ifdef WIN32
13
+ # include "config_win32.h"
14
+ #endif
15
+
16
+ #include "xdict.h"
17
+ #include "xtree.h"
18
+ #include "xdb.h"
19
+ #include "crc32.h"
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <unistd.h>
24
+ #ifndef WIN32
25
+ # include <sys/param.h>
26
+ #endif
27
+ #include <sys/types.h>
28
+ #include <sys/stat.h>
29
+
30
+ /* XDICT_PATH_MAX: size of the path buffers (and of the line buffer) used when opening a TEXT dict; it is PATH_MAX when that is defined and >= 1024, otherwise 1024 */
31
+ #if !defined(PATH_MAX) || (PATH_MAX < 1024)
32
+ # define XDICT_PATH_MAX 1024
33
+ #else
34
+ # define XDICT_PATH_MAX PATH_MAX
35
+ #endif
36
+
37
+ #ifdef HAVE_STRTOK_R
38
+ # define _strtok_r strtok_r
39
+ #else
40
+
41
/**
 * Re-entrant tokenizer used when the platform has no strtok_r().
 *
 * @param s      buffer to tokenize on the first call, NULL to continue from *lasts
 * @param delim  NUL-terminated set of delimiter characters
 * @param lasts  scan position kept between calls; set to NULL once the input is exhausted
 * @return       start of the next token (NUL-terminated in place), or NULL when none is left
 */
static char *_strtok_r(char *s, char *delim, char **lasts)
{
	char *token;
	char *stop;

	/* continuation call: resume where the previous call stopped */
	if (s == NULL)
	{
		s = *lasts;
		if (s == NULL)
			return NULL;
	}

	/* step over any run of leading delimiters */
	token = s + strspn(s, delim);
	if (*token == '\0')
	{
		/* nothing but delimiters remained */
		*lasts = NULL;
		return NULL;
	}

	/* the token runs up to the next delimiter or the end of the string */
	stop = token + strcspn(token, delim);
	if (*stop == '\0')
		*lasts = NULL;
	else
	{
		*stop = '\0';
		*lasts = stop + 1;
	}
	return token;
}
88
+ #endif
89
+
90
+ #ifdef WIN32
91
+ # include <direct.h>
92
+
93
+ static void _realpath(const char *src, char *dst)
94
+ {
95
+ int len = strlen(src);
96
+ if (strchr(src, ':') != NULL)
97
+ memcpy(dst, src, len + 1);
98
+ else
99
+ {
100
+ char *ptr;
101
+ getcwd(dst, XDICT_PATH_MAX - len - 2);
102
+ ptr = dst + strlen(dst);
103
+ *ptr++ = '/';
104
+ memcpy(ptr, src, len + 1);
105
+ }
106
+ }
107
+ #else
108
+ # define _realpath realpath
109
+ #endif
110
+
111
+ /* open the text dict */
112
+ static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
113
+ {
114
+ xdict_t xd;
115
+ xtree_t xt;
116
+ char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
117
+ struct stat st1, st2;
118
+
119
+ // check the input filepath
120
+ _realpath(fpath, buf);
121
+ if (stat(buf, &st1) < 0)
122
+ return NULL;
123
+
124
+ // check dest file & orginal file, compare there mtime
125
+ #ifdef WIN32
126
+ {
127
+ char *tmp_ptr;
128
+ GetTempPath(sizeof(tmpfile) - 20, tmpfile);
129
+ tmp_ptr = tmpfile + strlen(tmpfile);
130
+ if (tmp_ptr[-1] == '\\') tmp_ptr--;
131
+ sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
132
+ }
133
+ #else
134
+ sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
135
+ #endif
136
+ if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
137
+ {
138
+ xdb_t x;
139
+ if ((x = xdb_open(tmpfile, 'r')) != NULL)
140
+ {
141
+ xd = (xdict_t) malloc(sizeof(xdict_st));
142
+ memset(xd, 0, sizeof(xdict_st));
143
+ xd->ref = 1;
144
+
145
+ if (mode & SCWS_XDICT_MEM)
146
+ {
147
+ /* convert the xdb(disk) -> xtree(memory) */
148
+ if ((xt = xdb_to_xtree(x, NULL)) != NULL)
149
+ {
150
+ xdb_close(x);
151
+ xd->xdict = (void *) xt;
152
+ xd->xmode = SCWS_XDICT_MEM;
153
+ return xd;
154
+ }
155
+ }
156
+ xd->xmode = SCWS_XDICT_XDB;
157
+ xd->xdict = (void *) x;
158
+ return xd;
159
+ }
160
+ }
161
+
162
+ // create xtree
163
+ if ((xt = xtree_new(0, 0)) == NULL)
164
+ return NULL;
165
+ else
166
+ {
167
+ int cl, kl;
168
+ FILE *fp;
169
+ word_st word, *w;
170
+ char *key, *part, *last, *delim = " \t\r\n";
171
+
172
+ // re-build the xdb file from text file
173
+ if ((fp = fopen(buf, "r")) == NULL)
174
+ return NULL;
175
+
176
+ // parse every line
177
+ word.attr[2] = '\0';
178
+ while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
179
+ {
180
+ // <word>[\t<tf>[\t<idf>[\t<attr>]]]
181
+ if (buf[0] == ';' || buf[0] == '#') continue;
182
+
183
+ key = _strtok_r(buf, delim, &last);
184
+ if (key == NULL) continue;
185
+ kl = strlen(key);
186
+
187
+ // init the word
188
+ do
189
+ {
190
+ word.tf = word.idf = 1.0;
191
+ word.flag = SCWS_WORD_FULL;
192
+ word.attr[0] = '@';
193
+ word.attr[1] = '\0';
194
+
195
+ if (!(part = _strtok_r(NULL, delim, &last))) break;
196
+ word.tf = (float) atof(part);
197
+
198
+ if (!(part = _strtok_r(NULL, delim, &last))) break;
199
+ word.idf = (float) atof(part);
200
+
201
+ if (part = _strtok_r(NULL, delim, &last))
202
+ {
203
+ word.attr[0] = part[0];
204
+ if (part[1]) word.attr[1] = part[1];
205
+ }
206
+ }
207
+ while (0);
208
+
209
+ // save into xtree
210
+ if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
211
+ {
212
+ w = (word_st *) pmalloc(xt->p, sizeof(word_st));
213
+ memcpy(w, &word, sizeof(word));
214
+ xtree_nput(xt, w, sizeof(word), key, kl);
215
+ }
216
+ else
217
+ {
218
+ w->tf = word.tf;
219
+ w->idf = word.idf;
220
+ w->flag |= word.flag;
221
+ strcpy(w->attr, word.attr);
222
+ }
223
+
224
+ // parse the part
225
+ cl = ml[(unsigned char) (key[0])];
226
+ while (1)
227
+ {
228
+ cl += ml[(unsigned char) (key[cl])];
229
+ if (cl >= kl) break;
230
+
231
+ if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
232
+ w->flag |= SCWS_WORD_PART;
233
+ else
234
+ {
235
+ w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
236
+ w->flag = SCWS_WORD_PART;
237
+ xtree_nput(xt, w, sizeof(word), key, cl);
238
+ }
239
+ }
240
+ }
241
+ fclose(fp);
242
+
243
+ // optimize the xtree & save to xdb
244
+ xtree_optimize(xt);
245
+ unlink(tmpfile);
246
+ xtree_to_xdb(xt, tmpfile);
247
+ chmod(tmpfile, 0777);
248
+
249
+ // return xtree
250
+ xd = (xdict_t) malloc(sizeof(xdict_st));
251
+ memset(xd, 0, sizeof(xdict_st));
252
+ xd->ref = 1;
253
+ xd->xdict = (void *) xt;
254
+ xd->xmode = SCWS_XDICT_MEM;
255
+ return xd;
256
+ }
257
+ }
258
+
259
+ /* setup & open the dict */
260
+ xdict_t xdict_open(const char *fpath, int mode)
261
+ {
262
+ xdict_t xd;
263
+ xdb_t x;
264
+
265
+ if (!(x = xdb_open(fpath, 'r')))
266
+ return NULL;
267
+
268
+ xd = (xdict_t) malloc(sizeof(xdict_st));
269
+ memset(xd, 0, sizeof(xdict_st));
270
+ xd->ref = 1;
271
+ if (mode & SCWS_XDICT_MEM)
272
+ {
273
+ xtree_t xt;
274
+
275
+ /* convert the xdb(disk) -> xtree(memory) */
276
+ if ((xt = xdb_to_xtree(x, NULL)) != NULL)
277
+ {
278
+ xdb_close(x);
279
+ xd->xdict = (void *) xt;
280
+ xd->xmode = SCWS_XDICT_MEM;
281
+ return xd;
282
+ }
283
+ }
284
+
285
+ xd->xmode = SCWS_XDICT_XDB;
286
+ xd->xdict = (void *) x;
287
+ return xd;
288
+ }
289
+
290
+ /* add a dict */
291
+ xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml)
292
+ {
293
+ xdict_t xx;
294
+
295
+ xx = (mode & SCWS_XDICT_TXT ? _xdict_open_txt(fpath, mode, ml) : xdict_open(fpath, mode));
296
+ if (xx != NULL)
297
+ {
298
+ xx->next = xd;
299
+ return xx;
300
+ }
301
+ return xd;
302
+ }
303
+
304
+ /* fork the dict */
305
+ xdict_t xdict_fork(xdict_t xd)
306
+ {
307
+ xdict_t xx;
308
+ for (xx = xd; xx != NULL; xx = xx->next)
309
+ {
310
+ xx->ref++;
311
+ }
312
+ return xd;
313
+ }
314
+
315
+ /* close the dict */
316
+ void xdict_close(xdict_t xd)
317
+ {
318
+ xdict_t xx;
319
+
320
+ while ((xx = xd) != NULL)
321
+ {
322
+ xd = xx->next;
323
+ xx->ref--;
324
+ if (xx->ref == 0)
325
+ {
326
+ if (xx->xmode == SCWS_XDICT_MEM)
327
+ xtree_free((xtree_t) xx->xdict);
328
+ else
329
+ {
330
+ xdb_close((xdb_t) xx->xdict);
331
+ }
332
+ free(xx);
333
+ }
334
+ }
335
+ }
336
+
337
+ /* query the word */
338
+ #define _FLAG_BOTH(x) (((x)->flag & (SCWS_WORD_PART|SCWS_WORD_FULL)) == (SCWS_WORD_PART|SCWS_WORD_FULL))
339
+ #define _FLAG_FULL(x) ((x)->flag & SCWS_WORD_FULL)
340
+ #define _FLAG_PART(x) ((x)->flag & SCWS_WORD_PART)
341
+ #define _FLAG_MALLOC(x) ((x)->flag & SCWS_WORD_MALLOCED)
342
+
343
+ word_t xdict_query(xdict_t xd, const char *key, int len)
344
+ {
345
+ word_t value, value2;
346
+
347
+ value = value2 = NULL;
348
+ while (xd != NULL)
349
+ {
350
+ if (xd->xmode == SCWS_XDICT_MEM)
351
+ {
352
+ /* this is ThreadSafe, recommend. */
353
+ value = (word_t) xtree_nget((xtree_t) xd->xdict, key, len, NULL);
354
+ }
355
+ else
356
+ {
357
+ /* the value malloced in lib-XDB. free required */
358
+ value = (word_t) xdb_nget((xdb_t) xd->xdict, key, len, NULL);
359
+ if (value != NULL) value->flag |= SCWS_WORD_MALLOCED;
360
+ }
361
+ xd = xd->next;
362
+
363
+ // check value2
364
+ if (value != NULL)
365
+ {
366
+ if (value2 == NULL)
367
+ {
368
+ if (_FLAG_BOTH(value))
369
+ return value;
370
+ value2 = value;
371
+ }
372
+ else
373
+ {
374
+ if (_FLAG_FULL(value2) && _FLAG_PART(value))
375
+ {
376
+ value2->flag |= SCWS_WORD_PART;
377
+ if (_FLAG_MALLOC(value))
378
+ free(value);
379
+ return value2;
380
+ }
381
+ if (_FLAG_FULL(value) && _FLAG_PART(value2))
382
+ {
383
+ value->flag |= SCWS_WORD_PART;
384
+ if (_FLAG_MALLOC(value2))
385
+ free(value2);
386
+ return value;
387
+ }
388
+ if (_FLAG_MALLOC(value))
389
+ free(value);
390
+ }
391
+ }
392
+ }
393
+ return value2;
394
+ }
@@ -0,0 +1,73 @@
1
+ /**
2
+ * @file xdict (dictionary)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id$
6
+ */
7
+
8
+ #ifndef _SCWS_XDICT_20070528_H_
9
+ #define _SCWS_XDICT_20070528_H_
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
+ /* constant definitions: word flags (SCWS_WORD_*), single-character flags (SCWS_ZFLAG_*) and dictionary open modes */
16
+ #define SCWS_WORD_FULL 0x01 // multi-char: complete word
17
+ #define SCWS_WORD_PART 0x02 // multi-char: leading segment (prefix) of a word
18
+ #define SCWS_WORD_USED 0x04 // multi-char: already used
19
+ #define SCWS_WORD_RULE 0x08 // multi-char: recognized automatically
20
+ #define SCWS_WORD_LONG 0x10 // multi-char: long word composed of short words
21
+
22
+ #define SCWS_WORD_MALLOCED 0x80 // the xdict_query result must be released with free()
23
+
24
+ #define SCWS_ZFLAG_PUT 0x02 // single char: already used
25
+ #define SCWS_ZFLAG_N2 0x04 // single char: head of a two-character noun
26
+ #define SCWS_ZFLAG_NR2 0x08 // single char: word head that is also a two-character person name
27
+ #define SCWS_ZFLAG_WHEAD 0x10 // single char: word head
28
+ #define SCWS_ZFLAG_WPART 0x20 // single char: word tail or word middle
29
+ #define SCWS_ZFLAG_ENGLISH 0x40 // single char: English text embedded in the middle
30
+ #define SCWS_ZFLAG_SYMBOL 0x80 // single char: symbol series
31
+ #define SCWS_XDICT_PRIME 0x3ffd // number of trees in the dictionary structure: 16381
32
+
33
+ /* xdict open mode (bit flags passed as `mode` to xdict_open / xdict_add) */
34
+ #define SCWS_XDICT_XDB 1 // query through the opened xdb handle
35
+ #define SCWS_XDICT_MEM 2 // convert the xdb into an in-memory xtree and query that
36
+ #define SCWS_XDICT_TXT 4 // the file is a plain-text dictionary (xdict_add routes it to the text loader)
37
+ #define SCWS_XDICT_SET 4096 // set flag.
38
+
39
+ /* data structure for one dictionary word (12 bytes: two floats, one flag byte, three attr bytes) */
40
+ typedef struct scws_word
41
+ {
42
+ float tf; // 2nd column of a text-dict line, 1.0 when absent (presumably term frequency -- confirm)
43
+ float idf; // 3rd column of a text-dict line, 1.0 when absent (presumably inverse document frequency -- confirm)
44
+ unsigned char flag; // bit mask of SCWS_WORD_* values
45
+ char attr[3]; // up to two attribute characters plus NUL; the text loader defaults it to "@"
46
+ } word_st, *word_t;
47
+
48
+ typedef struct scws_xdict /* one node of a singly linked list of opened dictionaries */
49
+ {
50
+ void *xdict; // backing handle: an xtree_t when xmode is SCWS_XDICT_MEM, otherwise an xdb_t
51
+ int xmode; // SCWS_XDICT_MEM or SCWS_XDICT_XDB; tells how xdict must be queried and released
52
+ int ref; // hightman.20130110: refcount (zero to really free/close)
53
+ struct scws_xdict *next; // next dictionary in the list, NULL at the tail
54
+ } xdict_st, *xdict_t;
55
+
56
+ /* pub function (api): xdict_open returns a new single-node list (ref = 1) or NULL; xdict_close drops one reference from every node and frees the nodes that reach zero */
57
+ xdict_t xdict_open(const char *fpath, int mode);
58
+ void xdict_close(xdict_t xd);
59
+
60
+ /* fork xdict: add one reference to every node of the list and return the same head */
61
+ xdict_t xdict_fork(xdict_t xd);
62
+
63
+ /* add a new dict file in front of xd; returns the new list head on success, or xd unchanged on failure. ml (table indexed by byte value) is only used for SCWS_XDICT_TXT */
64
+ xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml);
65
+
66
+ /* NOW this is ThreadSafe function. NOTE(review): it can OR SCWS_WORD_PART into the flag of a returned entry, and a result flagged SCWS_WORD_MALLOCED must be free()d by the caller -- confirm the thread-safety claim for shared in-memory dicts */
67
+ word_t xdict_query(xdict_t xd, const char *key, int len);
68
+
69
+ #ifdef __cplusplus
70
+ }
71
+ #endif
72
+
73
+ #endif