scws4r 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +56 -0
- data/Rakefile +20 -0
- data/defaults/dict.utf8.xdb +0 -0
- data/defaults/rules.utf8.ini +291 -0
- data/ext/scws4r/Makefile +267 -0
- data/ext/scws4r/Makefile.am +15 -0
- data/ext/scws4r/charset.c +90 -0
- data/ext/scws4r/charset.h +14 -0
- data/ext/scws4r/config_win32.h +22 -0
- data/ext/scws4r/crc32.c +103 -0
- data/ext/scws4r/crc32.h +13 -0
- data/ext/scws4r/darray.c +35 -0
- data/ext/scws4r/darray.h +22 -0
- data/ext/scws4r/extconf.rb +3 -0
- data/ext/scws4r/lock.c +153 -0
- data/ext/scws4r/lock.h +44 -0
- data/ext/scws4r/pool.c +141 -0
- data/ext/scws4r/pool.h +53 -0
- data/ext/scws4r/rule.c +407 -0
- data/ext/scws4r/rule.h +83 -0
- data/ext/scws4r/scws.c +1581 -0
- data/ext/scws4r/scws.h +118 -0
- data/ext/scws4r/scws4r.c +207 -0
- data/ext/scws4r/scws4r.h +4 -0
- data/ext/scws4r/version.h.in +4 -0
- data/ext/scws4r/xdb.c +636 -0
- data/ext/scws4r/xdb.h +88 -0
- data/ext/scws4r/xdict.c +394 -0
- data/ext/scws4r/xdict.h +73 -0
- data/ext/scws4r/xtree.c +337 -0
- data/ext/scws4r/xtree.h +65 -0
- data/lib/scws4r/version.rb +5 -0
- data/lib/scws4r.rb +15 -0
- data/scws4r.gemspec +30 -0
- data/sig/scws.rbs +4 -0
- data/test.rb +16 -0
- metadata +88 -0
data/ext/scws4r/xdict.c
ADDED
@@ -0,0 +1,394 @@
|
|
1
|
+
/**
|
2
|
+
* @file xdict.c (dictionary query)
|
3
|
+
* @author Hightman Mar
|
4
|
+
* @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
|
5
|
+
* $Id$
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifdef HAVE_CONFIG_H
|
9
|
+
# include "config.h"
|
10
|
+
#endif
|
11
|
+
|
12
|
+
#ifdef WIN32
|
13
|
+
# include "config_win32.h"
|
14
|
+
#endif
|
15
|
+
|
16
|
+
#include "xdict.h"
|
17
|
+
#include "xtree.h"
|
18
|
+
#include "xdb.h"
|
19
|
+
#include "crc32.h"
|
20
|
+
#include <stdio.h>
|
21
|
+
#include <stdlib.h>
|
22
|
+
#include <string.h>
|
23
|
+
#include <unistd.h>
|
24
|
+
#ifndef WIN32
|
25
|
+
# include <sys/param.h>
|
26
|
+
#endif
|
27
|
+
#include <sys/types.h>
|
28
|
+
#include <sys/stat.h>
|
29
|
+
|
30
|
+
/* temp file format for TEXT xdb */
|
31
|
+
#if !defined(PATH_MAX) || (PATH_MAX < 1024)
|
32
|
+
# define XDICT_PATH_MAX 1024
|
33
|
+
#else
|
34
|
+
# define XDICT_PATH_MAX PATH_MAX
|
35
|
+
#endif
|
36
|
+
|
37
|
+
#ifdef HAVE_STRTOK_R
|
38
|
+
# define _strtok_r strtok_r
|
39
|
+
#else
|
40
|
+
|
41
|
+
/**
 * Fallback re-entrant tokenizer, compiled only when strtok_r() is missing.
 *
 * Splits a string into tokens separated by any byte listed in delim. The
 * input is modified in place: the delimiter that ends a token is overwritten
 * with '\0'.
 *
 * @param s      string to tokenize on the first call, NULL to continue
 * @param delim  NUL-terminated set of delimiter bytes
 * @param lasts  caller-owned cursor; set to NULL once the input is exhausted
 * @return the next token, or NULL when no token remains
 */
static char *_strtok_r(char *s, const char *delim, char **lasts)
{
    char *tok;

    if (s == NULL && (s = *lasts) == NULL)
        return NULL;

    /* skip leading delimiters */
    s += strspn(s, delim);
    if (*s == '\0')
    {
        /* no non-delimiter characters left */
        *lasts = NULL;
        return NULL;
    }

    /* the token runs up to the next delimiter or the end of the string */
    tok = s;
    s += strcspn(s, delim);
    if (*s == '\0')
        *lasts = NULL;
    else
    {
        *s = '\0';
        *lasts = s + 1;
    }
    return tok;
}
|
88
|
+
#endif
|
89
|
+
|
90
|
+
#ifdef WIN32
|
91
|
+
# include <direct.h>
|
92
|
+
|
93
|
+
static void _realpath(const char *src, char *dst)
|
94
|
+
{
|
95
|
+
int len = strlen(src);
|
96
|
+
if (strchr(src, ':') != NULL)
|
97
|
+
memcpy(dst, src, len + 1);
|
98
|
+
else
|
99
|
+
{
|
100
|
+
char *ptr;
|
101
|
+
getcwd(dst, XDICT_PATH_MAX - len - 2);
|
102
|
+
ptr = dst + strlen(dst);
|
103
|
+
*ptr++ = '/';
|
104
|
+
memcpy(ptr, src, len + 1);
|
105
|
+
}
|
106
|
+
}
|
107
|
+
#else
|
108
|
+
# define _realpath realpath
|
109
|
+
#endif
|
110
|
+
|
111
|
+
/* open the text dict */
|
112
|
+
static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
|
113
|
+
{
|
114
|
+
xdict_t xd;
|
115
|
+
xtree_t xt;
|
116
|
+
char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
|
117
|
+
struct stat st1, st2;
|
118
|
+
|
119
|
+
// check the input filepath
|
120
|
+
_realpath(fpath, buf);
|
121
|
+
if (stat(buf, &st1) < 0)
|
122
|
+
return NULL;
|
123
|
+
|
124
|
+
// check dest file & orginal file, compare there mtime
|
125
|
+
#ifdef WIN32
|
126
|
+
{
|
127
|
+
char *tmp_ptr;
|
128
|
+
GetTempPath(sizeof(tmpfile) - 20, tmpfile);
|
129
|
+
tmp_ptr = tmpfile + strlen(tmpfile);
|
130
|
+
if (tmp_ptr[-1] == '\\') tmp_ptr--;
|
131
|
+
sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
|
132
|
+
}
|
133
|
+
#else
|
134
|
+
sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
|
135
|
+
#endif
|
136
|
+
if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
|
137
|
+
{
|
138
|
+
xdb_t x;
|
139
|
+
if ((x = xdb_open(tmpfile, 'r')) != NULL)
|
140
|
+
{
|
141
|
+
xd = (xdict_t) malloc(sizeof(xdict_st));
|
142
|
+
memset(xd, 0, sizeof(xdict_st));
|
143
|
+
xd->ref = 1;
|
144
|
+
|
145
|
+
if (mode & SCWS_XDICT_MEM)
|
146
|
+
{
|
147
|
+
/* convert the xdb(disk) -> xtree(memory) */
|
148
|
+
if ((xt = xdb_to_xtree(x, NULL)) != NULL)
|
149
|
+
{
|
150
|
+
xdb_close(x);
|
151
|
+
xd->xdict = (void *) xt;
|
152
|
+
xd->xmode = SCWS_XDICT_MEM;
|
153
|
+
return xd;
|
154
|
+
}
|
155
|
+
}
|
156
|
+
xd->xmode = SCWS_XDICT_XDB;
|
157
|
+
xd->xdict = (void *) x;
|
158
|
+
return xd;
|
159
|
+
}
|
160
|
+
}
|
161
|
+
|
162
|
+
// create xtree
|
163
|
+
if ((xt = xtree_new(0, 0)) == NULL)
|
164
|
+
return NULL;
|
165
|
+
else
|
166
|
+
{
|
167
|
+
int cl, kl;
|
168
|
+
FILE *fp;
|
169
|
+
word_st word, *w;
|
170
|
+
char *key, *part, *last, *delim = " \t\r\n";
|
171
|
+
|
172
|
+
// re-build the xdb file from text file
|
173
|
+
if ((fp = fopen(buf, "r")) == NULL)
|
174
|
+
return NULL;
|
175
|
+
|
176
|
+
// parse every line
|
177
|
+
word.attr[2] = '\0';
|
178
|
+
while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
|
179
|
+
{
|
180
|
+
// <word>[\t<tf>[\t<idf>[\t<attr>]]]
|
181
|
+
if (buf[0] == ';' || buf[0] == '#') continue;
|
182
|
+
|
183
|
+
key = _strtok_r(buf, delim, &last);
|
184
|
+
if (key == NULL) continue;
|
185
|
+
kl = strlen(key);
|
186
|
+
|
187
|
+
// init the word
|
188
|
+
do
|
189
|
+
{
|
190
|
+
word.tf = word.idf = 1.0;
|
191
|
+
word.flag = SCWS_WORD_FULL;
|
192
|
+
word.attr[0] = '@';
|
193
|
+
word.attr[1] = '\0';
|
194
|
+
|
195
|
+
if (!(part = _strtok_r(NULL, delim, &last))) break;
|
196
|
+
word.tf = (float) atof(part);
|
197
|
+
|
198
|
+
if (!(part = _strtok_r(NULL, delim, &last))) break;
|
199
|
+
word.idf = (float) atof(part);
|
200
|
+
|
201
|
+
if (part = _strtok_r(NULL, delim, &last))
|
202
|
+
{
|
203
|
+
word.attr[0] = part[0];
|
204
|
+
if (part[1]) word.attr[1] = part[1];
|
205
|
+
}
|
206
|
+
}
|
207
|
+
while (0);
|
208
|
+
|
209
|
+
// save into xtree
|
210
|
+
if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
|
211
|
+
{
|
212
|
+
w = (word_st *) pmalloc(xt->p, sizeof(word_st));
|
213
|
+
memcpy(w, &word, sizeof(word));
|
214
|
+
xtree_nput(xt, w, sizeof(word), key, kl);
|
215
|
+
}
|
216
|
+
else
|
217
|
+
{
|
218
|
+
w->tf = word.tf;
|
219
|
+
w->idf = word.idf;
|
220
|
+
w->flag |= word.flag;
|
221
|
+
strcpy(w->attr, word.attr);
|
222
|
+
}
|
223
|
+
|
224
|
+
// parse the part
|
225
|
+
cl = ml[(unsigned char) (key[0])];
|
226
|
+
while (1)
|
227
|
+
{
|
228
|
+
cl += ml[(unsigned char) (key[cl])];
|
229
|
+
if (cl >= kl) break;
|
230
|
+
|
231
|
+
if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
|
232
|
+
w->flag |= SCWS_WORD_PART;
|
233
|
+
else
|
234
|
+
{
|
235
|
+
w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
|
236
|
+
w->flag = SCWS_WORD_PART;
|
237
|
+
xtree_nput(xt, w, sizeof(word), key, cl);
|
238
|
+
}
|
239
|
+
}
|
240
|
+
}
|
241
|
+
fclose(fp);
|
242
|
+
|
243
|
+
// optimize the xtree & save to xdb
|
244
|
+
xtree_optimize(xt);
|
245
|
+
unlink(tmpfile);
|
246
|
+
xtree_to_xdb(xt, tmpfile);
|
247
|
+
chmod(tmpfile, 0777);
|
248
|
+
|
249
|
+
// return xtree
|
250
|
+
xd = (xdict_t) malloc(sizeof(xdict_st));
|
251
|
+
memset(xd, 0, sizeof(xdict_st));
|
252
|
+
xd->ref = 1;
|
253
|
+
xd->xdict = (void *) xt;
|
254
|
+
xd->xmode = SCWS_XDICT_MEM;
|
255
|
+
return xd;
|
256
|
+
}
|
257
|
+
}
|
258
|
+
|
259
|
+
/* setup & open the dict */
|
260
|
+
xdict_t xdict_open(const char *fpath, int mode)
|
261
|
+
{
|
262
|
+
xdict_t xd;
|
263
|
+
xdb_t x;
|
264
|
+
|
265
|
+
if (!(x = xdb_open(fpath, 'r')))
|
266
|
+
return NULL;
|
267
|
+
|
268
|
+
xd = (xdict_t) malloc(sizeof(xdict_st));
|
269
|
+
memset(xd, 0, sizeof(xdict_st));
|
270
|
+
xd->ref = 1;
|
271
|
+
if (mode & SCWS_XDICT_MEM)
|
272
|
+
{
|
273
|
+
xtree_t xt;
|
274
|
+
|
275
|
+
/* convert the xdb(disk) -> xtree(memory) */
|
276
|
+
if ((xt = xdb_to_xtree(x, NULL)) != NULL)
|
277
|
+
{
|
278
|
+
xdb_close(x);
|
279
|
+
xd->xdict = (void *) xt;
|
280
|
+
xd->xmode = SCWS_XDICT_MEM;
|
281
|
+
return xd;
|
282
|
+
}
|
283
|
+
}
|
284
|
+
|
285
|
+
xd->xmode = SCWS_XDICT_XDB;
|
286
|
+
xd->xdict = (void *) x;
|
287
|
+
return xd;
|
288
|
+
}
|
289
|
+
|
290
|
+
/* add a dict */
|
291
|
+
xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml)
|
292
|
+
{
|
293
|
+
xdict_t xx;
|
294
|
+
|
295
|
+
xx = (mode & SCWS_XDICT_TXT ? _xdict_open_txt(fpath, mode, ml) : xdict_open(fpath, mode));
|
296
|
+
if (xx != NULL)
|
297
|
+
{
|
298
|
+
xx->next = xd;
|
299
|
+
return xx;
|
300
|
+
}
|
301
|
+
return xd;
|
302
|
+
}
|
303
|
+
|
304
|
+
/* fork the dict */
|
305
|
+
xdict_t xdict_fork(xdict_t xd)
|
306
|
+
{
|
307
|
+
xdict_t xx;
|
308
|
+
for (xx = xd; xx != NULL; xx = xx->next)
|
309
|
+
{
|
310
|
+
xx->ref++;
|
311
|
+
}
|
312
|
+
return xd;
|
313
|
+
}
|
314
|
+
|
315
|
+
/* close the dict */
|
316
|
+
void xdict_close(xdict_t xd)
|
317
|
+
{
|
318
|
+
xdict_t xx;
|
319
|
+
|
320
|
+
while ((xx = xd) != NULL)
|
321
|
+
{
|
322
|
+
xd = xx->next;
|
323
|
+
xx->ref--;
|
324
|
+
if (xx->ref == 0)
|
325
|
+
{
|
326
|
+
if (xx->xmode == SCWS_XDICT_MEM)
|
327
|
+
xtree_free((xtree_t) xx->xdict);
|
328
|
+
else
|
329
|
+
{
|
330
|
+
xdb_close((xdb_t) xx->xdict);
|
331
|
+
}
|
332
|
+
free(xx);
|
333
|
+
}
|
334
|
+
}
|
335
|
+
}
|
336
|
+
|
337
|
+
/* query the word */
|
338
|
+
#define _FLAG_BOTH(x) (((x)->flag & (SCWS_WORD_PART|SCWS_WORD_FULL)) == (SCWS_WORD_PART|SCWS_WORD_FULL))
|
339
|
+
#define _FLAG_FULL(x) ((x)->flag & SCWS_WORD_FULL)
|
340
|
+
#define _FLAG_PART(x) ((x)->flag & SCWS_WORD_PART)
|
341
|
+
#define _FLAG_MALLOC(x) ((x)->flag & SCWS_WORD_MALLOCED)
|
342
|
+
|
343
|
+
word_t xdict_query(xdict_t xd, const char *key, int len)
|
344
|
+
{
|
345
|
+
word_t value, value2;
|
346
|
+
|
347
|
+
value = value2 = NULL;
|
348
|
+
while (xd != NULL)
|
349
|
+
{
|
350
|
+
if (xd->xmode == SCWS_XDICT_MEM)
|
351
|
+
{
|
352
|
+
/* this is ThreadSafe, recommend. */
|
353
|
+
value = (word_t) xtree_nget((xtree_t) xd->xdict, key, len, NULL);
|
354
|
+
}
|
355
|
+
else
|
356
|
+
{
|
357
|
+
/* the value malloced in lib-XDB. free required */
|
358
|
+
value = (word_t) xdb_nget((xdb_t) xd->xdict, key, len, NULL);
|
359
|
+
if (value != NULL) value->flag |= SCWS_WORD_MALLOCED;
|
360
|
+
}
|
361
|
+
xd = xd->next;
|
362
|
+
|
363
|
+
// check value2
|
364
|
+
if (value != NULL)
|
365
|
+
{
|
366
|
+
if (value2 == NULL)
|
367
|
+
{
|
368
|
+
if (_FLAG_BOTH(value))
|
369
|
+
return value;
|
370
|
+
value2 = value;
|
371
|
+
}
|
372
|
+
else
|
373
|
+
{
|
374
|
+
if (_FLAG_FULL(value2) && _FLAG_PART(value))
|
375
|
+
{
|
376
|
+
value2->flag |= SCWS_WORD_PART;
|
377
|
+
if (_FLAG_MALLOC(value))
|
378
|
+
free(value);
|
379
|
+
return value2;
|
380
|
+
}
|
381
|
+
if (_FLAG_FULL(value) && _FLAG_PART(value2))
|
382
|
+
{
|
383
|
+
value->flag |= SCWS_WORD_PART;
|
384
|
+
if (_FLAG_MALLOC(value2))
|
385
|
+
free(value2);
|
386
|
+
return value;
|
387
|
+
}
|
388
|
+
if (_FLAG_MALLOC(value))
|
389
|
+
free(value);
|
390
|
+
}
|
391
|
+
}
|
392
|
+
}
|
393
|
+
return value2;
|
394
|
+
}
|
data/ext/scws4r/xdict.h
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
/**
|
2
|
+
* @file xdict (dictionary)
|
3
|
+
* @author Hightman Mar
|
4
|
+
* @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
|
5
|
+
* $Id$
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef _SCWS_XDICT_20070528_H_
|
9
|
+
#define _SCWS_XDICT_20070528_H_
|
10
|
+
|
11
|
+
#ifdef __cplusplus
|
12
|
+
extern "C" {
|
13
|
+
#endif
|
14
|
+
|
15
|
+
/* flags of dictionary words (multi-character) */
#define SCWS_WORD_FULL      0x01    /* multi-char: complete word */
#define SCWS_WORD_PART      0x02    /* multi-char: leading segment of a longer word */
#define SCWS_WORD_USED      0x04    /* multi-char: already used */
#define SCWS_WORD_RULE      0x08    /* multi-char: recognized automatically */
#define SCWS_WORD_LONG      0x10    /* multi-char: long word composed of short words */

#define SCWS_WORD_MALLOCED  0x80    /* xdict_query result must be released with free() */

/* flags of single characters */
#define SCWS_ZFLAG_PUT      0x02    /* single-char: already used */
#define SCWS_ZFLAG_N2       0x04    /* single-char: head of a two-character noun */
#define SCWS_ZFLAG_NR2      0x08    /* single-char: word head that is also a two-character personal name */
#define SCWS_ZFLAG_WHEAD    0x10    /* single-char: word head */
#define SCWS_ZFLAG_WPART    0x20    /* single-char: word tail or word middle */
#define SCWS_ZFLAG_ENGLISH  0x40    /* single-char: English letters embedded in the middle */
#define SCWS_ZFLAG_SYMBOL   0x80    /* single-char: symbol series */
#define SCWS_XDICT_PRIME    0x3ffd  /* number of dictionary structure trees: 16381 */

/* xdict open mode */
#define SCWS_XDICT_XDB      1
#define SCWS_XDICT_MEM      2
#define SCWS_XDICT_TXT      4       /* ... */
#define SCWS_XDICT_SET      4096    /* set flag. */
|
38
|
+
|
39
|
+
/* dictionary word record (12 bytes) */
struct scws_word
{
    float tf;               /* value of the <tf> column of the dictionary */
    float idf;              /* value of the <idf> column of the dictionary */
    unsigned char flag;     /* SCWS_WORD_* bit flags */
    char attr[3];           /* up to two attribute bytes plus the terminating NUL */
};
typedef struct scws_word word_st;
typedef struct scws_word *word_t;
|
47
|
+
|
48
|
+
/* one node of a singly linked chain of opened dictionaries */
struct scws_xdict
{
    void *xdict;                /* backing handle: an xtree_t in SCWS_XDICT_MEM mode, otherwise an xdb_t */
    int xmode;                  /* SCWS_XDICT_MEM or SCWS_XDICT_XDB */
    int ref;                    /* hightman.20130110: refcount (zero to really free/close) */
    struct scws_xdict *next;    /* next dictionary in the chain, NULL at the end */
};
typedef struct scws_xdict xdict_st;
typedef struct scws_xdict *xdict_t;
|
55
|
+
|
56
|
+
/* pub function (api) */
|
57
|
+
xdict_t xdict_open(const char *fpath, int mode);
|
58
|
+
void xdict_close(xdict_t xd);
|
59
|
+
|
60
|
+
/* fork xdict */
|
61
|
+
xdict_t xdict_fork(xdict_t xd);
|
62
|
+
|
63
|
+
/* add a new dict file in front of xd; returns the new chain head on success, or xd unchanged on failure. ml (mblen table) is only used for XDICT_TXT */
|
64
|
+
xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml);
|
65
|
+
|
66
|
+
/* NOW this is ThreadSafe function */
|
67
|
+
word_t xdict_query(xdict_t xd, const char *key, int len);
|
68
|
+
|
69
|
+
#ifdef __cplusplus
|
70
|
+
}
|
71
|
+
#endif
|
72
|
+
|
73
|
+
#endif
|