scws4r 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +56 -0
- data/Rakefile +20 -0
- data/defaults/dict.utf8.xdb +0 -0
- data/defaults/rules.utf8.ini +291 -0
- data/ext/scws4r/Makefile +267 -0
- data/ext/scws4r/Makefile.am +15 -0
- data/ext/scws4r/charset.c +90 -0
- data/ext/scws4r/charset.h +14 -0
- data/ext/scws4r/config_win32.h +22 -0
- data/ext/scws4r/crc32.c +103 -0
- data/ext/scws4r/crc32.h +13 -0
- data/ext/scws4r/darray.c +35 -0
- data/ext/scws4r/darray.h +22 -0
- data/ext/scws4r/extconf.rb +3 -0
- data/ext/scws4r/lock.c +153 -0
- data/ext/scws4r/lock.h +44 -0
- data/ext/scws4r/pool.c +141 -0
- data/ext/scws4r/pool.h +53 -0
- data/ext/scws4r/rule.c +407 -0
- data/ext/scws4r/rule.h +83 -0
- data/ext/scws4r/scws.c +1581 -0
- data/ext/scws4r/scws.h +118 -0
- data/ext/scws4r/scws4r.c +207 -0
- data/ext/scws4r/scws4r.h +4 -0
- data/ext/scws4r/version.h.in +4 -0
- data/ext/scws4r/xdb.c +636 -0
- data/ext/scws4r/xdb.h +88 -0
- data/ext/scws4r/xdict.c +394 -0
- data/ext/scws4r/xdict.h +73 -0
- data/ext/scws4r/xtree.c +337 -0
- data/ext/scws4r/xtree.h +65 -0
- data/lib/scws4r/version.rb +5 -0
- data/lib/scws4r.rb +15 -0
- data/scws4r.gemspec +30 -0
- data/sig/scws.rbs +4 -0
- data/test.rb +16 -0
- metadata +88 -0
data/ext/scws4r/xdict.c
ADDED
@@ -0,0 +1,394 @@
|
|
1
|
+
/**
|
2
|
+
* @file xdict.c (dictionary query)
|
3
|
+
* @author Hightman Mar
|
4
|
+
* @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
|
5
|
+
* $Id$
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifdef HAVE_CONFIG_H
|
9
|
+
# include "config.h"
|
10
|
+
#endif
|
11
|
+
|
12
|
+
#ifdef WIN32
|
13
|
+
# include "config_win32.h"
|
14
|
+
#endif
|
15
|
+
|
16
|
+
#include "xdict.h"
|
17
|
+
#include "xtree.h"
|
18
|
+
#include "xdb.h"
|
19
|
+
#include "crc32.h"
|
20
|
+
#include <stdio.h>
|
21
|
+
#include <stdlib.h>
|
22
|
+
#include <string.h>
|
23
|
+
#include <unistd.h>
|
24
|
+
#ifndef WIN32
|
25
|
+
# include <sys/param.h>
|
26
|
+
#endif
|
27
|
+
#include <sys/types.h>
|
28
|
+
#include <sys/stat.h>
|
29
|
+
|
30
|
+
/* temp file format for TEXT xdb */
|
31
|
+
#if !defined(PATH_MAX) || (PATH_MAX < 1024)
|
32
|
+
# define XDICT_PATH_MAX 1024
|
33
|
+
#else
|
34
|
+
# define XDICT_PATH_MAX PATH_MAX
|
35
|
+
#endif
|
36
|
+
|
37
|
+
#ifdef HAVE_STRTOK_R
|
38
|
+
# define _strtok_r strtok_r
|
39
|
+
#else
|
40
|
+
|
41
|
+
/*
 * Fallback tokenizer used when the platform has no strtok_r().
 *
 * s      string to tokenize on the first call, NULL to continue from *lasts
 * delim  NUL-terminated set of delimiter characters
 * lasts  caller-owned cursor; set to the byte after the token's terminator,
 *        or to NULL once the end of the string has been reached
 *
 * Returns the next token (NUL-terminated in place inside the input string),
 * or NULL when no token is left.
 */
static char *_strtok_r(char *s, char *delim, char **lasts)
{
    char *tok;
    char *end;

    if (s == NULL)
    {
        s = *lasts;
        if (s == NULL)
            return NULL;
    }

    /* skip any run of leading delimiters */
    tok = s + strspn(s, delim);
    if (*tok == '\0')
    {
        /* nothing but delimiters was left */
        *lasts = NULL;
        return NULL;
    }

    /* the token runs up to the next delimiter or the end of the string */
    end = tok + strcspn(tok, delim);
    if (*end == '\0')
        *lasts = NULL;
    else
    {
        *end = '\0';
        *lasts = end + 1;
    }
    return tok;
}
|
88
|
+
#endif
|
89
|
+
|
90
|
+
#ifdef WIN32
|
91
|
+
# include <direct.h>
|
92
|
+
|
93
|
+
static void _realpath(const char *src, char *dst)
|
94
|
+
{
|
95
|
+
int len = strlen(src);
|
96
|
+
if (strchr(src, ':') != NULL)
|
97
|
+
memcpy(dst, src, len + 1);
|
98
|
+
else
|
99
|
+
{
|
100
|
+
char *ptr;
|
101
|
+
getcwd(dst, XDICT_PATH_MAX - len - 2);
|
102
|
+
ptr = dst + strlen(dst);
|
103
|
+
*ptr++ = '/';
|
104
|
+
memcpy(ptr, src, len + 1);
|
105
|
+
}
|
106
|
+
}
|
107
|
+
#else
|
108
|
+
# define _realpath realpath
|
109
|
+
#endif
|
110
|
+
|
111
|
+
/* open the text dict */
|
112
|
+
static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
|
113
|
+
{
|
114
|
+
xdict_t xd;
|
115
|
+
xtree_t xt;
|
116
|
+
char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
|
117
|
+
struct stat st1, st2;
|
118
|
+
|
119
|
+
// check the input filepath
|
120
|
+
_realpath(fpath, buf);
|
121
|
+
if (stat(buf, &st1) < 0)
|
122
|
+
return NULL;
|
123
|
+
|
124
|
+
// check dest file & orginal file, compare there mtime
|
125
|
+
#ifdef WIN32
|
126
|
+
{
|
127
|
+
char *tmp_ptr;
|
128
|
+
GetTempPath(sizeof(tmpfile) - 20, tmpfile);
|
129
|
+
tmp_ptr = tmpfile + strlen(tmpfile);
|
130
|
+
if (tmp_ptr[-1] == '\\') tmp_ptr--;
|
131
|
+
sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
|
132
|
+
}
|
133
|
+
#else
|
134
|
+
sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
|
135
|
+
#endif
|
136
|
+
if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
|
137
|
+
{
|
138
|
+
xdb_t x;
|
139
|
+
if ((x = xdb_open(tmpfile, 'r')) != NULL)
|
140
|
+
{
|
141
|
+
xd = (xdict_t) malloc(sizeof(xdict_st));
|
142
|
+
memset(xd, 0, sizeof(xdict_st));
|
143
|
+
xd->ref = 1;
|
144
|
+
|
145
|
+
if (mode & SCWS_XDICT_MEM)
|
146
|
+
{
|
147
|
+
/* convert the xdb(disk) -> xtree(memory) */
|
148
|
+
if ((xt = xdb_to_xtree(x, NULL)) != NULL)
|
149
|
+
{
|
150
|
+
xdb_close(x);
|
151
|
+
xd->xdict = (void *) xt;
|
152
|
+
xd->xmode = SCWS_XDICT_MEM;
|
153
|
+
return xd;
|
154
|
+
}
|
155
|
+
}
|
156
|
+
xd->xmode = SCWS_XDICT_XDB;
|
157
|
+
xd->xdict = (void *) x;
|
158
|
+
return xd;
|
159
|
+
}
|
160
|
+
}
|
161
|
+
|
162
|
+
// create xtree
|
163
|
+
if ((xt = xtree_new(0, 0)) == NULL)
|
164
|
+
return NULL;
|
165
|
+
else
|
166
|
+
{
|
167
|
+
int cl, kl;
|
168
|
+
FILE *fp;
|
169
|
+
word_st word, *w;
|
170
|
+
char *key, *part, *last, *delim = " \t\r\n";
|
171
|
+
|
172
|
+
// re-build the xdb file from text file
|
173
|
+
if ((fp = fopen(buf, "r")) == NULL)
|
174
|
+
return NULL;
|
175
|
+
|
176
|
+
// parse every line
|
177
|
+
word.attr[2] = '\0';
|
178
|
+
while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
|
179
|
+
{
|
180
|
+
// <word>[\t<tf>[\t<idf>[\t<attr>]]]
|
181
|
+
if (buf[0] == ';' || buf[0] == '#') continue;
|
182
|
+
|
183
|
+
key = _strtok_r(buf, delim, &last);
|
184
|
+
if (key == NULL) continue;
|
185
|
+
kl = strlen(key);
|
186
|
+
|
187
|
+
// init the word
|
188
|
+
do
|
189
|
+
{
|
190
|
+
word.tf = word.idf = 1.0;
|
191
|
+
word.flag = SCWS_WORD_FULL;
|
192
|
+
word.attr[0] = '@';
|
193
|
+
word.attr[1] = '\0';
|
194
|
+
|
195
|
+
if (!(part = _strtok_r(NULL, delim, &last))) break;
|
196
|
+
word.tf = (float) atof(part);
|
197
|
+
|
198
|
+
if (!(part = _strtok_r(NULL, delim, &last))) break;
|
199
|
+
word.idf = (float) atof(part);
|
200
|
+
|
201
|
+
if (part = _strtok_r(NULL, delim, &last))
|
202
|
+
{
|
203
|
+
word.attr[0] = part[0];
|
204
|
+
if (part[1]) word.attr[1] = part[1];
|
205
|
+
}
|
206
|
+
}
|
207
|
+
while (0);
|
208
|
+
|
209
|
+
// save into xtree
|
210
|
+
if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
|
211
|
+
{
|
212
|
+
w = (word_st *) pmalloc(xt->p, sizeof(word_st));
|
213
|
+
memcpy(w, &word, sizeof(word));
|
214
|
+
xtree_nput(xt, w, sizeof(word), key, kl);
|
215
|
+
}
|
216
|
+
else
|
217
|
+
{
|
218
|
+
w->tf = word.tf;
|
219
|
+
w->idf = word.idf;
|
220
|
+
w->flag |= word.flag;
|
221
|
+
strcpy(w->attr, word.attr);
|
222
|
+
}
|
223
|
+
|
224
|
+
// parse the part
|
225
|
+
cl = ml[(unsigned char) (key[0])];
|
226
|
+
while (1)
|
227
|
+
{
|
228
|
+
cl += ml[(unsigned char) (key[cl])];
|
229
|
+
if (cl >= kl) break;
|
230
|
+
|
231
|
+
if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
|
232
|
+
w->flag |= SCWS_WORD_PART;
|
233
|
+
else
|
234
|
+
{
|
235
|
+
w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
|
236
|
+
w->flag = SCWS_WORD_PART;
|
237
|
+
xtree_nput(xt, w, sizeof(word), key, cl);
|
238
|
+
}
|
239
|
+
}
|
240
|
+
}
|
241
|
+
fclose(fp);
|
242
|
+
|
243
|
+
// optimize the xtree & save to xdb
|
244
|
+
xtree_optimize(xt);
|
245
|
+
unlink(tmpfile);
|
246
|
+
xtree_to_xdb(xt, tmpfile);
|
247
|
+
chmod(tmpfile, 0777);
|
248
|
+
|
249
|
+
// return xtree
|
250
|
+
xd = (xdict_t) malloc(sizeof(xdict_st));
|
251
|
+
memset(xd, 0, sizeof(xdict_st));
|
252
|
+
xd->ref = 1;
|
253
|
+
xd->xdict = (void *) xt;
|
254
|
+
xd->xmode = SCWS_XDICT_MEM;
|
255
|
+
return xd;
|
256
|
+
}
|
257
|
+
}
|
258
|
+
|
259
|
+
/* setup & open the dict */
|
260
|
+
xdict_t xdict_open(const char *fpath, int mode)
|
261
|
+
{
|
262
|
+
xdict_t xd;
|
263
|
+
xdb_t x;
|
264
|
+
|
265
|
+
if (!(x = xdb_open(fpath, 'r')))
|
266
|
+
return NULL;
|
267
|
+
|
268
|
+
xd = (xdict_t) malloc(sizeof(xdict_st));
|
269
|
+
memset(xd, 0, sizeof(xdict_st));
|
270
|
+
xd->ref = 1;
|
271
|
+
if (mode & SCWS_XDICT_MEM)
|
272
|
+
{
|
273
|
+
xtree_t xt;
|
274
|
+
|
275
|
+
/* convert the xdb(disk) -> xtree(memory) */
|
276
|
+
if ((xt = xdb_to_xtree(x, NULL)) != NULL)
|
277
|
+
{
|
278
|
+
xdb_close(x);
|
279
|
+
xd->xdict = (void *) xt;
|
280
|
+
xd->xmode = SCWS_XDICT_MEM;
|
281
|
+
return xd;
|
282
|
+
}
|
283
|
+
}
|
284
|
+
|
285
|
+
xd->xmode = SCWS_XDICT_XDB;
|
286
|
+
xd->xdict = (void *) x;
|
287
|
+
return xd;
|
288
|
+
}
|
289
|
+
|
290
|
+
/* add a dict */
|
291
|
+
/*
 * Open one more dictionary and put it in front of an existing chain.
 *
 * xd     current head of the chain (may be NULL)
 * fpath  dictionary file to open
 * mode   open-mode bits; SCWS_XDICT_TXT selects the text loader, anything
 *        else goes through xdict_open()
 * ml     byte-length table, forwarded to the text loader only
 *
 * Returns the new head of the chain, or xd unchanged when the file could not
 * be opened.
 */
xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml)
{
    xdict_t head;

    if (mode & SCWS_XDICT_TXT)
        head = _xdict_open_txt(fpath, mode, ml);
    else
        head = xdict_open(fpath, mode);

    if (head == NULL)
        return xd;

    head->next = xd;
    return head;
}
|
303
|
+
|
304
|
+
/* fork the dict */
|
305
|
+
/*
 * Take an additional reference on every dictionary in the chain.
 *
 * Returns the same pointer that was passed in; each node's ref counter is
 * incremented by one, so a matching xdict_close() is needed per fork.
 */
xdict_t xdict_fork(xdict_t xd)
{
    xdict_t node = xd;

    while (node != NULL)
    {
        node->ref++;
        node = node->next;
    }
    return xd;
}
|
314
|
+
|
315
|
+
/* close the dict */
|
316
|
+
/*
 * Drop one reference from every dictionary in the chain.
 *
 * A node whose ref counter reaches zero has its backend released
 * (xtree_free() for SCWS_XDICT_MEM, xdb_close() otherwise) and is freed.
 */
void xdict_close(xdict_t xd)
{
    xdict_t node, following;

    for (node = xd; node != NULL; node = following)
    {
        /* remember the successor first: node may be freed below */
        following = node->next;

        node->ref--;
        if (node->ref != 0)
            continue;

        if (node->xmode == SCWS_XDICT_MEM)
            xtree_free((xtree_t) node->xdict);
        else
            xdb_close((xdb_t) node->xdict);
        free(node);
    }
}
|
336
|
+
|
337
|
+
/* query the word */
|
338
|
+
#define _FLAG_BOTH(x) (((x)->flag & (SCWS_WORD_PART|SCWS_WORD_FULL)) == (SCWS_WORD_PART|SCWS_WORD_FULL))
|
339
|
+
#define _FLAG_FULL(x) ((x)->flag & SCWS_WORD_FULL)
|
340
|
+
#define _FLAG_PART(x) ((x)->flag & SCWS_WORD_PART)
|
341
|
+
#define _FLAG_MALLOC(x) ((x)->flag & SCWS_WORD_MALLOCED)
|
342
|
+
|
343
|
+
word_t xdict_query(xdict_t xd, const char *key, int len)
|
344
|
+
{
|
345
|
+
word_t value, value2;
|
346
|
+
|
347
|
+
value = value2 = NULL;
|
348
|
+
while (xd != NULL)
|
349
|
+
{
|
350
|
+
if (xd->xmode == SCWS_XDICT_MEM)
|
351
|
+
{
|
352
|
+
/* this is ThreadSafe, recommend. */
|
353
|
+
value = (word_t) xtree_nget((xtree_t) xd->xdict, key, len, NULL);
|
354
|
+
}
|
355
|
+
else
|
356
|
+
{
|
357
|
+
/* the value malloced in lib-XDB. free required */
|
358
|
+
value = (word_t) xdb_nget((xdb_t) xd->xdict, key, len, NULL);
|
359
|
+
if (value != NULL) value->flag |= SCWS_WORD_MALLOCED;
|
360
|
+
}
|
361
|
+
xd = xd->next;
|
362
|
+
|
363
|
+
// check value2
|
364
|
+
if (value != NULL)
|
365
|
+
{
|
366
|
+
if (value2 == NULL)
|
367
|
+
{
|
368
|
+
if (_FLAG_BOTH(value))
|
369
|
+
return value;
|
370
|
+
value2 = value;
|
371
|
+
}
|
372
|
+
else
|
373
|
+
{
|
374
|
+
if (_FLAG_FULL(value2) && _FLAG_PART(value))
|
375
|
+
{
|
376
|
+
value2->flag |= SCWS_WORD_PART;
|
377
|
+
if (_FLAG_MALLOC(value))
|
378
|
+
free(value);
|
379
|
+
return value2;
|
380
|
+
}
|
381
|
+
if (_FLAG_FULL(value) && _FLAG_PART(value2))
|
382
|
+
{
|
383
|
+
value->flag |= SCWS_WORD_PART;
|
384
|
+
if (_FLAG_MALLOC(value2))
|
385
|
+
free(value2);
|
386
|
+
return value;
|
387
|
+
}
|
388
|
+
if (_FLAG_MALLOC(value))
|
389
|
+
free(value);
|
390
|
+
}
|
391
|
+
}
|
392
|
+
}
|
393
|
+
return value2;
|
394
|
+
}
|
data/ext/scws4r/xdict.h
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
/**
|
2
|
+
* @file xdict (dictionary)
|
3
|
+
* @author Hightman Mar
|
4
|
+
* @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
|
5
|
+
* $Id$
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifndef _SCWS_XDICT_20070528_H_
|
9
|
+
#define _SCWS_XDICT_20070528_H_
|
10
|
+
|
11
|
+
#ifdef __cplusplus
|
12
|
+
extern "C" {
|
13
|
+
#endif
|
14
|
+
|
15
|
+
/* constant var define */
|
16
|
+
/* word flags (multi-character words) */
#define SCWS_WORD_FULL      0x01    /* a complete word */
#define SCWS_WORD_PART      0x02    /* a leading segment (prefix) of a longer word */
#define SCWS_WORD_USED      0x04    /* already used */
#define SCWS_WORD_RULE      0x08    /* recognized automatically */
#define SCWS_WORD_LONG      0x10    /* long word composed of short words */

#define SCWS_WORD_MALLOCED  0x80    /* xdict_query result that must be released with free() */

/* single-character flags */
#define SCWS_ZFLAG_PUT      0x02    /* already used */
#define SCWS_ZFLAG_N2       0x04    /* head of a two-character noun */
#define SCWS_ZFLAG_NR2      0x08    /* word head that is also a two-character personal name */
#define SCWS_ZFLAG_WHEAD    0x10    /* word head */
#define SCWS_ZFLAG_WPART    0x20    /* word tail or word middle */
#define SCWS_ZFLAG_ENGLISH  0x40    /* English embedded in the middle */
#define SCWS_ZFLAG_SYMBOL   0x80    /* symbol series */

#define SCWS_XDICT_PRIME    0x3ffd  /* number of dictionary structure trees: 16381 */

/* xdict open mode */
#define SCWS_XDICT_XDB      1
#define SCWS_XDICT_MEM      2
#define SCWS_XDICT_TXT      4       /* ... */
#define SCWS_XDICT_SET      4096    /* set flag. */
|
38
|
+
|
39
|
+
/* data structure for word(12bytes) */
|
40
|
+
/*
 * One dictionary record.  The field order and sizes are part of the stored
 * format (records are copied byte-wise into the xtree/xdb), so do not
 * reorder them.
 */
struct scws_word
{
    float tf;               /* <tf> column of the dictionary (presumably term frequency) */
    float idf;              /* <idf> column of the dictionary (presumably inverse document frequency) */
    unsigned char flag;     /* SCWS_WORD_* bits */
    char attr[3];           /* up to two attribute characters plus a terminating NUL */
};

typedef struct scws_word word_st;
typedef struct scws_word *word_t;
|
47
|
+
|
48
|
+
/* One node of a singly linked chain of opened dictionaries. */
struct scws_xdict
{
    void *xdict;                /* backend handle: an xtree_t when xmode is SCWS_XDICT_MEM, an xdb_t otherwise */
    int xmode;                  /* SCWS_XDICT_MEM or SCWS_XDICT_XDB */
    int ref;                    /* hightman.20130110: refcount (zero to really free/close) */
    struct scws_xdict *next;    /* next dictionary in the chain, NULL at the end */
};

typedef struct scws_xdict xdict_st;
typedef struct scws_xdict *xdict_t;
|
55
|
+
|
56
|
+
/* pub function (api) */
|
57
|
+
xdict_t xdict_open(const char *fpath, int mode);
|
58
|
+
void xdict_close(xdict_t xd);
|
59
|
+
|
60
|
+
/* fork xdict */
|
61
|
+
xdict_t xdict_fork(xdict_t xd);
|
62
|
+
|
63
|
+
/* add a new dict file into xd, succ: 0, error: -1, Mblen only used for XDICT_TXT */
|
64
|
+
xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml);
|
65
|
+
|
66
|
+
/* NOW this is ThreadSafe function */
|
67
|
+
word_t xdict_query(xdict_t xd, const char *key, int len);
|
68
|
+
|
69
|
+
#ifdef __cplusplus
|
70
|
+
}
|
71
|
+
#endif
|
72
|
+
|
73
|
+
#endif
|