scws4r 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +56 -0
- data/Rakefile +20 -0
- data/defaults/dict.utf8.xdb +0 -0
- data/defaults/rules.utf8.ini +291 -0
- data/ext/scws4r/Makefile +267 -0
- data/ext/scws4r/Makefile.am +15 -0
- data/ext/scws4r/charset.c +90 -0
- data/ext/scws4r/charset.h +14 -0
- data/ext/scws4r/config_win32.h +22 -0
- data/ext/scws4r/crc32.c +103 -0
- data/ext/scws4r/crc32.h +13 -0
- data/ext/scws4r/darray.c +35 -0
- data/ext/scws4r/darray.h +22 -0
- data/ext/scws4r/extconf.rb +3 -0
- data/ext/scws4r/lock.c +153 -0
- data/ext/scws4r/lock.h +44 -0
- data/ext/scws4r/pool.c +141 -0
- data/ext/scws4r/pool.h +53 -0
- data/ext/scws4r/rule.c +407 -0
- data/ext/scws4r/rule.h +83 -0
- data/ext/scws4r/scws.c +1581 -0
- data/ext/scws4r/scws.h +118 -0
- data/ext/scws4r/scws4r.c +207 -0
- data/ext/scws4r/scws4r.h +4 -0
- data/ext/scws4r/version.h.in +4 -0
- data/ext/scws4r/xdb.c +636 -0
- data/ext/scws4r/xdb.h +88 -0
- data/ext/scws4r/xdict.c +394 -0
- data/ext/scws4r/xdict.h +73 -0
- data/ext/scws4r/xtree.c +337 -0
- data/ext/scws4r/xtree.h +65 -0
- data/lib/scws4r/version.rb +5 -0
- data/lib/scws4r.rb +15 -0
- data/scws4r.gemspec +30 -0
- data/sig/scws.rbs +4 -0
- data/test.rb +16 -0
- metadata +88 -0
data/ext/scws4r/scws.c
ADDED
@@ -0,0 +1,1581 @@
|
|
1
|
+
/*
|
2
|
+
* @file scws.c (core segment functions)
|
3
|
+
* @author Hightman Mar
|
4
|
+
* @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
|
5
|
+
* $Id $
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifdef HAVE_CONFIG_H
|
9
|
+
# include "config.h"
|
10
|
+
#endif
|
11
|
+
|
12
|
+
#ifdef WIN32
|
13
|
+
# include "config_win32.h"
|
14
|
+
#endif
|
15
|
+
|
16
|
+
#include "scws.h"
|
17
|
+
#include "xdict.h"
|
18
|
+
#include "rule.h"
|
19
|
+
#include "charset.h"
|
20
|
+
#include "darray.h"
|
21
|
+
#include "xtree.h"
|
22
|
+
#include <stdio.h>
|
23
|
+
#include <math.h>
|
24
|
+
#include <stdlib.h>
|
25
|
+
#include <string.h>
|
26
|
+
|
27
|
+
/*
 * Shorthand macros used throughout the segmenter.
 *
 * SCWS_IS_SPECIAL, SCWS_IS_NOSTATS and SCWS_CHARLEN expand to expressions
 * that reference a variable named `s`, so they can only be used where the
 * engine handle `s` is in scope.  The character-class macros evaluate their
 * argument more than once; do not pass expressions with side effects.
 */
#define SCWS_IS_SPECIAL(x,l)    scws_rule_checkbit(s->r,x,l,SCWS_RULE_SPECIAL)
#define SCWS_IS_NOSTATS(x,l)    scws_rule_checkbit(s->r,x,l,SCWS_RULE_NOSTATS)
#define SCWS_CHARLEN(x)         s->mblen[(x)]

/* ASCII character classes ('0'..'9', 'A'..'Z', 'a'..'z') */
#define SCWS_IS_DIGIT(x)        ((x)>='0'&&(x)<='9')
#define SCWS_IS_UALPHA(x)       ((x)>='A'&&(x)<='Z')
#define SCWS_IS_ALPHA(x)        (((x)>='A'&&(x)<='Z')||((x)>='a'&&(x)<='z'))
#define SCWS_IS_ALNUM(x)        (((x)>='0'&&(x)<='9')||((x)>='A'&&(x)<='Z')||((x)>='a'&&(x)<='z'))

/* tests on the per-character flag bits kept in wmap[i][i]->flag */
#define SCWS_IS_WHEAD(x)        ((x) & SCWS_ZFLAG_WHEAD)
#define SCWS_IS_ECHAR(x)        ((x) & SCWS_ZFLAG_ENGLISH)

/* true when a character must not take part in auto rules: it is a symbol or
 * english chunk, or it is a word head that is not flagged NR2 */
#define SCWS_NO_RULE1(x)        (((x) & (SCWS_ZFLAG_SYMBOL|SCWS_ZFLAG_ENGLISH))||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_NR2)) == SCWS_ZFLAG_WHEAD))
///#define SCWS_NO_RULE2(x)     (((x) & SCWS_ZFLAG_ENGLISH)||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_N2)) == SCWS_ZFLAG_WHEAD))
#define SCWS_NO_RULE2           SCWS_NO_RULE1

/* upper bound on the length of one single-byte (english/number) word */
#define SCWS_MAX_EWLEN          33
///hightman.070706: char token
#define SCWS_CHAR_TOKEN(x)      ((x)=='('||(x)==')'||(x)=='['||(x)==']'||(x)=='{'||(x)=='}'||(x)==':'||(x)=='"')
///hightman.070814: max zlen = ?? (4 * zlen * zlen = ??)
#define SCWS_MAX_ZLEN           128
/* idf assigned to a single-byte word of length x */
#define SCWS_EN_IDF(x)          (float)(2.5*logf(x))

/* two-letter attribute strings used for generated results */
static const char *attr_en = "en";
static const char *attr_un = "un";
static const char *attr_nr = "nr";
static const char *attr_na = "!";
|
51
|
+
|
52
|
+
/* create scws engine */
|
53
|
+
scws_t scws_new()
|
54
|
+
{
|
55
|
+
scws_t s;
|
56
|
+
s = (scws_t) malloc(sizeof(scws_st));
|
57
|
+
if (s == NULL)
|
58
|
+
return s;
|
59
|
+
memset(s, 0, sizeof(scws_st));
|
60
|
+
s->mblen = charset_table_get(NULL);
|
61
|
+
s->off = s->len = 0;
|
62
|
+
s->wend = -1;
|
63
|
+
|
64
|
+
return s;
|
65
|
+
}
|
66
|
+
|
67
|
+
/* hightman.110320: fork scws */
|
68
|
+
scws_t scws_fork(scws_t p)
|
69
|
+
{
|
70
|
+
scws_t s = scws_new();
|
71
|
+
|
72
|
+
if (p != NULL && s != NULL)
|
73
|
+
{
|
74
|
+
s->mblen = p->mblen;
|
75
|
+
s->mode = p->mode;
|
76
|
+
// fork dict/rules
|
77
|
+
s->r = scws_rule_fork(p->r);
|
78
|
+
s->d = xdict_fork(p->d);
|
79
|
+
}
|
80
|
+
|
81
|
+
return s;
|
82
|
+
}
|
83
|
+
|
84
|
+
/* close & free the engine */
|
85
|
+
void scws_free(scws_t s)
|
86
|
+
{
|
87
|
+
if (s->d)
|
88
|
+
{
|
89
|
+
xdict_close(s->d);
|
90
|
+
s->d = NULL;
|
91
|
+
}
|
92
|
+
if (s->r)
|
93
|
+
{
|
94
|
+
scws_rule_free(s->r);
|
95
|
+
s->r = NULL;
|
96
|
+
}
|
97
|
+
free(s);
|
98
|
+
}
|
99
|
+
|
100
|
+
/* add a dict into scws */
|
101
|
+
int scws_add_dict(scws_t s, const char *fpath, int mode)
|
102
|
+
{
|
103
|
+
xdict_t xx;
|
104
|
+
if (mode & SCWS_XDICT_SET)
|
105
|
+
{
|
106
|
+
xdict_close(s->d);
|
107
|
+
mode ^= SCWS_XDICT_SET;
|
108
|
+
s->d = NULL;
|
109
|
+
}
|
110
|
+
xx = s->d;
|
111
|
+
s->d = xdict_add(s->d, fpath, mode, s->mblen);
|
112
|
+
return (xx == s->d ? -1 : 0);
|
113
|
+
}
|
114
|
+
|
115
|
+
/* set the dict & open it */
|
116
|
+
int scws_set_dict(scws_t s, const char *fpath, int mode)
|
117
|
+
{
|
118
|
+
return scws_add_dict(s, fpath, mode | SCWS_XDICT_SET);
|
119
|
+
}
|
120
|
+
|
121
|
+
/* Select the byte-length table for charset `cs` (looked up through
 * charset_table_get) and store it on the engine. */
void scws_set_charset(scws_t s, const char *cs)
{
    unsigned char *table = charset_table_get(cs);

    s->mblen = table;
}
|
125
|
+
|
126
|
+
/* Replace the engine's rule set with the one loaded from `fpath`; a rule set
 * already attached is freed first. */
void scws_set_rule(scws_t s, const char *fpath)
{
    if (s->r)
    {
        scws_rule_free(s->r);
    }

    s->r = scws_rule_new(fpath, s->mblen);
}
|
133
|
+
|
134
|
+
/* set ignore symbol or multi segments */
|
135
|
+
void scws_set_ignore(scws_t s, int yes)
|
136
|
+
{
|
137
|
+
if (yes == SCWS_YEA)
|
138
|
+
s->mode |= SCWS_IGN_SYMBOL;
|
139
|
+
|
140
|
+
if (yes == SCWS_NA)
|
141
|
+
s->mode &= ~SCWS_IGN_SYMBOL;
|
142
|
+
}
|
143
|
+
|
144
|
+
/*
 * Select the multi-segment options.
 *
 * All SCWS_MULTI_MASK bits are cleared and then replaced by the masked bits
 * of `mode`.  Bits of `mode` outside SCWS_MULTI_MASK are ignored so that a
 * stray value cannot flip unrelated engine flags.
 */
void scws_set_multi(scws_t s, int mode)
{
    s->mode &= ~SCWS_MULTI_MASK;
    s->mode |= (mode & SCWS_MULTI_MASK);
}
|
151
|
+
|
152
|
+
/* SCWS_YEA switches SCWS_DEBUG on, SCWS_NA switches it off; any other value
 * leaves the mode untouched. */
void scws_set_debug(scws_t s, int yes)
{
    if (yes == SCWS_YEA)
    {
        s->mode |= SCWS_DEBUG;
    }

    if (yes == SCWS_NA)
    {
        s->mode &= ~SCWS_DEBUG;
    }
}
|
160
|
+
|
161
|
+
/* SCWS_YEA switches SCWS_DUALITY on, SCWS_NA switches it off; any other value
 * leaves the mode untouched. */
void scws_set_duality(scws_t s, int yes)
{
    if (yes == SCWS_YEA)
    {
        s->mode |= SCWS_DUALITY;
    }

    if (yes == SCWS_NA)
    {
        s->mode &= ~SCWS_DUALITY;
    }
}
|
169
|
+
|
170
|
+
/* send the text buffer & init some others */
|
171
|
+
void scws_send_text(scws_t s, const char *text, int len)
|
172
|
+
{
|
173
|
+
s->txt = (unsigned char *) text;
|
174
|
+
s->len = len;
|
175
|
+
s->off = 0;
|
176
|
+
}
|
177
|
+
|
178
|
+
/*
 * Append one result node to the engine's result list.
 *
 *   o  byte offset of the word in the text
 *   i  idf value
 *   l  byte length of the word
 *   a  attribute string; only its first two characters are copied
 *
 * Requires the engine handle `s` to be in scope.  s->res0 is the list head
 * and s->res1 the tail.  If the node cannot be allocated the result is
 * silently dropped (and the arguments are not evaluated) instead of
 * dereferencing NULL.
 */
#define SCWS_PUT_RES(o,i,l,a)                                   \
do {                                                            \
    scws_res_t res;                                             \
    res = (scws_res_t) malloc(sizeof(struct scws_result));      \
    if (res == NULL)                                            \
        break;                                                  \
    res->off = (o);                                             \
    res->idf = (i);                                             \
    res->len = (l);                                             \
    strncpy(res->attr, (a), 2);                                 \
    res->attr[2] = '\0';                                        \
    res->next = NULL;                                           \
    if (s->res1 == NULL)                                        \
        s->res1 = s->res0 = res;                                \
    else                                                        \
    {                                                           \
        s->res1->next = res;                                    \
        s->res1 = res;                                          \
    }                                                           \
} while(0)
|
197
|
+
|
198
|
+
/* single bytes segment (pure single-byte characters): parse-state flag bits */
#define PFLAG_WITH_MB       (1 << 0)    /* 0x01 */
#define PFLAG_ALNUM         (1 << 1)    /* 0x02 */
#define PFLAG_VALID         (1 << 2)    /* 0x04 */
#define PFLAG_DIGIT         (1 << 3)    /* 0x08 */
#define PFLAG_ADDSYM        (1 << 4)    /* 0x10 */
#define PFLAG_ALPHA         (1 << 5)    /* 0x20 */
#define PFLAG_LONGDIGIT     (1 << 6)    /* 0x40 */
#define PFLAG_LONGALPHA     (1 << 7)    /* 0x80 */
|
207
|
+
|
208
|
+
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * lower-case letters to upper case.
 *
 * `dst` may be the same buffer as `src` (in-place conversion).  When it is a
 * different buffer it must hold at least strlen(src) + 1 bytes; the copy is
 * always NUL-terminated.
 */
static void _str_toupper(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        if (c >= 'a' && c <= 'z')
            c ^= 0x20;
        *dst = c;
    }
    /* terminate the copy; harmless no-op when converting in place */
    *dst = '\0';
}
|
217
|
+
|
218
|
+
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * upper-case letters to lower case.
 *
 * `dst` may be the same buffer as `src` (in-place conversion).  When it is a
 * different buffer it must hold at least strlen(src) + 1 bytes; the copy is
 * always NUL-terminated.
 */
static void _str_tolower(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        if (c >= 'A' && c <= 'Z')
            c ^= 0x20;
        *dst = c;
    }
    /* terminate the copy; harmless no-op when converting in place */
    *dst = '\0';
}
|
227
|
+
|
228
|
+
/*
 * _mem_ndup(src, len): heap copy of the first `len` bytes of `src` with a
 * terminating NUL appended.  Maps to strndup() when the platform has it.
 * Returns NULL when the allocation fails; the caller owns the copy and must
 * free() it.
 */
#ifdef HAVE_STRNDUP
#define _mem_ndup   strndup
#else
static inline void *_mem_ndup(const char *src, int len)
{
    char *dst;

    dst = malloc(len+1);
    if (dst == NULL)
        return NULL;

    memcpy(dst, src, len);
    dst[len] = '\0';
    return dst;
}
#endif
|
240
|
+
|
241
|
+
/*
 * Emit the sub-words of a mixed single-byte word (used for multi-duality).
 *
 * The word txt[start .. start+wlen) is cut into runs of one character class
 * each (digits / letters / everything else).  Every alphanumeric run is then
 * emitted as an "en" result; a run of length 1 is instead glued to a longer
 * alphanumeric neighbour where one exists.  Nothing is emitted when the word
 * consists of a single run.
 */
static void _scws_alnum_multi(scws_t s, int start, int wlen)
{
    char chunk[SCWS_MAX_EWLEN];     /* chunk[n] = byte length of the n-th run */
    int i, last, run_from, pos, cls, pflag;
    unsigned char *txt;
    float idf;

    txt = s->txt;
    pflag = 0;
    last = run_from = 0;

    /* pass 1: record the length of every completed run */
    for (i = 0; i < wlen; i++)
    {
        pos = txt[start + i];
        if (SCWS_IS_DIGIT(pos))
            cls = PFLAG_DIGIT;
        else if (SCWS_IS_ALPHA(pos))
            cls = PFLAG_ALPHA;
        else
            cls = PFLAG_ADDSYM;

        /* still inside the same class: the current run just grows */
        if (pflag & cls)
            continue;

        /* class changed: close the previous run (if there was one) */
        if (pflag != 0)
        {
            chunk[last++] = (char) (i - run_from);
            run_from = i;
        }
        pflag = cls;
    }

    /* a single run means there is nothing to split */
    if (last <= 0)
        return;

    /* close the final run; `last` is now the index of the last run */
    chunk[last] = (char) (i - run_from);

    /* pass 2: walk the runs, `pos` is the text offset of run i */
    for (i = 0, pos = start; i <= last; pos += chunk[i], i++)
    {
        /* runs of non-alphanumeric characters are never emitted */
        if (!SCWS_IS_ALNUM(txt[pos]))
            continue;

        if (chunk[i] != 1)
        {
            idf = SCWS_EN_IDF(chunk[i]);
            SCWS_PUT_RES(pos, idf, chunk[i], attr_en);
            continue;
        }

        /* single character: try to join the longer run before it ... */
        if (i > 0 && chunk[i-1] > 1 && !(i == 1 && i == last))
        {
            if (SCWS_IS_ALNUM(txt[pos-1]))
            {
                idf = SCWS_EN_IDF(chunk[i-1]+1);
                SCWS_PUT_RES(pos - chunk[i-1], idf, chunk[i-1]+1, attr_en);
            }
            else
            {
                idf = SCWS_EN_IDF(chunk[i]);
                SCWS_PUT_RES(pos, idf, chunk[i], attr_en);
            }
        }
        /* ... and the run after it */
        if (i < last && !(i == 0 && last == 1))
        {
            if (SCWS_IS_ALNUM(txt[pos+1]))
            {
                idf = SCWS_EN_IDF(chunk[i+1]+1);
                SCWS_PUT_RES(pos, idf, chunk[i+1]+1, attr_en);
            }
            else
            {
                idf = SCWS_EN_IDF(chunk[i]);
                SCWS_PUT_RES(pos, idf, chunk[i], attr_en);
            }
        }
    }
}
|
336
|
+
|
337
|
+
/*
 * Segment a run of single-byte characters, txt[s->off .. end), and append
 * the words/symbols found to the result list.
 *
 * Order of checks:
 *   1. the whole run, upper-cased, is a "special" word in the rule set
 *      -> one "nz" result;
 *   2. the whole run is an abbreviation such as S.H.E or M.R.
 *      -> one "nz" result;
 *   3. otherwise the run is scanned for alphanumeric words ("en") and single
 *      symbols ("un", skipped when SCWS_IGN_SYMBOL is set).
 *
 * NOTE(review): several look-ahead tests read txt[start+1], which can be the
 * byte at `end`; this presumably relies on the text buffer being readable
 * one byte past the run (e.g. NUL-terminated) -- confirm with the callers.
 */
static void _scws_ssegment(scws_t s, int end)
{
    int start, wlen, ch, pflag, ipflag = 0;
    unsigned char *txt;
    float idf;

    start = s->off;
    wlen = end - start;

    /* check special words (need strtoupper) */
    if (wlen > 1)
    {
        char *upper = (char *) _mem_ndup((const char *) s->txt + start, wlen);

        /* when the copy cannot be allocated, skip this check and fall
         * through to the ordinary scan */
        if (upper != NULL)
        {
            int special;

            _str_toupper(upper, upper);
            special = SCWS_IS_SPECIAL(upper, wlen) ? 1 : 0;
            free(upper);
            if (special)
            {
                SCWS_PUT_RES(start, 9.5, wlen, "nz");
                return;
            }
        }
    }

    txt = s->txt;
    /* check brief words such as S.H.E M.R. */
    if (SCWS_IS_ALPHA(txt[start]) && txt[start+1] == '.')
    {
        for (ch = start + 2; ch < end; ch++)
        {
            if (!SCWS_IS_ALPHA(txt[ch])) break;
            ch++;
            if (ch == end || txt[ch] != '.') break;
        }
        if (ch == end)
        {
            SCWS_PUT_RES(start, 7.5, wlen, "nz");
            return;
        }
    }

    /* Extract words and punctuation.  A number may contain one dot when the
     * dot is followed by a digit; a word may contain one apostrophe. */
    while (start < end)
    {
        ch = txt[start++];
        if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
            ipflag = 0;
        if (SCWS_IS_ALNUM(ch))
        {
            pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
            wlen = 1;
            while (start < end)
            {
                ch = txt[start];
                if (pflag & PFLAG_DIGIT)
                {
                    if (!SCWS_IS_DIGIT(ch))
                    {
                        // check percent % = 0x25
                        if (ch == 0x25 && !SCWS_IS_DIGIT(txt[start+1]))
                        {
                            start++;
                            wlen++;
                            break;
                        }
                        if (ipflag)
                            break;
                        // special for IP address or version number? (find out all digit + dot)
                        if (ch == 0x2e && (pflag & PFLAG_ADDSYM))
                        {
                            ipflag = 1;
                            /* rewind to just after the previous dot */
                            while(--wlen && txt[--start] != 0x2e);
                            pflag = 0;
                            break;
                        }
                        // wlen = 1
                        if (wlen == 1 && SCWS_IS_ALPHA(ch))
                        {
                            /* one digit followed by a letter: go on as a word */
                            pflag ^= PFLAG_DIGIT;
                            pflag |= PFLAG_ADDSYM;
                            continue;
                        }
                        // strict must add: !$this->_is_digit(ord($this->txt[$start+1])))
                        if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(txt[start+1])))
                            break;
                        pflag |= PFLAG_ADDSYM;
                    }
                }
                else
                {
                    /* hightman.110419: '-' (0x2d) and '_' (0x5f) may join
                     * when followed by a letter */
                    if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(txt[start+1]))
                        pflag |= PFLAG_ADDSYM;
                    else if (!SCWS_IS_ALPHA(ch))
                    {
                        if ((pflag & PFLAG_ADDSYM)
                            || !((ch == 0x27 && SCWS_IS_ALPHA(txt[start+1]))
                            || (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(txt[start+1]))))
                        {
                            break;
                        }
                        pflag |= PFLAG_ADDSYM;
                    }
                }
                start++;
                wlen++;
                if (wlen >= SCWS_MAX_EWLEN)
                    break;
            }
            idf = SCWS_EN_IDF(wlen);
            SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
            if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
                _scws_alnum_multi(s, start-wlen, wlen);
        }
        else if (!(s->mode & SCWS_IGN_SYMBOL))
        {
            SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
        }
    }
}
|
456
|
+
|
457
|
+
/* multibyte segment */
|
458
|
+
static int _scws_mget_word(scws_t s, int i, int j)
|
459
|
+
{
|
460
|
+
int r, k;
|
461
|
+
word_t item;
|
462
|
+
|
463
|
+
if (!(s->wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
|
464
|
+
return i;
|
465
|
+
|
466
|
+
for (r=i, k=i+1; k <= j; k++)
|
467
|
+
{
|
468
|
+
item = s->wmap[i][k];
|
469
|
+
if (item && (item->flag & SCWS_WORD_FULL))
|
470
|
+
{
|
471
|
+
r = k;
|
472
|
+
if (!(item->flag & SCWS_WORD_PART))
|
473
|
+
break;
|
474
|
+
}
|
475
|
+
}
|
476
|
+
return r;
|
477
|
+
}
|
478
|
+
|
479
|
+
/* Append the single character z (wmap[z][z] / zmap[z]) to the result list. */
static void _scws_put_zchar(scws_t s, int z)
{
    SCWS_PUT_RES(s->zmap[z].start, s->wmap[z][z]->idf, (s->zmap[z].end - s->zmap[z].start), s->wmap[z][z]->attr);
}

/*
 * Emit the word covering characters i..j (wmap[i][j]) plus, depending on the
 * engine mode, the extra results for duality and multi-segment output.
 */
static void _scws_mset_word(scws_t s, int i, int j)
{
    word_t item = s->wmap[i][j];

    /* hightman.070705: the item == NULL test guards against over-long words
     * (more than 255 characters) overflowing an unsigned char */
    if (item == NULL)
        return;
    /* drop non-english "un" items when symbols are ignored */
    if ((s->mode & SCWS_IGN_SYMBOL)
        && !SCWS_IS_ECHAR(item->flag) && !memcmp(item->attr, attr_un, 2))
        return;

    /* hightman.070701: automatic pairing (duality) of loose single characters */
    if (s->mode & SCWS_DUALITY)
    {
        int k = s->zis;

        if (i == j && !SCWS_IS_ECHAR(item->flag) && memcmp(item->attr, attr_un, 2))
        {
            /* remember this loose character; wait for a partner if none yet */
            s->zis = i;
            if (k < 0)
                return;

            i = (k & ~SCWS_ZIS_USED);
            if ((i != (j-1)) || (!(k & SCWS_ZIS_USED) && s->wend == i))
            {
                _scws_put_zchar(s, i);
                if (i != (j-1))
                    return;
            }
            s->zis |= SCWS_ZIS_USED;
        }
        else
        {
            /* flush a pending loose character before a regular word */
            if ((k >= 0) && (!(k & SCWS_ZIS_USED) || (j > i)))
            {
                k &= ~SCWS_ZIS_USED;
                _scws_put_zchar(s, k);
            }
            if (j > i)
                s->wend = j + 1;
            s->zis = -1;
        }
    }

    /* the word itself */
    SCWS_PUT_RES(s->zmap[i].start, item->idf, (s->zmap[j].end - s->zmap[i].start), item->attr);

    // hightman.070902: multi segment
    // step1: split to short words
    if ((j-i) > 1)
    {
        int n, k, m = i;

        if (s->mode & SCWS_MULTI_SHORT)
        {
            while (m < j)
            {
                k = m;
                // hightman.111223: multi short enhanced
                for (n = m + 1; n <= j; n++)
                {
                    // 3 chars at most
                    if ((n == j && m == i) || (n - m) > 2) break;
                    item = s->wmap[m][n];
                    if (!item) continue;
                    // first shortest or last longest word
                    if ((item->flag & SCWS_WORD_FULL) && (k == m || n == j))
                        k = n;
                    if (!(item->flag & SCWS_WORD_PART)) break;
                }
                // short word not found, stop to find, passed to next loop
                if (k == m)
                    break;

                // save the short word
                item = s->wmap[m][k];
                SCWS_PUT_RES(s->zmap[m].start, item->idf, (s->zmap[k].end - s->zmap[m].start), item->attr);
                // find the next word or go to prev for duality last word
                m = k + 1;
                if (m == j)
                {
                    m--;
                    break;
                }
            }
        }

        if (s->mode & SCWS_MULTI_DUALITY)
        {
            while (m < j)
            {
                if (SCWS_IS_ECHAR(s->wmap[m][m]->flag))
                {
                    _scws_put_zchar(s, m);
                    s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                }
                else if (SCWS_IS_ECHAR(s->wmap[m+1][m+1]->flag))
                {
                    if (m == i)
                    {
                        _scws_put_zchar(s, m);
                        s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                    }
                    m++;
                    _scws_put_zchar(s, m);
                    s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                }
                else
                {
                    /* pair of characters m and m+1 */
                    SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m+1].end - s->zmap[m].start), s->wmap[m][m]->attr);
                }
                m++;
                if (m == j && (SCWS_IS_ECHAR(s->wmap[m][m]->flag) || SCWS_IS_ECHAR(s->wmap[m-1][m-1]->flag)))
                {
                    _scws_put_zchar(s, m);
                    s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                }
            }
        }
    }

    // step2, split to single char
    if ((j > i) && (s->mode & (SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL)))
    {
        if ((j - i) == 1 && !s->wmap[i][j])
        {
            if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT)
                i++;
            else
                s->wmap[i][i]->flag |= SCWS_ZFLAG_PUT;
            s->wmap[j][j]->flag |= SCWS_ZFLAG_PUT;
        }

        /* i <= j holds on entry, so the body runs at least once */
        for (; i <= j; i++)
        {
            /* already emitted */
            if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT)
                continue;
            /* without ZALL only attributes starting with j, n or v are kept */
            if (!(s->mode & SCWS_MULTI_ZALL) && !strchr("jnv", s->wmap[i][i]->attr[0]))
                continue;
            _scws_put_zchar(s, i);
        }
    }
}
|
616
|
+
|
617
|
+
/*
 * Segment the zone of characters f..t (inclusive) into words.
 *
 * For every candidate word found in the zone a complete path through the
 * zone is built around it and weighted (tf of each word, times
 * (word length)^4 for multi-character words, times the rule attribute
 * ratios, divided by (x+sz-1)^5 where x is the number of path entries and
 * sz the number of single-character entries).  The path with the greatest
 * weight is emitted through _scws_mset_word().
 *
 * A path is an array of (word length - 1) bytes terminated by 0xff.
 * If a path buffer cannot be allocated the search stops and the best path
 * found so far (if any) is used.
 */
static void _scws_mseg_zone(scws_t s, int f, int t)
{
    unsigned char *mpath, *npath;   /* best path so far / scratch path */
    word_t **wmap;
    int x,i,j,m,n,j2,sz;
    double weight, nweight;
    char attr1[3];

    mpath = npath = NULL;
    weight = nweight = (double) 0.0;

    wmap = s->wmap;
    j2 = 0;
    for (x = i = f; i <= t; i++)
    {
        j = _scws_mget_word(s, i, (x > i ? x - 1 : t));
        if (j == i) continue;
        // skip NR in NR
        if (j < j2 && wmap[i][j]->attr[0] == 'n' && wmap[i][j]->attr[1] == 'r') continue;
        if (i > j2 && (wmap[i][j]->flag & SCWS_WORD_USED)) continue;

        /* one word only */
        if (i == f && j == t)
        {
            free(mpath);
            mpath = (unsigned char *) malloc(2);
            if (mpath != NULL)
            {
                mpath[0] = j - i;
                mpath[1] = 0xff;
            }
            break;
        }

        if (i != f && (wmap[i][j]->flag & SCWS_WORD_RULE))
            continue;

        /* create the new path */
        wmap[i][j]->flag |= SCWS_WORD_USED;
        nweight = (double) wmap[i][j]->tf * pow(j-i,4);

        if (npath == NULL)
        {
            npath = (unsigned char *) malloc(t-f+2);
            if (npath == NULL)
                break;
            memset(npath, 0xff, t-f+2);
        }

        /* lookfor backward */
        x = sz = 0;
        memset(attr1, 0, sizeof(attr1));
        for (m = f; m < i; m = n+1)
        {
            n = _scws_mget_word(s, m, i-1);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            if (attr1[0] != '\0')
                nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, 2);
        }

        /* my self */
        npath[x++] = j - i;

        if (attr1[0] != '\0')
            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[i][j]->attr, &npath[x-2]);
        memcpy(attr1, wmap[i][j]->attr, 2);

        /* lookfor forward */
        for (m = j+1; m <= t; m = n+1)
        {
            n = _scws_mget_word(s, m, t);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, 2);
        }

        npath[x] = 0xff;
        nweight /= pow(x+sz-1,5);

        /* draw the path for debug */
#ifdef DEBUG
        if (s->mode & SCWS_DEBUG)
        {
            fprintf(stderr, "PATH by keyword = %.*s, (weight=%.4f):\n",
                s->zmap[j].end - s->zmap[i].start, s->txt + s->zmap[i].start, nweight);
            for (x = 0, m = f; (n = npath[x]) != 0xff; x++)
            {
                n += m;
                fprintf(stderr, "%.*s ", s->zmap[n].end - s->zmap[m].start, s->txt + s->zmap[m].start);
                m = n + 1;
            }
            fprintf(stderr, "\n--\n");
        }
#endif

        j2 = x = j;
        /* a word of 3+ characters: revisit the same start to try the
         * shorter words hidden inside it */
        if ((x - i) > 1) i--;
        /* check better path */
        if (nweight > weight)
        {
            unsigned char *swap;

            weight = nweight;
            swap = mpath;
            mpath = npath;
            npath = swap;
        }
    }

    /* set the result when a best path exists */
    if (mpath != NULL)
    {
        for (x = 0, m = f; (n = mpath[x]) != 0xff; x++)
        {
            n += m;
            _scws_mset_word(s, m, n);
            m = n + 1;
        }
    }

    /* 070808: memory leak fixed.  Both buffers are released on every exit,
     * including when no path won and only the scratch path was allocated. */
    free(mpath);
    free(npath);
}
|
752
|
+
|
753
|
+
/* quick define for zrule_checker in loop */
|
754
|
+
#define ___ZRULE_CHECKER1___ \
|
755
|
+
if (j >= zlen || SCWS_NO_RULE2(wmap[j][j]->flag)) \
|
756
|
+
break;
|
757
|
+
|
758
|
+
#define ___ZRULE_CHECKER2___ \
|
759
|
+
if (j < 0 || SCWS_NO_RULE2(wmap[j][j]->flag)) \
|
760
|
+
break;
|
761
|
+
|
762
|
+
#define ___ZRULE_CHECKER3___ \
|
763
|
+
if (!scws_rule_check(s->r, r1, txt + zmap[j].start, zmap[j].end - zmap[j].start)) \
|
764
|
+
break;
|
765
|
+
|
766
|
+
static void _scws_msegment(scws_t s, int end, int zlen)
|
767
|
+
{
|
768
|
+
word_t **wmap, query;
|
769
|
+
struct scws_zchar *zmap;
|
770
|
+
unsigned char *txt;
|
771
|
+
#ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just deepend on dictionary */
|
772
|
+
rule_item_t r1;
|
773
|
+
#endif
|
774
|
+
int i, j, k, ch, clen, start;
|
775
|
+
pool_t p;
|
776
|
+
|
777
|
+
/* pool used to management some dynamic memory */
|
778
|
+
p = pool_new();
|
779
|
+
|
780
|
+
/* create wmap & zmap */
|
781
|
+
wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t));
|
782
|
+
zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar));
|
783
|
+
txt = s->txt;
|
784
|
+
start = s->off;
|
785
|
+
s->zis = -1;
|
786
|
+
|
787
|
+
for (i = 0; start < end; i++)
|
788
|
+
{
|
789
|
+
ch = txt[start];
|
790
|
+
clen = SCWS_CHARLEN(ch);
|
791
|
+
if (clen == 1)
|
792
|
+
{
|
793
|
+
while (start++ < end)
|
794
|
+
{
|
795
|
+
ch = txt[start];
|
796
|
+
if (start == end || SCWS_CHARLEN(txt[start]) > 1)
|
797
|
+
break;
|
798
|
+
clen++;
|
799
|
+
}
|
800
|
+
wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
|
801
|
+
wmap[i][i]->tf = 0.5;
|
802
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
|
803
|
+
strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un);
|
804
|
+
}
|
805
|
+
else
|
806
|
+
{
|
807
|
+
query = xdict_query(s->d, txt + start, clen);
|
808
|
+
wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
|
809
|
+
if (query == NULL)
|
810
|
+
{
|
811
|
+
wmap[i][i]->tf = 0.5;
|
812
|
+
wmap[i][i]->idf = 0.0;
|
813
|
+
wmap[i][i]->flag = 0;
|
814
|
+
strcpy(wmap[i][i]->attr, attr_un);
|
815
|
+
}
|
816
|
+
else
|
817
|
+
{
|
818
|
+
ch = query->flag;
|
819
|
+
query->flag = SCWS_WORD_FULL;
|
820
|
+
memcpy(wmap[i][i], query, sizeof(word_st));
|
821
|
+
if (query->attr[0] == '#')
|
822
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;
|
823
|
+
|
824
|
+
if (ch & SCWS_WORD_MALLOCED)
|
825
|
+
free(query);
|
826
|
+
}
|
827
|
+
start += clen;
|
828
|
+
}
|
829
|
+
|
830
|
+
zmap[i].start = start - clen;
|
831
|
+
zmap[i].end = start;
|
832
|
+
}
|
833
|
+
|
834
|
+
/* fixed real zlength */
|
835
|
+
zlen = i;
|
836
|
+
|
837
|
+
/* create word query table */
|
838
|
+
for (i = 0; i < zlen; i++)
|
839
|
+
{
|
840
|
+
k = 0;
|
841
|
+
for (j = i+1; j < zlen; j++)
|
842
|
+
{
|
843
|
+
query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
|
844
|
+
if (query == NULL)
|
845
|
+
break;
|
846
|
+
ch = query->flag;
|
847
|
+
if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2))
|
848
|
+
{
|
849
|
+
wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
|
850
|
+
memcpy(wmap[i][j], query, sizeof(word_st));
|
851
|
+
|
852
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
|
853
|
+
|
854
|
+
for (k = i+1; k <= j; k++)
|
855
|
+
wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
|
856
|
+
}
|
857
|
+
|
858
|
+
if (ch & SCWS_WORD_MALLOCED)
|
859
|
+
free(query);
|
860
|
+
|
861
|
+
if (!(ch & SCWS_WORD_PART))
|
862
|
+
break;
|
863
|
+
}
|
864
|
+
|
865
|
+
if (k--)
|
866
|
+
{
|
867
|
+
/* set nr2 to some short name */
|
868
|
+
if ((k == (i+1)))
|
869
|
+
{
|
870
|
+
if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
|
871
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
|
872
|
+
//if (wmap[i][k]->attr[0] == 'n')
|
873
|
+
//wmap[i][i]->flag |= SCWS_ZFLAG_N2;
|
874
|
+
}
|
875
|
+
|
876
|
+
/* clean the PART flag for the last word */
|
877
|
+
if (k < j)
|
878
|
+
wmap[i][k]->flag ^= SCWS_WORD_PART;
|
879
|
+
}
|
880
|
+
}
|
881
|
+
|
882
|
+
if (s->r == NULL)
|
883
|
+
goto do_segment;
|
884
|
+
|
885
|
+
#ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just deepend on dictionary */
|
886
|
+
/* auto rule set for name & zone & chinese numeric */
|
887
|
+
|
888
|
+
/* one word auto rule check */
|
889
|
+
for (i = 0; i < zlen; i++)
|
890
|
+
{
|
891
|
+
if (SCWS_NO_RULE1(wmap[i][i]->flag))
|
892
|
+
continue;
|
893
|
+
|
894
|
+
r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
|
895
|
+
if (r1 == NULL)
|
896
|
+
continue;
|
897
|
+
|
898
|
+
clen = r1->zmin > 0 ? r1->zmin : 1;
|
899
|
+
if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
|
900
|
+
{
|
901
|
+
/* prefix, check after (zmin~zmax) */
|
902
|
+
// 先检查 zmin 字内是否全部符合要求
|
903
|
+
// 再在 zmax 范围内取得符合要求的字
|
904
|
+
// int i, j, k, ch, clen, start;
|
905
|
+
for (ch = 1; ch <= clen; ch++)
|
906
|
+
{
|
907
|
+
j = i + ch;
|
908
|
+
___ZRULE_CHECKER1___
|
909
|
+
___ZRULE_CHECKER3___
|
910
|
+
}
|
911
|
+
|
912
|
+
if (ch <= clen)
|
913
|
+
continue;
|
914
|
+
|
915
|
+
/* no limit znum or limit to a range */
|
916
|
+
j = i + ch;
|
917
|
+
while (1)
|
918
|
+
{
|
919
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
920
|
+
break;
|
921
|
+
___ZRULE_CHECKER1___
|
922
|
+
___ZRULE_CHECKER3___
|
923
|
+
clen++;
|
924
|
+
j++;
|
925
|
+
}
|
926
|
+
|
927
|
+
// 注意原来2字人名,识别后仍为2字的情况
|
928
|
+
if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
|
929
|
+
{
|
930
|
+
if (clen == 1)
|
931
|
+
continue;
|
932
|
+
wmap[i][i+1]->flag |= SCWS_WORD_PART;
|
933
|
+
}
|
934
|
+
|
935
|
+
/* ok, got: i & clen */
|
936
|
+
k = i + clen;
|
937
|
+
wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
|
938
|
+
wmap[i][k]->tf = r1->tf;
|
939
|
+
wmap[i][k]->idf = r1->idf;
|
940
|
+
wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
|
941
|
+
strncpy(wmap[i][k]->attr, r1->attr, 2);
|
942
|
+
|
943
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
|
944
|
+
for (j = i+1; j <= k; j++)
|
945
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
946
|
+
|
947
|
+
if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
|
948
|
+
i = k;
|
949
|
+
|
950
|
+
continue;
|
951
|
+
}
|
952
|
+
|
953
|
+
if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
|
954
|
+
{
|
955
|
+
/* suffix, check before */
|
956
|
+
for (ch = 1; ch <= clen; ch++)
|
957
|
+
{
|
958
|
+
j = i - ch;
|
959
|
+
___ZRULE_CHECKER2___
|
960
|
+
___ZRULE_CHECKER3___
|
961
|
+
}
|
962
|
+
|
963
|
+
if (ch <= clen)
|
964
|
+
continue;
|
965
|
+
|
966
|
+
/* no limit znum or limit to a range */
|
967
|
+
j = i - ch;
|
968
|
+
while (1)
|
969
|
+
{
|
970
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
971
|
+
break;
|
972
|
+
___ZRULE_CHECKER2___
|
973
|
+
___ZRULE_CHECKER3___
|
974
|
+
clen++;
|
975
|
+
j--;
|
976
|
+
}
|
977
|
+
|
978
|
+
/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
|
979
|
+
k = i - clen;
|
980
|
+
if (wmap[k][i] != NULL)
|
981
|
+
continue;
|
982
|
+
|
983
|
+
wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
|
984
|
+
wmap[k][i]->tf = r1->tf;
|
985
|
+
wmap[k][i]->idf = r1->idf;
|
986
|
+
wmap[k][i]->flag = SCWS_WORD_FULL;
|
987
|
+
strncpy(wmap[k][i]->attr, r1->attr, 2);
|
988
|
+
|
989
|
+
wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
|
990
|
+
for (j = k+1; j <= i; j++)
|
991
|
+
{
|
992
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
993
|
+
if ((j != i) && (wmap[k][j] != NULL))
|
994
|
+
wmap[k][j]->flag |= SCWS_WORD_PART;
|
995
|
+
}
|
996
|
+
continue;
|
997
|
+
}
|
998
|
+
}
|
999
|
+
|
1000
|
+
/* two words auto rule check (欧阳** , **西路) */
|
1001
|
+
for (i = zlen - 2; i >= 0; i--)
|
1002
|
+
{
|
1003
|
+
/* with value ==> must be have SCWS_WORD_FULL, so needn't check it ag. */
|
1004
|
+
if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
|
1005
|
+
continue;
|
1006
|
+
|
1007
|
+
k = i+1;
|
1008
|
+
r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
|
1009
|
+
if (r1 == NULL)
|
1010
|
+
continue;
|
1011
|
+
|
1012
|
+
clen = r1->zmin > 0 ? r1->zmin : 1;
|
1013
|
+
if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
|
1014
|
+
{
|
1015
|
+
for (ch = 1; ch <= clen; ch++)
|
1016
|
+
{
|
1017
|
+
j = k + ch;
|
1018
|
+
___ZRULE_CHECKER1___
|
1019
|
+
___ZRULE_CHECKER3___
|
1020
|
+
}
|
1021
|
+
|
1022
|
+
if (ch <= clen)
|
1023
|
+
continue;
|
1024
|
+
|
1025
|
+
/* no limit znum or limit to a range */
|
1026
|
+
j = k + ch;
|
1027
|
+
while (1)
|
1028
|
+
{
|
1029
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
1030
|
+
break;
|
1031
|
+
___ZRULE_CHECKER1___
|
1032
|
+
___ZRULE_CHECKER3___
|
1033
|
+
clen++;
|
1034
|
+
j++;
|
1035
|
+
}
|
1036
|
+
|
1037
|
+
/* ok, got: i & clen */
|
1038
|
+
k = k + clen;
|
1039
|
+
wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
|
1040
|
+
wmap[i][k]->tf = r1->tf;
|
1041
|
+
wmap[i][k]->idf = r1->idf;
|
1042
|
+
wmap[i][k]->flag = SCWS_WORD_FULL;
|
1043
|
+
strncpy(wmap[i][k]->attr, r1->attr, 2);
|
1044
|
+
|
1045
|
+
wmap[i][i+1]->flag |= SCWS_WORD_PART;
|
1046
|
+
for (j = i+2; j <= k; j++)
|
1047
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
1048
|
+
|
1049
|
+
i--;
|
1050
|
+
continue;
|
1051
|
+
}
|
1052
|
+
|
1053
|
+
if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
|
1054
|
+
{
|
1055
|
+
/* suffix, check before */
|
1056
|
+
for (ch = 1; ch <= clen; ch++)
|
1057
|
+
{
|
1058
|
+
j = i - ch;
|
1059
|
+
___ZRULE_CHECKER2___
|
1060
|
+
___ZRULE_CHECKER3___
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
if (ch <= clen)
|
1064
|
+
continue;
|
1065
|
+
|
1066
|
+
/* no limit znum or limit to a range */
|
1067
|
+
j = i - ch;
|
1068
|
+
while (1)
|
1069
|
+
{
|
1070
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
1071
|
+
break;
|
1072
|
+
___ZRULE_CHECKER2___
|
1073
|
+
___ZRULE_CHECKER3___
|
1074
|
+
clen++;
|
1075
|
+
j--;
|
1076
|
+
}
|
1077
|
+
|
1078
|
+
/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
|
1079
|
+
k = i - clen;
|
1080
|
+
i = i + 1;
|
1081
|
+
wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
|
1082
|
+
wmap[k][i]->tf = r1->tf;
|
1083
|
+
wmap[k][i]->idf = r1->idf;
|
1084
|
+
wmap[k][i]->flag = SCWS_WORD_FULL;
|
1085
|
+
strncpy(wmap[k][i]->attr, r1->attr, 2);
|
1086
|
+
|
1087
|
+
wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
|
1088
|
+
for (j = k+1; j <= i; j++)
|
1089
|
+
{
|
1090
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
1091
|
+
if (wmap[k][j] != NULL)
|
1092
|
+
wmap[k][j]->flag |= SCWS_WORD_PART;
|
1093
|
+
}
|
1094
|
+
|
1095
|
+
i -= (clen+1);
|
1096
|
+
continue;
|
1097
|
+
}
|
1098
|
+
}
|
1099
|
+
#endif
|
1100
|
+
|
1101
|
+
/* real do the segment */
|
1102
|
+
do_segment:
|
1103
|
+
|
1104
|
+
/* find the easy break point */
|
1105
|
+
for (i = 0, j = 0; i < zlen; i++)
|
1106
|
+
{
|
1107
|
+
if (wmap[i][i]->flag & SCWS_ZFLAG_WPART)
|
1108
|
+
continue;
|
1109
|
+
|
1110
|
+
if (i > j)
|
1111
|
+
_scws_mseg_zone(s, j, i-1);
|
1112
|
+
|
1113
|
+
j = i;
|
1114
|
+
if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
|
1115
|
+
{
|
1116
|
+
_scws_mset_word(s, i, i);
|
1117
|
+
j++;
|
1118
|
+
}
|
1119
|
+
}
|
1120
|
+
|
1121
|
+
/* the lastest zone */
|
1122
|
+
if (i > j)
|
1123
|
+
_scws_mseg_zone(s, j, i-1);
|
1124
|
+
|
1125
|
+
/* the last single for duality */
|
1126
|
+
if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))
|
1127
|
+
{
|
1128
|
+
i = s->zis;
|
1129
|
+
SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
|
1130
|
+
}
|
1131
|
+
|
1132
|
+
/* free the wmap & zmap */
|
1133
|
+
pool_free(p);
|
1134
|
+
darray_free((void **) wmap);
|
1135
|
+
}
|
1136
|
+
|
1137
|
+
/*
 * Fetch the next group of segmented results from the text bound to `s`.
 *
 * Scanning starts at s->off and s->off is advanced past whatever was
 * consumed, so the function is meant to be called repeatedly.
 *
 *  - bytes <= 0x20 are skipped, except CR / LF which are returned as a
 *    one-byte result tagged attr_un;
 *  - a token character (SCWS_CHAR_TOKEN) is returned as a one-byte result
 *    tagged attr_un unless SCWS_IGN_SYMBOL is set in s->mode;
 *  - otherwise a zone of characters is collected and handed to
 *    _scws_msegment (zone contains multibyte chars), _scws_ssegment, or
 *    emitted as an alnum word (optionally split by _scws_alnum_multi).
 *
 * Returns the head of the result chain (s->res0), or NULL when the text is
 * exhausted.  The chain is released with scws_free_result().
 *
 * When a zone produces no result the scan simply restarts from the new
 * offset; this is done with a loop so that long runs of ignorable input
 * cannot grow the call stack.
 */
scws_res_t scws_get_result(scws_t s)
{
	int off, len, ch, clen, zlen, pflag;
	unsigned char *txt;

	for (;;)
	{
		off = s->off;
		len = s->len;
		txt = s->txt;
		s->res0 = s->res1 = NULL;

		/* skip leading blanks; a line break is reported on its own */
		while ((off < len) && (txt[off] <= 0x20))
		{
			if (txt[off] == 0x0a || txt[off] == 0x0d)
			{
				s->off = off + 1;
				SCWS_PUT_RES(off, 0.0, 1, attr_un);
				return s->res0;
			}
			off++;
		}

		if (off >= len)
			return NULL;

		/* try to parse the sentence */
		s->off = off;
		ch = txt[off];
		if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
		{
			s->off++;
			SCWS_PUT_RES(off, 0.0, 1, attr_un);
			return s->res0;
		}
		clen = SCWS_CHARLEN(ch);
		zlen = 1;
		pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));
		while ((off = (off+clen)) < len)
		{
			ch = txt[off];
			if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch)) break;
			clen = SCWS_CHARLEN(ch);
			if (!(pflag & PFLAG_WITH_MB))
			{
				/* pure single-byte -> multibyte (2bytes) */
				if (clen == 1)
				{
					if (pflag & PFLAG_ALNUM)
					{
						if (SCWS_IS_ALPHA(ch))
						{
							if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
								pflag |= PFLAG_LONGALPHA;
						}
						else if (SCWS_IS_DIGIT(ch))
						{
							if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
								pflag |= PFLAG_LONGDIGIT;
						}
						else
							pflag ^= PFLAG_ALNUM;
					}
				}
				else
				{
					if (!(pflag & PFLAG_ALNUM) || zlen > 2)
						break;

					pflag |= PFLAG_WITH_MB;
					/* zlen = 1; */
				}
			}
			else if ((pflag & PFLAG_WITH_MB) && clen == 1)
			{
				int i;

				/* mb + single-byte. allowed: alpha+num + Chinese */
				if (!SCWS_IS_ALNUM(ch))
					break;

				pflag &= ~PFLAG_VALID;
				/*
				 * At most 2 alnum characters are accepted when sandwiched
				 * between Chinese characters (longer runs can stand alone
				 * as a word without any problem).
				 */
				for (i = off+1; i < (off+3); i++)
				{
					/* test the bound first: txt[i] must not be read past len */
					if (i >= len)
					{
						pflag |= PFLAG_VALID;
						break;
					}

					ch = txt[i];
					if ((ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
					{
						pflag |= PFLAG_VALID;
						break;
					}

					if (!SCWS_IS_ALNUM(ch))
						break;
				}

				if (!(pflag & PFLAG_VALID))
					break;

				clen += (i - off - 1);
			}
			/* hightman.070813: add max zlen limit */
			if (++zlen >= SCWS_MAX_ZLEN)
				break;
		}

		/*
		 * hightman.070624: handle a truncated (half) multibyte character.
		 * `ch` is reused to remember the unadjusted offset for the final
		 * s->off update below.
		 */
		if ((ch = off) > len)
			off -= clen;

		/* do the real segment */
		if (off <= s->off)
			return NULL;
		else if (pflag & PFLAG_WITH_MB)
			_scws_msegment(s, off, zlen);
		else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
			_scws_ssegment(s, off);
		else
		{
			zlen = off - s->off;
			if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
				_scws_alnum_multi(s, s->off, zlen);
			else
			{
				float idf;

				idf = SCWS_EN_IDF(zlen);
				SCWS_PUT_RES(s->off, idf, zlen, attr_en);

				/*
				 * hightman.090523: split mixed letter/digit words once more:
				 * pure digits, (>1 ? pure letters : digits+letters)
				 */
				if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
					_scws_alnum_multi(s, s->off, zlen);
			}
		}

		/* return the result, or rescan from the new offset if there is none */
		s->off = (ch > len ? len : off);
		if (s->res0 != NULL)
			return s->res0;
	}
}
|
1276
|
+
|
1277
|
+
/* free the result retunned by scws_get_result */
|
1278
|
+
void scws_free_result(scws_res_t result)
|
1279
|
+
{
|
1280
|
+
scws_res_t cur;
|
1281
|
+
|
1282
|
+
while ((cur = result) != NULL)
|
1283
|
+
{
|
1284
|
+
result = cur->next;
|
1285
|
+
free(cur);
|
1286
|
+
}
|
1287
|
+
}
|
1288
|
+
|
1289
|
+
/* top words count */
|
1290
|
+
// xattr = ~v,p,c
|
1291
|
+
// xattr = v,pn,c
|
1292
|
+
|
1293
|
+
static int _tops_cmp(a, b)
|
1294
|
+
scws_top_t *a,*b;
|
1295
|
+
{
|
1296
|
+
if ((*b)->weight > (*a)->weight)
|
1297
|
+
return 1;
|
1298
|
+
return -1;
|
1299
|
+
}
|
1300
|
+
|
1301
|
+
/*
 * Store the value of `node` and of every node below it into `values`,
 * visiting the node itself first, then its left and right subtrees.
 * Each stored entry gets its `word` pointed at the node key.
 * `*start` is the next free slot in `values` and is advanced per entry.
 */
static void _tops_load_node(node_t node, scws_top_t *values, int *start)
{
	scws_top_t entry;

	if (node == NULL)
		return;

	entry = node->value;
	values[*start] = entry;
	entry->word = node->key;
	*start += 1;

	_tops_load_node(node->left, values, start);
	_tops_load_node(node->right, values, start);
}
|
1315
|
+
|
1316
|
+
/*
 * Flatten every tree of `xt` (xt->trees[0 .. xt->prime-1]) into `values`,
 * filling slots from index 0 upwards.
 */
static void _tops_load_all(xtree_t xt, scws_top_t *values)
{
	int slot = 0;
	int filled = 0;

	while (slot < xt->prime)
	{
		_tops_load_node(xt->trees[slot], values, &filled);
		slot++;
	}
}
|
1323
|
+
|
1324
|
+
/* one attribute name; a list of them ends with an entry whose first byte is 0 */
typedef char word_attr[4];

/*
 * Tell whether attribute string `a` appears in the list `at`.
 * Returns 1 on a match, and also 1 when the list is empty (its very first
 * entry is the terminator); returns 0 otherwise.
 */
static inline int _attr_belong(const char *a, word_attr *at)
{
	word_attr *entry = at;

	/* an empty list accepts everything */
	if (entry[0][0] == '\0')
		return 1;

	for (; (*entry)[0] != '\0'; entry++)
	{
		if (strcmp(a, *entry) == 0)
			return 1;
	}

	return 0;
}
|
1335
|
+
|
1336
|
+
/*
 * macro to parse xattr -> xmode, at
 *
 * Expects these locals in the expanding function:
 *   char *xattr      comma separated attribute list, may be NULL;
 *                    a leading '~' switches xmode to SCWS_YEA
 *   int xmode, cnt   xmode as above, cnt is scratch
 *   char *word       scratch
 *   word_attr *at    receives a calloc()ed table (caller frees it)
 *
 * Each item contributes its first two characters at most.  Empty items
 * (",," or a leading / trailing ',') are skipped.  The table is sized from
 * the number of commas, so it always has room for every item plus the
 * zeroed terminator entry.  `at` stays NULL when xattr is NULL or empty,
 * or when the allocation fails (no filter is applied in that case).
 */
#define	__PARSE_XATTR__		do {											\
	if (xattr == NULL) break;												\
	if (*xattr == '~') { xattr++; xmode = SCWS_YEA; }						\
	if (*xattr == '\0') break;												\
	for (cnt = 2, word = xattr; (word = strchr(word, ',')) != NULL; word++)	\
		cnt++;																\
	at = (word_attr *) calloc(cnt, sizeof(word_attr));						\
	if (at == NULL) break;													\
	for (cnt = 0; (word = strchr(xattr, ',')); xattr = word + 1) {			\
		if (xattr == word) continue;										\
		at[cnt][0] = xattr[0];												\
		at[cnt][1] = (xattr + 1 == word) ? '\0' : xattr[1];					\
		cnt++;																\
	}																		\
	strncpy(at[cnt], xattr, 2);												\
} while (0)
|
1352
|
+
|
1353
|
+
/*
 * Build the list of top words for the text bound to `s`.
 *
 * The whole text is re-segmented from offset 0 (s->off is saved and
 * restored).  Results with idf < 0.2 or whose attr starts with '#' are
 * ignored, as are long (> 6 bytes) attr_en words rejected by
 * SCWS_IS_NOSTATS.  For every distinct word the idf values are summed into
 * `weight` and occurrences counted in `times`.
 *
 * limit: maximum number of entries returned; <= 0 means no limit.
 * xattr: optional comma separated attribute filter.  Only words carrying a
 *        listed attribute are kept; with a leading '~' the listed
 *        attributes are excluded instead.  NULL or "" disables the filter.
 *
 * Returns a newly allocated chain ordered by descending weight (each entry
 * owns a strdup()ed word; release with scws_free_tops()), or NULL when
 * nothing qualified or on failure.  If memory runs out while the chain is
 * being copied, the entries built so far are returned.
 */
scws_top_t scws_get_tops(scws_t s, int limit, char *xattr)
{
	int off, cnt, xmode = SCWS_NA;
	xtree_t xt;
	scws_res_t res, cur;
	scws_top_t top, *list, tail, base;
	char *word;
	word_attr *at = NULL;

	if (!s || !s->txt || !(xt = xtree_new(0,1)))
		return NULL;

	__PARSE_XATTR__;

	/* save the offset. */
	off = s->off;
	s->off = cnt = 0;
	while ((cur = res = scws_get_result(s)) != NULL)
	{
		do
		{
			if (cur->idf < 0.2 || cur->attr[0] == '#')
				continue;

			/* check attribute filter */
			if (at != NULL)
			{
				if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
					continue;

				if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
					continue;
			}

			/* check stopwords */
			if (!strncmp(cur->attr, attr_en, 2) && cur->len > 6)
			{
				int skip;

				word = _mem_ndup(s->txt + cur->off, cur->len);
				/* without a lowercase copy the stopword test is skipped */
				if (word != NULL)
				{
					_str_tolower(word, word);
					skip = SCWS_IS_NOSTATS(word, cur->len) ? 1 : 0;
					free(word);
					if (skip)
						continue;
				}
			}

			/* put to the stats */
			if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
			{
				top = (scws_top_t) pmalloc_z(xt->p, sizeof(struct scws_topword));
				if (top == NULL)
					continue;
				top->weight = cur->idf;
				top->times = 1;
				strncpy(top->attr, cur->attr, 2);
				xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
				cnt++;
			}
			else
			{
				top->weight += cur->idf;
				top->times++;
			}
		}
		while ((cur = cur->next) != NULL);
		scws_free_result(res);
	}

	/* free at */
	if (at != NULL)
		free(at);

	top = NULL;
	list = NULL;
	if (cnt > 0)
		list = (scws_top_t *) malloc(sizeof(scws_top_t) * cnt);

	if (list != NULL)
	{
		/* sort the list */
		_tops_load_all(xt, list);
		qsort(list, cnt, sizeof(scws_top_t), _tops_cmp);

		/* save to return pointer */
		if (limit <= 0 || limit > cnt)
			limit = cnt;

		/* the tree (and its pool) is freed below, so copy what is returned */
		tail = NULL;
		for (cnt = 0; cnt < limit; cnt++)
		{
			base = (scws_top_t) malloc(sizeof(struct scws_topword));
			if (base == NULL)
				break;

			memcpy(base, list[cnt], sizeof(struct scws_topword));
			base->next = NULL;
			base->word = strdup(list[cnt]->word);
			if (base->word == NULL)
			{
				free(base);
				break;
			}

			if (tail == NULL)
				top = base;
			else
				tail->next = base;
			tail = base;
		}
		free(list);
	}

	/* restore the offset */
	s->off = off;
	xtree_free(xt);
	return top;
}
|
1457
|
+
|
1458
|
+
// word check by attr.
|
1459
|
+
int scws_has_word(scws_t s, char *xattr)
|
1460
|
+
{
|
1461
|
+
int off, cnt, xmode = SCWS_NA;
|
1462
|
+
scws_res_t res, cur;
|
1463
|
+
char *word;
|
1464
|
+
word_attr *at = NULL;
|
1465
|
+
|
1466
|
+
if (!s || !s->txt)
|
1467
|
+
return 0;
|
1468
|
+
|
1469
|
+
__PARSE_XATTR__;
|
1470
|
+
|
1471
|
+
// save the offset. (cnt -> return_value)
|
1472
|
+
off = s->off;
|
1473
|
+
cnt = s->off = 0;
|
1474
|
+
while (!cnt && (cur = res = scws_get_result(s)) != NULL)
|
1475
|
+
{
|
1476
|
+
do
|
1477
|
+
{
|
1478
|
+
/* check attribute filter */
|
1479
|
+
if (at != NULL)
|
1480
|
+
{
|
1481
|
+
if ((xmode == SCWS_NA) && _attr_belong(cur->attr, at))
|
1482
|
+
cnt = 1;
|
1483
|
+
|
1484
|
+
if ((xmode == SCWS_YEA) && !_attr_belong(cur->attr, at))
|
1485
|
+
cnt = 1;
|
1486
|
+
}
|
1487
|
+
}
|
1488
|
+
while (!cnt && (cur = cur->next) != NULL);
|
1489
|
+
scws_free_result(res);
|
1490
|
+
}
|
1491
|
+
// memory leak fixed, thanks to lauxinz
|
1492
|
+
if (at != NULL)
|
1493
|
+
free(at);
|
1494
|
+
s->off = off;
|
1495
|
+
return cnt;
|
1496
|
+
}
|
1497
|
+
|
1498
|
+
// get words by attr (rand order)
|
1499
|
+
scws_top_t scws_get_words(scws_t s, char *xattr)
|
1500
|
+
{
|
1501
|
+
int off, cnt, xmode = SCWS_NA;
|
1502
|
+
xtree_t xt;
|
1503
|
+
scws_res_t res, cur;
|
1504
|
+
scws_top_t top, tail, base;
|
1505
|
+
char *word;
|
1506
|
+
word_attr *at = NULL;
|
1507
|
+
|
1508
|
+
if (!s || !s->txt || !(xt = xtree_new(0,1)))
|
1509
|
+
return NULL;
|
1510
|
+
|
1511
|
+
__PARSE_XATTR__;
|
1512
|
+
|
1513
|
+
// save the offset.
|
1514
|
+
off = s->off;
|
1515
|
+
s->off = 0;
|
1516
|
+
base = tail = NULL;
|
1517
|
+
while ((cur = res = scws_get_result(s)) != NULL)
|
1518
|
+
{
|
1519
|
+
do
|
1520
|
+
{
|
1521
|
+
/* check attribute filter */
|
1522
|
+
if (at != NULL)
|
1523
|
+
{
|
1524
|
+
if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
|
1525
|
+
continue;
|
1526
|
+
|
1527
|
+
if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
|
1528
|
+
continue;
|
1529
|
+
}
|
1530
|
+
|
1531
|
+
/* put to the stats */
|
1532
|
+
if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
|
1533
|
+
{
|
1534
|
+
top = (scws_top_t) malloc(sizeof(struct scws_topword));
|
1535
|
+
top->weight = cur->idf;
|
1536
|
+
top->times = 1;
|
1537
|
+
top->next = NULL;
|
1538
|
+
top->word = (char *)_mem_ndup(s->txt + cur->off, cur->len);
|
1539
|
+
strncpy(top->attr, cur->attr, 2);
|
1540
|
+
// add to the chain
|
1541
|
+
if (tail == NULL)
|
1542
|
+
base = tail = top;
|
1543
|
+
else
|
1544
|
+
{
|
1545
|
+
tail->next = top;
|
1546
|
+
tail = top;
|
1547
|
+
}
|
1548
|
+
xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
|
1549
|
+
}
|
1550
|
+
else
|
1551
|
+
{
|
1552
|
+
top->weight += cur->idf;
|
1553
|
+
top->times++;
|
1554
|
+
}
|
1555
|
+
}
|
1556
|
+
while ((cur = cur->next) != NULL);
|
1557
|
+
scws_free_result(res);
|
1558
|
+
}
|
1559
|
+
|
1560
|
+
// free at & xtree
|
1561
|
+
if (at != NULL)
|
1562
|
+
free(at);
|
1563
|
+
xtree_free(xt);
|
1564
|
+
|
1565
|
+
// restore the offset
|
1566
|
+
s->off = off;
|
1567
|
+
return base;
|
1568
|
+
}
|
1569
|
+
|
1570
|
+
/*
 * Release a chain of top words: every node together with the word string
 * it owns.
 */
void scws_free_tops(scws_top_t tops)
{
	scws_top_t next;

	for (; tops != NULL; tops = next)
	{
		next = tops->next;
		free(tops->word);	/* free(NULL) is a no-op */
		free(tops);
	}
}
|