scws4r 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +13 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +56 -0
- data/Rakefile +20 -0
- data/defaults/dict.utf8.xdb +0 -0
- data/defaults/rules.utf8.ini +291 -0
- data/ext/scws4r/Makefile +267 -0
- data/ext/scws4r/Makefile.am +15 -0
- data/ext/scws4r/charset.c +90 -0
- data/ext/scws4r/charset.h +14 -0
- data/ext/scws4r/config_win32.h +22 -0
- data/ext/scws4r/crc32.c +103 -0
- data/ext/scws4r/crc32.h +13 -0
- data/ext/scws4r/darray.c +35 -0
- data/ext/scws4r/darray.h +22 -0
- data/ext/scws4r/extconf.rb +3 -0
- data/ext/scws4r/lock.c +153 -0
- data/ext/scws4r/lock.h +44 -0
- data/ext/scws4r/pool.c +141 -0
- data/ext/scws4r/pool.h +53 -0
- data/ext/scws4r/rule.c +407 -0
- data/ext/scws4r/rule.h +83 -0
- data/ext/scws4r/scws.c +1581 -0
- data/ext/scws4r/scws.h +118 -0
- data/ext/scws4r/scws4r.c +207 -0
- data/ext/scws4r/scws4r.h +4 -0
- data/ext/scws4r/version.h.in +4 -0
- data/ext/scws4r/xdb.c +636 -0
- data/ext/scws4r/xdb.h +88 -0
- data/ext/scws4r/xdict.c +394 -0
- data/ext/scws4r/xdict.h +73 -0
- data/ext/scws4r/xtree.c +337 -0
- data/ext/scws4r/xtree.h +65 -0
- data/lib/scws4r/version.rb +5 -0
- data/lib/scws4r.rb +15 -0
- data/scws4r.gemspec +30 -0
- data/sig/scws.rbs +4 -0
- data/test.rb +16 -0
- metadata +88 -0
data/ext/scws4r/scws.c
ADDED
@@ -0,0 +1,1581 @@
|
|
1
|
+
/*
|
2
|
+
* @file scws.c (core segment functions)
|
3
|
+
* @author Hightman Mar
|
4
|
+
* @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
|
5
|
+
* $Id $
|
6
|
+
*/
|
7
|
+
|
8
|
+
#ifdef HAVE_CONFIG_H
|
9
|
+
# include "config.h"
|
10
|
+
#endif
|
11
|
+
|
12
|
+
#ifdef WIN32
|
13
|
+
# include "config_win32.h"
|
14
|
+
#endif
|
15
|
+
|
16
|
+
#include "scws.h"
|
17
|
+
#include "xdict.h"
|
18
|
+
#include "rule.h"
|
19
|
+
#include "charset.h"
|
20
|
+
#include "darray.h"
|
21
|
+
#include "xtree.h"
|
22
|
+
#include <stdio.h>
|
23
|
+
#include <math.h>
|
24
|
+
#include <stdlib.h>
|
25
|
+
#include <string.h>
|
26
|
+
|
27
|
+
/*
 * Short-hand macros for frequently used checks.
 *
 * SCWS_IS_SPECIAL, SCWS_IS_NOSTATS and SCWS_CHARLEN all expand to an
 * expression that references a variable named `s` (the scws_t engine), so
 * they can only be used where such a variable is in scope.
 *
 * The character-class macros compare against ASCII code points and may
 * evaluate their argument more than once: never pass an expression with
 * side effects.
 */
#define SCWS_IS_SPECIAL(x,l)    scws_rule_checkbit(s->r,x,l,SCWS_RULE_SPECIAL)
#define SCWS_IS_NOSTATS(x,l)    scws_rule_checkbit(s->r,x,l,SCWS_RULE_NOSTATS)
/* entry of the s->mblen table for byte x (fully parenthesized expansion) */
#define SCWS_CHARLEN(x)         (s->mblen[(x)])
/* '0'..'9' */
#define SCWS_IS_DIGIT(x)        ((x)>=48&&(x)<=57)
/* 'A'..'Z' */
#define SCWS_IS_UALPHA(x)       ((x)>=65&&(x)<=90)
/* 'A'..'Z' or 'a'..'z' */
#define SCWS_IS_ALPHA(x)        (SCWS_IS_UALPHA(x)||((x)>=97&&(x)<=122))
/* digit or letter */
#define SCWS_IS_ALNUM(x)        (SCWS_IS_DIGIT(x)||SCWS_IS_ALPHA(x))
/* tests on the per-character flag word */
#define SCWS_IS_WHEAD(x)        ((x) & SCWS_ZFLAG_WHEAD)
#define SCWS_IS_ECHAR(x)        ((x) & SCWS_ZFLAG_ENGLISH)
/* true for symbols, english chars, and word heads that do not carry NR2 */
#define SCWS_NO_RULE1(x)        (((x) & (SCWS_ZFLAG_SYMBOL|SCWS_ZFLAG_ENGLISH))||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_NR2)) == SCWS_ZFLAG_WHEAD))
///#define SCWS_NO_RULE2(x)     (((x) & SCWS_ZFLAG_ENGLISH)||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_N2)) == SCWS_ZFLAG_WHEAD))
#define SCWS_NO_RULE2           SCWS_NO_RULE1
/* upper bound on the byte length of one alphanumeric word */
#define SCWS_MAX_EWLEN          33
///hightman.070706: char token
#define SCWS_CHAR_TOKEN(x)      ((x)=='('||(x)==')'||(x)=='['||(x)==']'||(x)=='{'||(x)=='}'||(x)==':'||(x)=='"')
///hightman.070814: max zlen = ?? (4 * zlen * zlen = ??)
#define SCWS_MAX_ZLEN           128
/* idf assigned to an alphanumeric word of byte length x */
#define SCWS_EN_IDF(x)          ((float)(2.5*logf(x)))
|
46
|
+
|
47
|
+
/* Attribute tags shared by the segment routines in this file. */
static const char *attr_en = "en";  /* given to alphanumeric (single-byte) words */
static const char *attr_un = "un";  /* given to symbols and chars missing from the dict */
static const char *attr_nr = "nr";  /* compared against 2-char dict words to set SCWS_ZFLAG_NR2 */
static const char *attr_na = "!";   /* dict entries carrying this attr are not added to wmap */
|
51
|
+
|
52
|
+
/* create scws engine */
|
53
|
+
scws_t scws_new()
|
54
|
+
{
|
55
|
+
scws_t s;
|
56
|
+
s = (scws_t) malloc(sizeof(scws_st));
|
57
|
+
if (s == NULL)
|
58
|
+
return s;
|
59
|
+
memset(s, 0, sizeof(scws_st));
|
60
|
+
s->mblen = charset_table_get(NULL);
|
61
|
+
s->off = s->len = 0;
|
62
|
+
s->wend = -1;
|
63
|
+
|
64
|
+
return s;
|
65
|
+
}
|
66
|
+
|
67
|
+
/* hightman.110320: fork scws */
|
68
|
+
scws_t scws_fork(scws_t p)
|
69
|
+
{
|
70
|
+
scws_t s = scws_new();
|
71
|
+
|
72
|
+
if (p != NULL && s != NULL)
|
73
|
+
{
|
74
|
+
s->mblen = p->mblen;
|
75
|
+
s->mode = p->mode;
|
76
|
+
// fork dict/rules
|
77
|
+
s->r = scws_rule_fork(p->r);
|
78
|
+
s->d = xdict_fork(p->d);
|
79
|
+
}
|
80
|
+
|
81
|
+
return s;
|
82
|
+
}
|
83
|
+
|
84
|
+
/* close & free the engine */
|
85
|
+
void scws_free(scws_t s)
|
86
|
+
{
|
87
|
+
if (s->d)
|
88
|
+
{
|
89
|
+
xdict_close(s->d);
|
90
|
+
s->d = NULL;
|
91
|
+
}
|
92
|
+
if (s->r)
|
93
|
+
{
|
94
|
+
scws_rule_free(s->r);
|
95
|
+
s->r = NULL;
|
96
|
+
}
|
97
|
+
free(s);
|
98
|
+
}
|
99
|
+
|
100
|
+
/* add a dict into scws */
|
101
|
+
int scws_add_dict(scws_t s, const char *fpath, int mode)
|
102
|
+
{
|
103
|
+
xdict_t xx;
|
104
|
+
if (mode & SCWS_XDICT_SET)
|
105
|
+
{
|
106
|
+
xdict_close(s->d);
|
107
|
+
mode ^= SCWS_XDICT_SET;
|
108
|
+
s->d = NULL;
|
109
|
+
}
|
110
|
+
xx = s->d;
|
111
|
+
s->d = xdict_add(s->d, fpath, mode, s->mblen);
|
112
|
+
return (xx == s->d ? -1 : 0);
|
113
|
+
}
|
114
|
+
|
115
|
+
/* set the dict & open it */
|
116
|
+
int scws_set_dict(scws_t s, const char *fpath, int mode)
|
117
|
+
{
|
118
|
+
return scws_add_dict(s, fpath, mode | SCWS_XDICT_SET);
|
119
|
+
}
|
120
|
+
|
121
|
+
/*
 * Select the multibyte length table for charset `cs`
 * (whatever charset_table_get() returns for that name).
 */
void scws_set_charset(scws_t s, const char *cs)
{
    s->mblen = charset_table_get(
        cs);
}
|
125
|
+
|
126
|
+
/*
 * Load the rule file at `fpath`, freeing any rule set already attached.
 * The new rule set is built with the engine's current mblen table.
 */
void scws_set_rule(scws_t s, const char *fpath)
{
    if (s->r != NULL)
    {
        scws_rule_free(s->r);
    }

    s->r = scws_rule_new(fpath, s->mblen);
}
|
133
|
+
|
134
|
+
/* set ignore symbol or multi segments */
|
135
|
+
void scws_set_ignore(scws_t s, int yes)
|
136
|
+
{
|
137
|
+
if (yes == SCWS_YEA)
|
138
|
+
s->mode |= SCWS_IGN_SYMBOL;
|
139
|
+
|
140
|
+
if (yes == SCWS_NA)
|
141
|
+
s->mode &= ~SCWS_IGN_SYMBOL;
|
142
|
+
}
|
143
|
+
|
144
|
+
/*
 * Set the multi-segment mode bits.
 * All SCWS_MULTI_MASK bits are cleared first; if `mode` has any of them set,
 * `mode` is then or-ed into s->mode.
 * NOTE(review): the whole of `mode` is or-ed in, not only its masked bits -
 * callers are presumably expected to pass SCWS_MULTI_* flags only.
 */
void scws_set_multi(scws_t s, int mode)
{
    s->mode &= ~SCWS_MULTI_MASK;

    if ((mode & SCWS_MULTI_MASK) != 0)
    {
        s->mode |= mode;
    }
}
|
151
|
+
|
152
|
+
/*
 * Turn the SCWS_DEBUG mode bit on (yes == SCWS_YEA) or off
 * (yes == SCWS_NA). Any other value leaves the mode untouched.
 */
void scws_set_debug(scws_t s, int yes)
{
    if (yes == SCWS_YEA)
    {
        s->mode |= SCWS_DEBUG;
    }

    if (yes == SCWS_NA)
    {
        s->mode &= ~SCWS_DEBUG;
    }
}
|
160
|
+
|
161
|
+
/*
 * Turn the SCWS_DUALITY mode bit on (yes == SCWS_YEA) or off
 * (yes == SCWS_NA). Any other value leaves the mode untouched.
 */
void scws_set_duality(scws_t s, int yes)
{
    if (yes == SCWS_YEA)
    {
        s->mode |= SCWS_DUALITY;
    }

    if (yes == SCWS_NA)
    {
        s->mode &= ~SCWS_DUALITY;
    }
}
|
169
|
+
|
170
|
+
/* send the text buffer & init some others */
|
171
|
+
void scws_send_text(scws_t s, const char *text, int len)
|
172
|
+
{
|
173
|
+
s->txt = (unsigned char *) text;
|
174
|
+
s->len = len;
|
175
|
+
s->off = 0;
|
176
|
+
}
|
177
|
+
|
178
|
+
/* get some words; if there are no words, return NULL */
|
179
|
+
/*
 * Append one result node to the engine's result list.
 *
 *   o  byte offset of the word        i  idf value
 *   l  byte length of the word        a  attribute string (first 2 chars used)
 *
 * Requires a variable `s` (the engine) in scope; the node is linked after
 * s->res1, or becomes both s->res0 and s->res1 when s->res1 is NULL.
 * If the allocation fails the result is silently dropped instead of
 * dereferencing NULL.
 */
#define SCWS_PUT_RES(o,i,l,a)                                           \
do {                                                                    \
    scws_res_t res;                                                     \
    res = (scws_res_t) malloc(sizeof(struct scws_result));              \
    if (res != NULL)                                                    \
    {                                                                   \
        res->off = (o);                                                 \
        res->idf = (i);                                                 \
        res->len = (l);                                                 \
        strncpy(res->attr, (a), 2);                                     \
        res->attr[2] = '\0';                                            \
        res->next = NULL;                                               \
        if (s->res1 == NULL)                                            \
            s->res1 = s->res0 = res;                                    \
        else                                                            \
        {                                                               \
            s->res1->next = res;                                        \
            s->res1 = res;                                              \
        }                                                               \
    }                                                                   \
} while(0)
|
197
|
+
|
198
|
+
/* single bytes segment (pure single-byte characters) */
/* parser state bits, one bit per flag */
#define PFLAG_WITH_MB       (1 << 0)
#define PFLAG_ALNUM         (1 << 1)
#define PFLAG_VALID         (1 << 2)
#define PFLAG_DIGIT         (1 << 3)
#define PFLAG_ADDSYM        (1 << 4)
#define PFLAG_ALPHA         (1 << 5)
#define PFLAG_LONGDIGIT     (1 << 6)
#define PFLAG_LONGALPHA     (1 << 7)
|
207
|
+
|
208
|
+
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * 'a'..'z' to upper case. `dst` may be the same buffer as `src`
 * (in-place conversion) and must have room for strlen(src) + 1 bytes.
 * The result is always NUL-terminated.
 */
static void _str_toupper(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        if (c >= 'a' && c <= 'z')
            c ^= 0x20;
        *dst = c;
    }
    /* terminate the copy; for in-place use this rewrites the existing NUL */
    *dst = '\0';
}
|
217
|
+
|
218
|
+
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * 'A'..'Z' to lower case. `dst` may be the same buffer as `src`
 * (in-place conversion) and must have room for strlen(src) + 1 bytes.
 * The result is always NUL-terminated.
 */
static void _str_tolower(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        if (c >= 'A' && c <= 'Z')
            c ^= 0x20;
        *dst = c;
    }
    /* terminate the copy; for in-place use this rewrites the existing NUL */
    *dst = '\0';
}
|
227
|
+
|
228
|
+
/*
 * _mem_ndup(src, len): heap copy of the first `len` bytes of `src`,
 * followed by a NUL byte. The caller owns the result and must free() it.
 * Uses strndup() when the build defines HAVE_STRNDUP; otherwise the local
 * fallback below, which returns NULL for a NULL source, a negative length
 * or an allocation failure.
 */
#ifdef HAVE_STRNDUP
#define _mem_ndup   strndup
#else
static inline void *_mem_ndup(const char *src, int len)
{
    char *dst;

    if (src == NULL || len < 0)
        return NULL;

    dst = malloc((size_t) len + 1);
    if (dst == NULL)
        return NULL;

    memcpy(dst, src, (size_t) len);
    dst[len] = '\0';
    return dst;
}
#endif
|
240
|
+
|
241
|
+
/*
 * Emit the sub-parts of a compound alphanumeric word.
 *
 * The `wlen` bytes at s->txt + start are split into runs of the same class
 * (digits, letters, anything else); the byte length of every run is stored
 * in chunk[]. If there is more than one run, each run that starts with an
 * alphanumeric byte is put into the result list with attr "en". A run of
 * length 1 is not emitted alone: it is merged with the run before and/or
 * after it, unless the adjacent byte is not alphanumeric.
 */
static void _scws_alnum_multi(scws_t s, int start, int wlen)
{
    char chunk[SCWS_MAX_EWLEN];
    unsigned char *txt = s->txt;
    int idx, pos;
    int nchunk = 0;     /* index of the run currently being measured */
    int mark = 0;       /* offset (relative to start) where that run began */
    int state = 0;      /* class flag of the current run, 0 before the first byte */
    float idf;

    /* pass 1: measure the runs */
    for (idx = 0; idx < wlen; idx++)
    {
        int byte = txt[start + idx];
        int kind;

        if (SCWS_IS_DIGIT(byte))
            kind = PFLAG_DIGIT;
        else if (SCWS_IS_ALPHA(byte))
            kind = PFLAG_ALPHA;
        else
            kind = PFLAG_ADDSYM;

        /* still inside the same run */
        if (state & kind)
            continue;

        /* class changed: close the previous run (if any) */
        if (state != 0)
        {
            chunk[nchunk++] = (char) (idx - mark);
            mark = idx;
        }
        state = kind;
    }

    /* a single run means there is nothing to split */
    if (nchunk == 0)
        return;

    /* close the last run */
    chunk[nchunk] = (char) (idx - mark);

    /* pass 2: emit the runs; pos is the absolute offset of run idx */
    pos = start;
    for (idx = 0; idx <= nchunk; idx++)
    {
        if (SCWS_IS_ALNUM(txt[pos]))
        {
            if (chunk[idx] != 1)
            {
                idf = SCWS_EN_IDF(chunk[idx]);
                SCWS_PUT_RES(pos, idf, chunk[idx], attr_en);
            }
            else
            {
                /* single byte: try to join it with the previous run */
                if (idx > 0 && chunk[idx-1] > 1 && (idx != 1 || idx != nchunk))
                {
                    if (!SCWS_IS_ALNUM(txt[pos-1]))
                    {
                        idf = SCWS_EN_IDF(chunk[idx]);
                        SCWS_PUT_RES(pos, idf, chunk[idx], attr_en);
                    }
                    else
                    {
                        idf = SCWS_EN_IDF(chunk[idx-1]+1);
                        SCWS_PUT_RES(pos - chunk[idx-1], idf, chunk[idx-1]+1, attr_en);
                    }
                }
                /* ... and with the next run */
                if (idx < nchunk && (idx != 0 || nchunk != 1))
                {
                    if (!SCWS_IS_ALNUM(txt[pos+1]))
                    {
                        idf = SCWS_EN_IDF(chunk[idx]);
                        SCWS_PUT_RES(pos, idf, chunk[idx], attr_en);
                    }
                    else
                    {
                        idf = SCWS_EN_IDF(chunk[idx+1]+1);
                        SCWS_PUT_RES(pos, idf, chunk[idx+1]+1, attr_en);
                    }
                }
            }
        }
        /* non-alphanumeric runs are just skipped */
        pos += chunk[idx];
    }
}
|
336
|
+
|
337
|
+
/*
 * Byte at s->txt[pos], or 0 when pos lies outside [0, s->len).
 * Used for one-byte look-ahead so that the segmenter never reads past the
 * buffer handed to scws_send_text() (which need not be NUL-terminated).
 */
static inline int _scws_peek(scws_t s, int pos)
{
    return (pos >= 0 && pos < s->len) ? s->txt[pos] : 0;
}

/*
 * Segment the single-byte run s->txt[s->off .. end) and append the words
 * to the engine's result list.
 *
 * Order of checks:
 *   1. the whole run, upper-cased, is a "special" word in the rule set
 *      -> one "nz" result with idf 9.5;
 *   2. the whole run is an abbreviation of the form A.B.C / A.B.
 *      -> one "nz" result with idf 7.5;
 *   3. otherwise the run is scanned byte by byte: alphanumeric words get
 *      attr "en", every other byte gets attr "un" unless SCWS_IGN_SYMBOL
 *      is set in s->mode.
 */
static void _scws_ssegment(scws_t s, int end)
{
    int start, wlen, ch, nx, pflag, ipflag = 0;
    unsigned char *txt;
    char *upper;
    float idf;

    start = s->off;
    wlen = end - start;

    /* check special words (need strtoupper) */
    if (wlen > 1)
    {
        upper = (char *) _mem_ndup((const char *) s->txt + start, wlen);
        /* on allocation failure just skip the special-word lookup */
        if (upper != NULL)
        {
            _str_toupper(upper, upper);
            if (SCWS_IS_SPECIAL(upper, wlen))
            {
                SCWS_PUT_RES(start, 9.5, wlen, "nz");
                free(upper);
                return;
            }
            free(upper);
        }
    }

    txt = s->txt;
    /* check brief words such as S.H.E M.R. */
    if (SCWS_IS_ALPHA(txt[start]) && _scws_peek(s, start + 1) == '.')
    {
        for (ch = start + 2; ch < end; ch++)
        {
            if (!SCWS_IS_ALPHA(txt[ch])) break;
            ch++;
            if (ch == end || txt[ch] != '.') break;
        }
        if (ch == end)
        {
            SCWS_PUT_RES(start, 7.5, wlen, "nz");
            return;
        }
    }

    /*
     * Extract words and punctuation. A number may contain one dot provided
     * the next byte is a digit (dots may not be consecutive). Letters may
     * contain one non-consecutive apostrophe.
     */
    while (start < end)
    {
        ch = txt[start++];
        /* leave "IP address" mode on the first byte that is neither '.' nor a digit */
        if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
            ipflag = 0;
        if (SCWS_IS_ALNUM(ch))
        {
            pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
            wlen = 1;
            while (start < end)
            {
                ch = txt[start];
                /* bounded look-ahead: 0 when start + 1 is past the buffer */
                nx = _scws_peek(s, start + 1);
                if (pflag & PFLAG_DIGIT)
                {
                    if (!SCWS_IS_DIGIT(ch))
                    {
                        // check percent % = 0x25
                        if (ch == 0x25 && !SCWS_IS_DIGIT(nx))
                        {
                            start++;
                            wlen++;
                            break;
                        }
                        if (ipflag)
                            break;
                        // special for IP address or version number? (find out all digit + dot)
                        if (ch == 0x2e && (pflag & PFLAG_ADDSYM))
                        {
                            ipflag = 1;
                            /* rewind to the previous dot so each number is emitted alone */
                            while(--wlen && txt[--start] != 0x2e);
                            pflag = 0;
                            break;
                        }
                        // wlen = 1
                        if (wlen == 1 && SCWS_IS_ALPHA(ch))
                        {
                            /* one digit followed by a letter: continue as an alpha word */
                            pflag ^= PFLAG_DIGIT;
                            pflag |= PFLAG_ADDSYM;
                            continue;
                        }
                        // strict must add: !$this->_is_digit(ord($this->txt[$start+1])))
                        if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(nx)))
                            break;
                        pflag |= PFLAG_ADDSYM;
                    }
                }
                else
                {
                    /* hightman.110419: '-' (0x2d) between letters may join, '_' (0x5f) may join */
                    if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(nx))
                        pflag |= PFLAG_ADDSYM;
                    else if (!SCWS_IS_ALPHA(ch))
                    {
                        if ((pflag & PFLAG_ADDSYM)
                            || !((ch == 0x27 && SCWS_IS_ALPHA(nx))
                            || (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(nx))))
                        {
                            break;
                        }
                        pflag |= PFLAG_ADDSYM;
                    }
                }
                start++;
                wlen++;
                if (wlen >= SCWS_MAX_EWLEN)
                    break;
            }
            idf = SCWS_EN_IDF(wlen);
            SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
            if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
                _scws_alnum_multi(s, start-wlen, wlen);
        }
        else if (!(s->mode & SCWS_IGN_SYMBOL))
        {
            SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
        }
    }
}
|
456
|
+
|
457
|
+
/* multibyte segment */
|
458
|
+
static int _scws_mget_word(scws_t s, int i, int j)
|
459
|
+
{
|
460
|
+
int r, k;
|
461
|
+
word_t item;
|
462
|
+
|
463
|
+
if (!(s->wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
|
464
|
+
return i;
|
465
|
+
|
466
|
+
for (r=i, k=i+1; k <= j; k++)
|
467
|
+
{
|
468
|
+
item = s->wmap[i][k];
|
469
|
+
if (item && (item->flag & SCWS_WORD_FULL))
|
470
|
+
{
|
471
|
+
r = k;
|
472
|
+
if (!(item->flag & SCWS_WORD_PART))
|
473
|
+
break;
|
474
|
+
}
|
475
|
+
}
|
476
|
+
return r;
|
477
|
+
}
|
478
|
+
|
479
|
+
/* put the single character at index z (its zmap span, idf and attr) into the result list */
#define MSET_PUT_ZCHAR(z)                                               \
    SCWS_PUT_RES(s->zmap[(z)].start, s->wmap[(z)][(z)]->idf,            \
        (s->zmap[(z)].end - s->zmap[(z)].start), s->wmap[(z)][(z)]->attr)

/*
 * Put the word spanning character indexes i..j (wmap[i][j]) into the
 * result list, followed by any extra sub-words requested through the
 * SCWS_DUALITY / SCWS_MULTI_* bits of s->mode.
 *
 * Nothing is emitted when wmap[i][j] is NULL, or when SCWS_IGN_SYMBOL is
 * set and the item is a non-english entry whose attr is "un".
 */
static void _scws_mset_word(scws_t s, int i, int j)
{
    word_t item;

    item = s->wmap[i][j];
    /* hightman.070705: item == NULL check added to guard against unsigned char
       overflow for over-long words (255+ chars) */
    if (item == NULL)
        return;
    if ((s->mode & SCWS_IGN_SYMBOL)
        && !SCWS_IS_ECHAR(item->flag) && !memcmp(item->attr, attr_un, 2))
        return;

    /* hightman.070701: automatic pairing (bigram) of loose single chars */
    if (s->mode & SCWS_DUALITY)
    {
        int prev = s->zis;  /* previous loose char index, maybe or-ed with SCWS_ZIS_USED; < 0 if none */
        int loose = (i == j && !SCWS_IS_ECHAR(item->flag) && memcmp(item->attr, attr_un, 2));

        if (loose)
        {
            s->zis = i;
            if (prev < 0)
                return;

            i = (prev & ~SCWS_ZIS_USED);
            if (i != (j-1))
            {
                /* previous loose char is not adjacent: emit it alone and stop */
                MSET_PUT_ZCHAR(i);
                return;
            }
            if (!(prev & SCWS_ZIS_USED) && s->wend == i)
            {
                MSET_PUT_ZCHAR(i);
            }
            s->zis |= SCWS_ZIS_USED;
        }
        else
        {
            if ((prev >= 0) && (!(prev & SCWS_ZIS_USED) || (j > i)))
            {
                prev &= ~SCWS_ZIS_USED;
                MSET_PUT_ZCHAR(prev);
            }
            if (j > i)
                s->wend = j + 1;
            s->zis = -1;
        }
    }

    /* the word itself (i may have been moved back by the duality branch) */
    SCWS_PUT_RES(s->zmap[i].start, item->idf, (s->zmap[j].end - s->zmap[i].start), item->attr);

    // hightman.070902: multi segment
    // step1: split to short words
    if ((j-i) > 1)
    {
        int n, best, m = i;

        if (s->mode & SCWS_MULTI_SHORT)
        {
            while (m < j)
            {
                best = m;
                // hightman.111223: multi short enhanced
                for (n = m + 1; n <= j; n++)
                {
                    // 3 chars at most
                    if ((n == j && m == i) || (n - m) > 2) break;
                    item = s->wmap[m][n];
                    if (!item) continue;
                    // first shortest or last longest word
                    if ((item->flag & SCWS_WORD_FULL) && (best == m || n == j))
                        best = n;
                    if (!(item->flag & SCWS_WORD_PART)) break;
                }
                // short word not found, stop to find, passed to next loop
                if (best == m)
                    break;

                // save the short word
                item = s->wmap[m][best];
                SCWS_PUT_RES(s->zmap[m].start, item->idf, (s->zmap[best].end - s->zmap[m].start), item->attr);
                // find the next word or go to prev for duality last word
                m = best + 1;
                if (m == j)
                {
                    m--;
                    break;
                }
            }
        }

        if (s->mode & SCWS_MULTI_DUALITY)
        {
            while (m < j)
            {
                if (SCWS_IS_ECHAR(s->wmap[m][m]->flag))
                {
                    MSET_PUT_ZCHAR(m);
                    s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                }
                else if (SCWS_IS_ECHAR(s->wmap[m+1][m+1]->flag))
                {
                    if (m == i)
                    {
                        MSET_PUT_ZCHAR(m);
                        s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                    }
                    m++;
                    MSET_PUT_ZCHAR(m);
                    s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                }
                else
                {
                    /* two adjacent non-english chars: emit them as one pair */
                    SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m+1].end - s->zmap[m].start), s->wmap[m][m]->attr);
                }
                m++;
                if (m == j && (SCWS_IS_ECHAR(s->wmap[m][m]->flag) || SCWS_IS_ECHAR(s->wmap[m-1][m-1]->flag)))
                {
                    MSET_PUT_ZCHAR(m);
                    s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
                }
            }
        }
    }

    // step2, split to single char
    if ((j > i) && (s->mode & (SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL)))
    {
        if ((j - i) == 1 && !s->wmap[i][j])
        {
            if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT) i++;
            else s->wmap[i][i]->flag |= SCWS_ZFLAG_PUT;
            s->wmap[j][j]->flag |= SCWS_ZFLAG_PUT;
        }
        do
        {
            word_t z = s->wmap[i][i];
            /* skip chars already emitted; without ZALL only attrs starting with j/n/v pass */
            int skip = (z->flag & SCWS_ZFLAG_PUT)
                || (!(s->mode & SCWS_MULTI_ZALL) && !strchr("jnv", z->attr[0]));

            if (!skip)
            {
                MSET_PUT_ZCHAR(i);
            }
        }
        while (++i <= j);
    }
}
#undef MSET_PUT_ZCHAR
|
616
|
+
|
617
|
+
/*
 * Choose the best segmentation of the character range f..t (inclusive
 * indexes into wmap/zmap) and emit it through _scws_mset_word().
 *
 * A path is an array of (word length - 1) entries terminated by 0xff.
 * For every candidate multi-char word [i..j] a path is built from the
 * words found before it, the word itself and the words found after it
 * (each located with _scws_mget_word). Its weight is the product of the
 * words' tf values, (n-m)^4 for every multi-char word and the attr ratios
 * from scws_rule_attr_ratio(), divided by (x+sz-1)^5 where x is the number
 * of words and sz the number of single-char words. The path with the
 * highest weight wins.
 *
 * Both path buffers are released on every exit; an allocation failure
 * stops the search and whatever best path exists so far is used.
 */
static void _scws_mseg_zone(scws_t s, int f, int t)
{
    unsigned char *mpath, *npath;   /* best path so far / scratch path */
    word_t **wmap;
    int x,i,j,m,n,j2,sz;
    double weight, nweight;
    char attr1[3];                  /* attr of the previous word on the path */

    mpath = npath = NULL;
    weight = nweight = (double) 0.0;

    wmap = s->wmap;
    j2 = 0;
    for (x = i = f; i <= t; i++)
    {
        j = _scws_mget_word(s, i, (x > i ? x - 1 : t));
        if (j == i) continue;
        // skip NR in NR
        if (j < j2 && wmap[i][j]->attr[0] == 'n' && wmap[i][j]->attr[1] == 'r') continue;
        if (i > j2 && (wmap[i][j]->flag & SCWS_WORD_USED)) continue;

        /* one word only */
        if (i == f && j == t)
        {
            mpath = (unsigned char *) malloc(2);
            if (mpath != NULL)
            {
                mpath[0] = j - i;
                mpath[1] = 0xff;
            }
            break;
        }

        if (i != f && (wmap[i][j]->flag & SCWS_WORD_RULE))
            continue;

        /* create the new path */
        wmap[i][j]->flag |= SCWS_WORD_USED;
        nweight = (double) wmap[i][j]->tf * pow(j-i,4);

        if (npath == NULL)
        {
            npath = (unsigned char *) malloc(t-f+2);
            if (npath == NULL)
                break;      /* out of memory: fall back to the best path so far */
            memset(npath, 0xff, t-f+2);
        }

        /* lookfor backward */
        x = sz = 0;
        memset(attr1, 0, sizeof(attr1));
        for (m = f; m < i; m = n+1)
        {
            n = _scws_mget_word(s, m, i-1);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            if (attr1[0] != '\0')
                nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, 2);
        }

        /* my self */
        npath[x++] = j - i;

        if (attr1[0] != '\0')
            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[i][j]->attr, &npath[x-2]);
        memcpy(attr1, wmap[i][j]->attr, 2);

        /* lookfor forward */
        for (m = j+1; m <= t; m = n+1)
        {
            n = _scws_mget_word(s, m, t);
            nweight *= wmap[m][n]->tf;
            npath[x++] = n - m;
            if (n > m)
            {
                nweight *= pow(n-m,4);
                wmap[m][n]->flag |= SCWS_WORD_USED;
            }
            else sz++;

            nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
            memcpy(attr1, wmap[m][n]->attr, 2);
        }

        npath[x] = 0xff;
        nweight /= pow(x+sz-1,5);

        /* draw the path for debug */
#ifdef DEBUG
        if (s->mode & SCWS_DEBUG)
        {
            fprintf(stderr, "PATH by keyword = %.*s, (weight=%.4f):\n",
                s->zmap[j].end - s->zmap[i].start, s->txt + s->zmap[i].start, nweight);
            for (x = 0, m = f; (n = npath[x]) != 0xff; x++)
            {
                n += m;
                fprintf(stderr, "%.*s ", s->zmap[n].end - s->zmap[m].start, s->txt + s->zmap[m].start);
                m = n + 1;
            }
            fprintf(stderr, "\n--\n");
        }
#endif

        j2 = x = j;
        /* a word longer than 2 chars: revisit the same start to try a shorter word */
        if ((x - i) > 1) i--;
        /* check better path */
        if (nweight > weight)
        {
            unsigned char *swap;

            weight = nweight;
            swap = mpath;
            mpath = npath;
            npath = swap;
        }
    }

    /* set the result; nothing is emitted when no path was selected */
    if (mpath != NULL)
    {
        for (x = 0, m = f; (n = mpath[x]) != 0xff; x++)
        {
            n += m;
            _scws_mset_word(s, m, n);
            m = n + 1;
        }
    }

    /* 070808: memory leak fixed; both buffers are now freed on every exit path */
    free(mpath);
    free(npath);
}
|
752
|
+
|
753
|
+
/*
 * Loop-body fragments for the zrule checks. Each expands to an `if (...)
 * break;` statement, so it must sit directly inside the loop it is meant
 * to leave, with the named locals (j, zlen, wmap, zmap, txt, r1, s) in scope.
 */

/* leave the loop when j ran past the last char or char j may not take part in a rule */
#define ___ZRULE_CHECKER1___ if (j >= zlen || SCWS_NO_RULE2(wmap[j][j]->flag)) break;

/* leave the loop when j ran before the first char or char j may not take part in a rule */
#define ___ZRULE_CHECKER2___ if (j < 0 || SCWS_NO_RULE2(wmap[j][j]->flag)) break;

/* leave the loop when scws_rule_check() rejects char j for rule r1 */
#define ___ZRULE_CHECKER3___ if (!scws_rule_check(s->r, r1, txt + zmap[j].start, zmap[j].end - zmap[j].start)) break;
|
765
|
+
|
766
|
+
static void _scws_msegment(scws_t s, int end, int zlen)
|
767
|
+
{
|
768
|
+
word_t **wmap, query;
|
769
|
+
struct scws_zchar *zmap;
|
770
|
+
unsigned char *txt;
|
771
|
+
#ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just deepend on dictionary */
|
772
|
+
rule_item_t r1;
|
773
|
+
#endif
|
774
|
+
int i, j, k, ch, clen, start;
|
775
|
+
pool_t p;
|
776
|
+
|
777
|
+
/* pool used to management some dynamic memory */
|
778
|
+
p = pool_new();
|
779
|
+
|
780
|
+
/* create wmap & zmap */
|
781
|
+
wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t));
|
782
|
+
zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar));
|
783
|
+
txt = s->txt;
|
784
|
+
start = s->off;
|
785
|
+
s->zis = -1;
|
786
|
+
|
787
|
+
for (i = 0; start < end; i++)
|
788
|
+
{
|
789
|
+
ch = txt[start];
|
790
|
+
clen = SCWS_CHARLEN(ch);
|
791
|
+
if (clen == 1)
|
792
|
+
{
|
793
|
+
while (start++ < end)
|
794
|
+
{
|
795
|
+
ch = txt[start];
|
796
|
+
if (start == end || SCWS_CHARLEN(txt[start]) > 1)
|
797
|
+
break;
|
798
|
+
clen++;
|
799
|
+
}
|
800
|
+
wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
|
801
|
+
wmap[i][i]->tf = 0.5;
|
802
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
|
803
|
+
strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un);
|
804
|
+
}
|
805
|
+
else
|
806
|
+
{
|
807
|
+
query = xdict_query(s->d, txt + start, clen);
|
808
|
+
wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
|
809
|
+
if (query == NULL)
|
810
|
+
{
|
811
|
+
wmap[i][i]->tf = 0.5;
|
812
|
+
wmap[i][i]->idf = 0.0;
|
813
|
+
wmap[i][i]->flag = 0;
|
814
|
+
strcpy(wmap[i][i]->attr, attr_un);
|
815
|
+
}
|
816
|
+
else
|
817
|
+
{
|
818
|
+
ch = query->flag;
|
819
|
+
query->flag = SCWS_WORD_FULL;
|
820
|
+
memcpy(wmap[i][i], query, sizeof(word_st));
|
821
|
+
if (query->attr[0] == '#')
|
822
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;
|
823
|
+
|
824
|
+
if (ch & SCWS_WORD_MALLOCED)
|
825
|
+
free(query);
|
826
|
+
}
|
827
|
+
start += clen;
|
828
|
+
}
|
829
|
+
|
830
|
+
zmap[i].start = start - clen;
|
831
|
+
zmap[i].end = start;
|
832
|
+
}
|
833
|
+
|
834
|
+
/* fixed real zlength */
|
835
|
+
zlen = i;
|
836
|
+
|
837
|
+
/* create word query table */
|
838
|
+
for (i = 0; i < zlen; i++)
|
839
|
+
{
|
840
|
+
k = 0;
|
841
|
+
for (j = i+1; j < zlen; j++)
|
842
|
+
{
|
843
|
+
query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
|
844
|
+
if (query == NULL)
|
845
|
+
break;
|
846
|
+
ch = query->flag;
|
847
|
+
if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2))
|
848
|
+
{
|
849
|
+
wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
|
850
|
+
memcpy(wmap[i][j], query, sizeof(word_st));
|
851
|
+
|
852
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
|
853
|
+
|
854
|
+
for (k = i+1; k <= j; k++)
|
855
|
+
wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
|
856
|
+
}
|
857
|
+
|
858
|
+
if (ch & SCWS_WORD_MALLOCED)
|
859
|
+
free(query);
|
860
|
+
|
861
|
+
if (!(ch & SCWS_WORD_PART))
|
862
|
+
break;
|
863
|
+
}
|
864
|
+
|
865
|
+
if (k--)
|
866
|
+
{
|
867
|
+
/* set nr2 to some short name */
|
868
|
+
if ((k == (i+1)))
|
869
|
+
{
|
870
|
+
if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
|
871
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
|
872
|
+
//if (wmap[i][k]->attr[0] == 'n')
|
873
|
+
//wmap[i][i]->flag |= SCWS_ZFLAG_N2;
|
874
|
+
}
|
875
|
+
|
876
|
+
/* clean the PART flag for the last word */
|
877
|
+
if (k < j)
|
878
|
+
wmap[i][k]->flag ^= SCWS_WORD_PART;
|
879
|
+
}
|
880
|
+
}
|
881
|
+
|
882
|
+
if (s->r == NULL)
|
883
|
+
goto do_segment;
|
884
|
+
|
885
|
+
#ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just deepend on dictionary */
|
886
|
+
/* auto rule set for name & zone & chinese numeric */
|
887
|
+
|
888
|
+
/* one word auto rule check */
|
889
|
+
for (i = 0; i < zlen; i++)
|
890
|
+
{
|
891
|
+
if (SCWS_NO_RULE1(wmap[i][i]->flag))
|
892
|
+
continue;
|
893
|
+
|
894
|
+
r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
|
895
|
+
if (r1 == NULL)
|
896
|
+
continue;
|
897
|
+
|
898
|
+
clen = r1->zmin > 0 ? r1->zmin : 1;
|
899
|
+
if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
|
900
|
+
{
|
901
|
+
/* prefix, check after (zmin~zmax) */
|
902
|
+
// 先检查 zmin 字内是否全部符合要求
|
903
|
+
// 再在 zmax 范围内取得符合要求的字
|
904
|
+
// int i, j, k, ch, clen, start;
|
905
|
+
for (ch = 1; ch <= clen; ch++)
|
906
|
+
{
|
907
|
+
j = i + ch;
|
908
|
+
___ZRULE_CHECKER1___
|
909
|
+
___ZRULE_CHECKER3___
|
910
|
+
}
|
911
|
+
|
912
|
+
if (ch <= clen)
|
913
|
+
continue;
|
914
|
+
|
915
|
+
/* no limit znum or limit to a range */
|
916
|
+
j = i + ch;
|
917
|
+
while (1)
|
918
|
+
{
|
919
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
920
|
+
break;
|
921
|
+
___ZRULE_CHECKER1___
|
922
|
+
___ZRULE_CHECKER3___
|
923
|
+
clen++;
|
924
|
+
j++;
|
925
|
+
}
|
926
|
+
|
927
|
+
// 注意原来2字人名,识别后仍为2字的情况
|
928
|
+
if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
|
929
|
+
{
|
930
|
+
if (clen == 1)
|
931
|
+
continue;
|
932
|
+
wmap[i][i+1]->flag |= SCWS_WORD_PART;
|
933
|
+
}
|
934
|
+
|
935
|
+
/* ok, got: i & clen */
|
936
|
+
k = i + clen;
|
937
|
+
wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
|
938
|
+
wmap[i][k]->tf = r1->tf;
|
939
|
+
wmap[i][k]->idf = r1->idf;
|
940
|
+
wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
|
941
|
+
strncpy(wmap[i][k]->attr, r1->attr, 2);
|
942
|
+
|
943
|
+
wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
|
944
|
+
for (j = i+1; j <= k; j++)
|
945
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
946
|
+
|
947
|
+
if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
|
948
|
+
i = k;
|
949
|
+
|
950
|
+
continue;
|
951
|
+
}
|
952
|
+
|
953
|
+
if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
|
954
|
+
{
|
955
|
+
/* suffix, check before */
|
956
|
+
for (ch = 1; ch <= clen; ch++)
|
957
|
+
{
|
958
|
+
j = i - ch;
|
959
|
+
___ZRULE_CHECKER2___
|
960
|
+
___ZRULE_CHECKER3___
|
961
|
+
}
|
962
|
+
|
963
|
+
if (ch <= clen)
|
964
|
+
continue;
|
965
|
+
|
966
|
+
/* no limit znum or limit to a range */
|
967
|
+
j = i - ch;
|
968
|
+
while (1)
|
969
|
+
{
|
970
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
971
|
+
break;
|
972
|
+
___ZRULE_CHECKER2___
|
973
|
+
___ZRULE_CHECKER3___
|
974
|
+
clen++;
|
975
|
+
j--;
|
976
|
+
}
|
977
|
+
|
978
|
+
/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
|
979
|
+
k = i - clen;
|
980
|
+
if (wmap[k][i] != NULL)
|
981
|
+
continue;
|
982
|
+
|
983
|
+
wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
|
984
|
+
wmap[k][i]->tf = r1->tf;
|
985
|
+
wmap[k][i]->idf = r1->idf;
|
986
|
+
wmap[k][i]->flag = SCWS_WORD_FULL;
|
987
|
+
strncpy(wmap[k][i]->attr, r1->attr, 2);
|
988
|
+
|
989
|
+
wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
|
990
|
+
for (j = k+1; j <= i; j++)
|
991
|
+
{
|
992
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
993
|
+
if ((j != i) && (wmap[k][j] != NULL))
|
994
|
+
wmap[k][j]->flag |= SCWS_WORD_PART;
|
995
|
+
}
|
996
|
+
continue;
|
997
|
+
}
|
998
|
+
}
|
999
|
+
|
1000
|
+
/* two words auto rule check (欧阳** , **西路) */
|
1001
|
+
for (i = zlen - 2; i >= 0; i--)
|
1002
|
+
{
|
1003
|
+
/* with value ==> must be have SCWS_WORD_FULL, so needn't check it ag. */
|
1004
|
+
if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
|
1005
|
+
continue;
|
1006
|
+
|
1007
|
+
k = i+1;
|
1008
|
+
r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
|
1009
|
+
if (r1 == NULL)
|
1010
|
+
continue;
|
1011
|
+
|
1012
|
+
clen = r1->zmin > 0 ? r1->zmin : 1;
|
1013
|
+
if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
|
1014
|
+
{
|
1015
|
+
for (ch = 1; ch <= clen; ch++)
|
1016
|
+
{
|
1017
|
+
j = k + ch;
|
1018
|
+
___ZRULE_CHECKER1___
|
1019
|
+
___ZRULE_CHECKER3___
|
1020
|
+
}
|
1021
|
+
|
1022
|
+
if (ch <= clen)
|
1023
|
+
continue;
|
1024
|
+
|
1025
|
+
/* no limit znum or limit to a range */
|
1026
|
+
j = k + ch;
|
1027
|
+
while (1)
|
1028
|
+
{
|
1029
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
1030
|
+
break;
|
1031
|
+
___ZRULE_CHECKER1___
|
1032
|
+
___ZRULE_CHECKER3___
|
1033
|
+
clen++;
|
1034
|
+
j++;
|
1035
|
+
}
|
1036
|
+
|
1037
|
+
/* ok, got: i & clen */
|
1038
|
+
k = k + clen;
|
1039
|
+
wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
|
1040
|
+
wmap[i][k]->tf = r1->tf;
|
1041
|
+
wmap[i][k]->idf = r1->idf;
|
1042
|
+
wmap[i][k]->flag = SCWS_WORD_FULL;
|
1043
|
+
strncpy(wmap[i][k]->attr, r1->attr, 2);
|
1044
|
+
|
1045
|
+
wmap[i][i+1]->flag |= SCWS_WORD_PART;
|
1046
|
+
for (j = i+2; j <= k; j++)
|
1047
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
1048
|
+
|
1049
|
+
i--;
|
1050
|
+
continue;
|
1051
|
+
}
|
1052
|
+
|
1053
|
+
if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
|
1054
|
+
{
|
1055
|
+
/* suffix, check before */
|
1056
|
+
for (ch = 1; ch <= clen; ch++)
|
1057
|
+
{
|
1058
|
+
j = i - ch;
|
1059
|
+
___ZRULE_CHECKER2___
|
1060
|
+
___ZRULE_CHECKER3___
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
if (ch <= clen)
|
1064
|
+
continue;
|
1065
|
+
|
1066
|
+
/* no limit znum or limit to a range */
|
1067
|
+
j = i - ch;
|
1068
|
+
while (1)
|
1069
|
+
{
|
1070
|
+
if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
|
1071
|
+
break;
|
1072
|
+
___ZRULE_CHECKER2___
|
1073
|
+
___ZRULE_CHECKER3___
|
1074
|
+
clen++;
|
1075
|
+
j--;
|
1076
|
+
}
|
1077
|
+
|
1078
|
+
/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
|
1079
|
+
k = i - clen;
|
1080
|
+
i = i + 1;
|
1081
|
+
wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
|
1082
|
+
wmap[k][i]->tf = r1->tf;
|
1083
|
+
wmap[k][i]->idf = r1->idf;
|
1084
|
+
wmap[k][i]->flag = SCWS_WORD_FULL;
|
1085
|
+
strncpy(wmap[k][i]->attr, r1->attr, 2);
|
1086
|
+
|
1087
|
+
wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
|
1088
|
+
for (j = k+1; j <= i; j++)
|
1089
|
+
{
|
1090
|
+
wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
|
1091
|
+
if (wmap[k][j] != NULL)
|
1092
|
+
wmap[k][j]->flag |= SCWS_WORD_PART;
|
1093
|
+
}
|
1094
|
+
|
1095
|
+
i -= (clen+1);
|
1096
|
+
continue;
|
1097
|
+
}
|
1098
|
+
}
|
1099
|
+
#endif
|
1100
|
+
|
1101
|
+
/* real do the segment */
|
1102
|
+
do_segment:
|
1103
|
+
|
1104
|
+
/* find the easy break point */
|
1105
|
+
for (i = 0, j = 0; i < zlen; i++)
|
1106
|
+
{
|
1107
|
+
if (wmap[i][i]->flag & SCWS_ZFLAG_WPART)
|
1108
|
+
continue;
|
1109
|
+
|
1110
|
+
if (i > j)
|
1111
|
+
_scws_mseg_zone(s, j, i-1);
|
1112
|
+
|
1113
|
+
j = i;
|
1114
|
+
if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
|
1115
|
+
{
|
1116
|
+
_scws_mset_word(s, i, i);
|
1117
|
+
j++;
|
1118
|
+
}
|
1119
|
+
}
|
1120
|
+
|
1121
|
+
/* the lastest zone */
|
1122
|
+
if (i > j)
|
1123
|
+
_scws_mseg_zone(s, j, i-1);
|
1124
|
+
|
1125
|
+
/* the last single for duality */
|
1126
|
+
if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))
|
1127
|
+
{
|
1128
|
+
i = s->zis;
|
1129
|
+
SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
|
1130
|
+
}
|
1131
|
+
|
1132
|
+
/* free the wmap & zmap */
|
1133
|
+
pool_free(p);
|
1134
|
+
darray_free((void **) wmap);
|
1135
|
+
}
|
1136
|
+
|
1137
|
+
/**
 * Fetch the next batch of segmentation results from the text attached to `s`.
 *
 * Scanning starts at s->off.  Leading bytes <= 0x20 are skipped, except that
 * a CR or LF is reported on its own as a one-byte result.  Otherwise the
 * function measures one chunk of text (stopping at whitespace, a token
 * character, an unsupported single-byte/multibyte mix, or SCWS_MAX_ZLEN
 * characters) and hands it to the matching segmenter.  s->off is advanced
 * past the consumed bytes.
 *
 * @param s  segmenter handle; s->txt, s->len, s->off and s->mode are read,
 *           s->off, s->res0 and s->res1 are written.
 * @return   s->res0 (head of the result chain filled in through SCWS_PUT_RES
 *           and the segment helpers), or NULL when the text is exhausted or
 *           no progress can be made.
 */
scws_res_t scws_get_result(scws_t s)
{
	int off, len, ch, clen, zlen, pflag;
	unsigned char *txt;

	/*
	 * Chunks that yield no result are retried from here.  A jump is used
	 * instead of the former self-call so that a long run of ignored input
	 * cannot grow the stack.
	 */
next_chunk:
	off = s->off;
	len = s->len;
	txt = s->txt;
	s->res0 = s->res1 = NULL;
	while ((off < len) && (txt[off] <= 0x20))
	{
		if (txt[off] == 0x0a || txt[off] == 0x0d)
		{
			/* line breaks are returned as their own one-byte result */
			s->off = off + 1;
			SCWS_PUT_RES(off, 0.0, 1, attr_un);
			return s->res0;
		}
		off++;
	}

	if (off >= len)
		return NULL;

	/* try to parse the sentence */
	s->off = off;
	ch = txt[off];
	if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
	{
		s->off++;
		SCWS_PUT_RES(off, 0.0, 1, attr_un);
		return s->res0;
	}
	clen = SCWS_CHARLEN(ch);
	zlen = 1;
	pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));
	while ((off = (off+clen)) < len)
	{
		ch = txt[off];
		if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch)) break;
		clen = SCWS_CHARLEN(ch);
		if (!(pflag & PFLAG_WITH_MB))
		{
			/* pure single-byte so far -> may switch to multibyte */
			if (clen == 1)
			{
				if (pflag & PFLAG_ALNUM)
				{
					if (SCWS_IS_ALPHA(ch))
					{
						/* two adjacent letters: remember a letter run exists */
						if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
							pflag |= PFLAG_LONGALPHA;
					}
					else if (SCWS_IS_DIGIT(ch))
					{
						/* two adjacent digits: remember a digit run exists */
						if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
							pflag |= PFLAG_LONGDIGIT;
					}
					else
						pflag ^= PFLAG_ALNUM;
				}
			}
			else
			{
				/* only a short (<= 2 chars) alnum prefix may be glued to multibyte text */
				if (!(pflag & PFLAG_ALNUM) || zlen > 2)
					break;

				pflag |= PFLAG_WITH_MB;
				/* zlen = 1; */
			}
		}
		else if ((pflag & PFLAG_WITH_MB) && clen == 1)
		{
			int i;

			/* multibyte + single-byte: only alphanumerics may be mixed in */
			if (!SCWS_IS_ALNUM(ch))
				break;

			pflag &= ~PFLAG_VALID;
			/*
			 * An alphanumeric run embedded in multibyte text may be at most
			 * 2 characters long (longer runs can stand as words on their own).
			 * Look ahead for the end of the run.
			 */
			for (i = off+1; i < (off+3); i++)
			{
				/* test the bound BEFORE touching txt[i]: never read past the buffer */
				if (i >= len)
				{
					pflag |= PFLAG_VALID;
					break;
				}

				ch = txt[i];
				if ((ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
				{
					pflag |= PFLAG_VALID;
					break;
				}

				if (!SCWS_IS_ALNUM(ch))
					break;
			}

			if (!(pflag & PFLAG_VALID))
				break;

			/* consume the whole short run as one step */
			clen += (i - off - 1);
		}
		/* hightman.070813: add max zlen limit */
		if (++zlen >= SCWS_MAX_ZLEN)
			break;
	}

	/*
	 * hightman.070624: handle a truncated ("half") multibyte character.
	 * `ch` keeps the un-adjusted end offset for the final s->off update.
	 */
	if ((ch = off) > len)
		off -= clen;

	/* do the real segment */
	if (off <= s->off)
		return NULL;
	else if (pflag & PFLAG_WITH_MB)
		_scws_msegment(s, off, zlen);
	else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
		_scws_ssegment(s, off);
	else
	{
		zlen = off - s->off;
		if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
			_scws_alnum_multi(s, s->off, zlen);
		else
		{
			float idf;

			idf = SCWS_EN_IDF(zlen);
			SCWS_PUT_RES(s->off, idf, zlen, attr_en);

			/*
			 * hightman.090523: split mixed letter/digit tokens once more:
			 * pure digits, (>1 ? pure letters : digits+letters)
			 */
			if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
				_scws_alnum_multi(s, s->off, zlen);
		}
	}

	/* return the result; skip to the end of text if the last character was truncated */
	s->off = (ch > len ? len : off);
	if (s->res0 == NULL)
		goto next_chunk;

	return s->res0;
}
|
1276
|
+
|
1277
|
+
/* free the result returned by scws_get_result */
|
1278
|
+
/**
 * Release every node of a result chain.
 *
 * @param result  head of the chain (may be NULL); each node is passed to
 *                free() after its `next` link has been saved.
 */
void scws_free_result(scws_res_t result)
{
	scws_res_t following;

	for (; result != NULL; result = following)
	{
		/* grab the link first: the node is gone after free() */
		following = result->next;
		free(result);
	}
}
|
1288
|
+
|
1289
|
+
/* top words count */
|
1290
|
+
// xattr = ~v,p,c
|
1291
|
+
// xattr = v,pn,c
|
1292
|
+
|
1293
|
+
static int _tops_cmp(a, b)
|
1294
|
+
scws_top_t *a,*b;
|
1295
|
+
{
|
1296
|
+
if ((*b)->weight > (*a)->weight)
|
1297
|
+
return 1;
|
1298
|
+
return -1;
|
1299
|
+
}
|
1300
|
+
|
1301
|
+
/**
 * Copy the value of `node` and of all nodes below it into `values`.
 *
 * The node itself is stored first, then its left subtree, then its right
 * subtree.  Each stored entry gets its `word` field pointed at the node key.
 *
 * @param node    subtree root, NULL for an empty subtree
 * @param values  output array; the caller must have sized it for every node
 * @param start   in/out: next free index in `values`
 */
static void _tops_load_node(node_t node, scws_top_t *values, int *start)
{
	scws_top_t entry;

	if (node == NULL)
		return;

	entry = node->value;
	entry->word = node->key;
	values[*start] = entry;
	*start += 1;

	_tops_load_node(node->left, values, start);
	_tops_load_node(node->right, values, start);
}
|
1315
|
+
|
1316
|
+
/**
 * Flatten every bucket tree of `xt` into `values`.
 *
 * @param xt      tree set; buckets 0 .. xt->prime-1 of xt->trees are walked
 * @param values  output array filled consecutively from index 0
 */
static void _tops_load_all(xtree_t xt, scws_top_t *values)
{
	int bucket = 0;
	int filled = 0;

	while (bucket < xt->prime)
	{
		_tops_load_node(xt->trees[bucket], values, &filled);
		bucket++;
	}
}
|
1323
|
+
|
1324
|
+
/* One attribute entry of a filter list; a list ends at an entry starting with '\0'. */
typedef char word_attr[4];

/**
 * Test whether attribute string `a` appears in the list `at`.
 *
 * @param a   NUL-terminated attribute to look up
 * @param at  list of entries terminated by an empty entry
 * @return    1 if the list is empty or contains an entry equal to `a`,
 *            0 otherwise
 */
static inline int _attr_belong(const char *a, word_attr *at)
{
	word_attr *entry;

	/* an empty list accepts everything */
	if ((*at)[0] == '\0')
		return 1;

	for (entry = at; (*entry)[0] != '\0'; entry++)
	{
		if (strcmp(a, *entry) == 0)
			return 1;
	}
	return 0;
}
|
1335
|
+
|
1336
|
+
/* macro to parse xattr -> xmode, at */
|
1337
|
+
/*
 * Parse the caller's `xattr` spec ("v,pn,c" or "~v,pn,c") into `xmode`, `at`.
 *
 * Uses these locals of the expanding function: xattr (char *, advanced while
 * parsing), xmode (set to SCWS_YEA when the spec starts with '~'), cnt (int
 * scratch), word (char * scratch) and at (word_attr *, receives a calloc'ed
 * list that the caller must free()).
 *
 * - The list gets one slot per comma-separated token plus one zeroed
 *   terminator slot, so a spec with many short or empty tokens can never
 *   overrun the buffer or leave it unterminated.
 * - Only the first two characters of each token are kept.
 * - Empty tokens (",," or a leading/trailing comma) are skipped.
 * - `at` stays NULL when xattr is NULL or empty, when no usable token is
 *   found, or when the allocation fails.
 */
#define __PARSE_XATTR__ do { \
	if (xattr == NULL) break; \
	if (*xattr == '~') { xattr++; xmode = SCWS_YEA; } \
	if (*xattr == '\0') break; \
	for (cnt = 2, word = xattr; (word = strchr(word, ',')); word++) \
		cnt++; \
	at = (word_attr *) calloc(cnt, sizeof(word_attr)); \
	if (at == NULL) break; \
	for (cnt = 0; ; xattr = word + 1) { \
		word = strchr(xattr, ','); \
		if (*xattr != '\0' && xattr != word) { \
			at[cnt][0] = xattr[0]; \
			if (xattr[1] != '\0' && (xattr + 1) != word) \
				at[cnt][1] = xattr[1]; \
			cnt++; \
		} \
		if (word == NULL) break; \
	} \
	if (cnt == 0) { free(at); at = NULL; } \
} while (0)
|
1352
|
+
|
1353
|
+
/**
 * Collect the heaviest words of the attached text.
 *
 * The whole text is re-segmented from offset 0 (s->off is saved and restored
 * around the scan).  Results with idf below 0.2 or an attribute starting
 * with '#' are ignored, as are results rejected by the attribute filter or
 * by the SCWS_IS_NOSTATS check applied to lower-cased attr_en words longer
 * than 6 bytes.  The idf values of repeated words are summed into `weight`
 * and occurrences are counted in `times`.
 *
 * @param s      segmenter handle with text attached
 * @param limit  maximum number of entries; 0 or anything above the number of
 *               distinct words means "all"; a negative value yields one entry
 * @param xattr  attribute filter such as "v,pn" (keep only these) or
 *               "~v,pn" (drop these); NULL or "" disables filtering
 * @return       newly allocated chain sorted by descending weight, to be
 *               released with scws_free_tops(); NULL when there is nothing
 *               to report or an allocation fails
 */
scws_top_t scws_get_tops(scws_t s, int limit, char *xattr)
{
	int off, cnt, xmode = SCWS_NA;
	xtree_t xt;
	scws_res_t res, cur;
	scws_top_t top, *list, tail, base;
	char *word;
	word_attr *at = NULL;

	if (!s || !s->txt || !(xt = xtree_new(0,1)))
		return NULL;

	__PARSE_XATTR__;

	/* save the offset, then scan the text from its beginning */
	off = s->off;
	s->off = cnt = 0;
	while ((cur = res = scws_get_result(s)) != NULL)
	{
		do
		{
			if (cur->idf < 0.2 || cur->attr[0] == '#')
				continue;

			/* check attribute filter */
			if (at != NULL)
			{
				if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
					continue;

				if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
					continue;
			}

			/* check stopwords */
			if (!strncmp(cur->attr, attr_en, 2) && cur->len > 6)
			{
				word = _mem_ndup(s->txt + cur->off, cur->len);
				_str_tolower(word, word);
				if (SCWS_IS_NOSTATS(word, cur->len))
				{
					free(word);
					continue;
				}
				free(word);
			}

			/* put to the stats: cnt counts distinct words */
			if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
			{
				top = (scws_top_t) pmalloc_z(xt->p, sizeof(struct scws_topword));
				top->weight = cur->idf;
				top->times = 1;
				strncpy(top->attr, cur->attr, 2);
				xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
				cnt++;
			}
			else
			{
				top->weight += cur->idf;
				top->times++;
			}
		}
		while ((cur = cur->next) != NULL);
		scws_free_result(res);
	}

	/* the filter list is no longer needed */
	free(at);

	top = NULL;
	if (cnt > 0 && (list = (scws_top_t *) malloc(sizeof(scws_top_t) * cnt)) != NULL)
	{
		int i;

		/* sort the list */
		_tops_load_all(xt, list);
		qsort(list, cnt, sizeof(scws_top_t), _tops_cmp);

		if (!limit || limit > cnt)
			limit = cnt;
		else if (limit < 0)
			limit = 1;	/* historical behaviour: the first entry is always returned */

		/*
		 * Copy the leading entries out of the tree's pool into a chain the
		 * caller owns.  Any allocation failure discards the partial chain.
		 */
		tail = NULL;
		for (i = 0; i < limit; i++)
		{
			base = (scws_top_t) malloc(sizeof(struct scws_topword));
			if (base != NULL)
			{
				memcpy(base, list[i], sizeof(struct scws_topword));
				base->word = strdup(list[i]->word);
				base->next = NULL;
				if (tail == NULL)
					top = base;
				else
					tail->next = base;
				tail = base;
			}

			if (base == NULL || base->word == NULL)
			{
				scws_free_tops(top);
				top = NULL;
				break;
			}
		}
		free(list);
	}

	/* restore the offset */
	s->off = off;
	xtree_free(xt);
	return top;
}
|
1457
|
+
|
1458
|
+
// word check by attr.
|
1459
|
+
/**
 * Tell whether the attached text contains a word selected by `xattr`.
 *
 * The text is re-segmented from offset 0 and s->off is restored afterwards.
 * Scanning stops at the first hit.
 *
 * @param s      segmenter handle with text attached
 * @param xattr  "v,pn": hit on a word having one of these attributes;
 *               "~v,pn": hit on a word having none of them.  With no
 *               filter (NULL or "") nothing can match.
 * @return       1 when a matching word exists, 0 otherwise (also when `s`
 *               or its text is missing)
 */
int scws_has_word(scws_t s, char *xattr)
{
	int off, cnt, xmode = SCWS_NA;
	scws_res_t res, cur;
	char *word;
	word_attr *at = NULL;
	int listed;

	if (!s || !s->txt)
		return 0;

	__PARSE_XATTR__;

	/* remember the position; cnt doubles as the return value */
	off = s->off;
	s->off = 0;
	cnt = 0;
	while (!cnt)
	{
		res = scws_get_result(s);
		if (res == NULL)
			break;

		for (cur = res; cur != NULL && !cnt; cur = cur->next)
		{
			if (at == NULL)
				continue;

			listed = _attr_belong(cur->attr, at);
			if ((xmode == SCWS_NA && listed) || (xmode == SCWS_YEA && !listed))
				cnt = 1;
		}
		scws_free_result(res);
	}

	/* memory leak fixed, thanks to lauxinz */
	if (at != NULL)
		free(at);

	s->off = off;
	return cnt;
}
|
1497
|
+
|
1498
|
+
// get words by attr (rand order)
|
1499
|
+
/**
 * Collect the distinct words of the attached text that pass the filter.
 *
 * The text is re-segmented from offset 0 and s->off is restored afterwards.
 * Entries appear in the order of first occurrence; the idf values of
 * repeated words are summed into `weight` and occurrences are counted in
 * `times`.
 *
 * @param s      segmenter handle with text attached
 * @param xattr  attribute filter such as "v,pn" (keep only these) or
 *               "~v,pn" (drop these); NULL or "" disables filtering
 * @return       newly allocated chain to be released with scws_free_tops(),
 *               or NULL when nothing was collected.  A word whose node
 *               cannot be allocated is left out of the chain.
 */
scws_top_t scws_get_words(scws_t s, char *xattr)
{
	int off, cnt, xmode = SCWS_NA;
	xtree_t xt;
	scws_res_t res, cur;
	scws_top_t top, tail, base;
	char *word;
	word_attr *at = NULL;

	if (!s || !s->txt || !(xt = xtree_new(0,1)))
		return NULL;

	__PARSE_XATTR__;

	/* save the offset, then scan the text from its beginning */
	off = s->off;
	s->off = 0;
	base = tail = NULL;
	while ((cur = res = scws_get_result(s)) != NULL)
	{
		do
		{
			/* check attribute filter */
			if (at != NULL)
			{
				if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
					continue;

				if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
					continue;
			}

			/* put to the stats; the tree only serves to detect repeats */
			if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
			{
				top = (scws_top_t) malloc(sizeof(struct scws_topword));
				if (top == NULL)
					continue;	/* out of memory: skip this word instead of crashing */

				top->weight = cur->idf;
				top->times = 1;
				top->next = NULL;
				top->word = (char *)_mem_ndup(s->txt + cur->off, cur->len);
				strncpy(top->attr, cur->attr, 2);
				/* add to the chain */
				if (tail == NULL)
					base = tail = top;
				else
				{
					tail->next = top;
					tail = top;
				}
				xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
			}
			else
			{
				top->weight += cur->idf;
				top->times++;
			}
		}
		while ((cur = cur->next) != NULL);
		scws_free_result(res);
	}

	/* free at & xtree (the chain nodes were malloc'ed and outlive the tree) */
	free(at);
	xtree_free(xt);

	/* restore the offset */
	s->off = off;
	return base;
}
|
1569
|
+
|
1570
|
+
/**
 * Release a chain of top-word entries together with their word strings.
 *
 * @param tops  head of the chain (may be NULL)
 */
void scws_free_tops(scws_top_t tops)
{
	scws_top_t following;

	for (; tops != NULL; tops = following)
	{
		/* save the link before the node is released */
		following = tops->next;
		if (tops->word != NULL)
			free(tops->word);
		free(tops);
	}
}
|