scws4r 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/ext/scws4r/scws.c ADDED
@@ -0,0 +1,1581 @@
1
+ /*
2
+ * @file scws.c (core segment functions)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id $
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #ifdef WIN32
13
+ # include "config_win32.h"
14
+ #endif
15
+
16
+ #include "scws.h"
17
+ #include "xdict.h"
18
+ #include "rule.h"
19
+ #include "charset.h"
20
+ #include "darray.h"
21
+ #include "xtree.h"
22
+ #include <stdio.h>
23
+ #include <math.h>
24
+ #include <stdlib.h>
25
+ #include <string.h>
26
+
27
+ /* Quick helper macros for frequently used checks. Several of them expand to expressions that use a local variable named `s` (the engine), so they only work inside functions that have one. */
28
+ #define SCWS_IS_SPECIAL(x,l) scws_rule_checkbit(s->r,x,l,SCWS_RULE_SPECIAL) /* asks the rule set of `s` about the SPECIAL bit for string x of length l */
29
+ #define SCWS_IS_NOSTATS(x,l) scws_rule_checkbit(s->r,x,l,SCWS_RULE_NOSTATS) /* same query for the NOSTATS bit */
30
+ #define SCWS_CHARLEN(x) s->mblen[(x)] /* table lookup in s->mblen indexed by byte value x; used below as the byte length of the character starting with x */
31
+ #define SCWS_IS_ALNUM(x) (((x)>=48&&(x)<=57)||((x)>=65&&(x)<=90)||((x)>=97&&(x)<=122)) /* ASCII 0-9, A-Z, a-z */
32
+ #define SCWS_IS_ALPHA(x) (((x)>=65&&(x)<=90)||((x)>=97&&(x)<=122)) /* ASCII A-Z, a-z */
33
+ #define SCWS_IS_UALPHA(x) ((x)>=65&&(x)<=90) /* ASCII A-Z */
34
+ #define SCWS_IS_DIGIT(x) ((x)>=48&&(x)<=57) /* ASCII 0-9 */
35
+ #define SCWS_IS_WHEAD(x) ((x) & SCWS_ZFLAG_WHEAD) /* flag test: WHEAD bit set */
36
+ #define SCWS_IS_ECHAR(x) ((x) & SCWS_ZFLAG_ENGLISH) /* flag test: ENGLISH bit set */
37
+ #define SCWS_NO_RULE1(x) (((x) & (SCWS_ZFLAG_SYMBOL|SCWS_ZFLAG_ENGLISH))||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_NR2)) == SCWS_ZFLAG_WHEAD)) /* true for symbol/english cells, and for cells with WHEAD set but NR2 clear */
38
+ ///#define SCWS_NO_RULE2(x) (((x) & SCWS_ZFLAG_ENGLISH)||(((x) & (SCWS_ZFLAG_WHEAD|SCWS_ZFLAG_N2)) == SCWS_ZFLAG_WHEAD))
39
+ #define SCWS_NO_RULE2 SCWS_NO_RULE1 /* currently an alias of SCWS_NO_RULE1 */
40
+ #define SCWS_MAX_EWLEN 33 /* size of the run-length buffer in _scws_alnum_multi and the length cap applied to single-byte words */
41
+ ///hightman.070706: char token
42
+ #define SCWS_CHAR_TOKEN(x) ((x)=='('||(x)==')'||(x)=='['||(x)==']'||(x)=='{'||(x)=='}'||(x)==':'||(x)=='"') /* brackets, colon and double quote */
43
+ ///hightman.070814: max zlen = ?? (4 * zlen * zlen = ??)
44
+ #define SCWS_MAX_ZLEN 128 /* scws_get_result stops extending a segment once its character count reaches this value */
45
+ #define SCWS_EN_IDF(x) (float)(2.5*logf(x)) /* idf assigned to alphanumeric words, computed from their byte length x */
46
+
47
+ static const char *attr_en = "en"; /* attribute put on alphanumeric words emitted by this file */
48
+ static const char *attr_un = "un"; /* attribute put on symbols and on characters the dictionary does not know */
49
+ static const char *attr_nr = "nr"; /* compared against two-character dictionary words to set SCWS_ZFLAG_NR2 */
50
+ static const char *attr_na = "!"; /* dictionary entries carrying this attribute are not copied into the word map */
51
+
52
+ /* create scws engine */
53
+ scws_t scws_new()
54
+ {
55
+ scws_t s;
56
+ s = (scws_t) malloc(sizeof(scws_st));
57
+ if (s == NULL)
58
+ return s;
59
+ memset(s, 0, sizeof(scws_st));
60
+ s->mblen = charset_table_get(NULL);
61
+ s->off = s->len = 0;
62
+ s->wend = -1;
63
+
64
+ return s;
65
+ }
66
+
67
+ /* hightman.110320: fork scws */
68
+ scws_t scws_fork(scws_t p)
69
+ {
70
+ scws_t s = scws_new();
71
+
72
+ if (p != NULL && s != NULL)
73
+ {
74
+ s->mblen = p->mblen;
75
+ s->mode = p->mode;
76
+ // fork dict/rules
77
+ s->r = scws_rule_fork(p->r);
78
+ s->d = xdict_fork(p->d);
79
+ }
80
+
81
+ return s;
82
+ }
83
+
84
+ /* close & free the engine */
85
+ void scws_free(scws_t s)
86
+ {
87
+ if (s->d)
88
+ {
89
+ xdict_close(s->d);
90
+ s->d = NULL;
91
+ }
92
+ if (s->r)
93
+ {
94
+ scws_rule_free(s->r);
95
+ s->r = NULL;
96
+ }
97
+ free(s);
98
+ }
99
+
100
+ /* add a dict into scws */
101
+ int scws_add_dict(scws_t s, const char *fpath, int mode)
102
+ {
103
+ xdict_t xx;
104
+ if (mode & SCWS_XDICT_SET)
105
+ {
106
+ xdict_close(s->d);
107
+ mode ^= SCWS_XDICT_SET;
108
+ s->d = NULL;
109
+ }
110
+ xx = s->d;
111
+ s->d = xdict_add(s->d, fpath, mode, s->mblen);
112
+ return (xx == s->d ? -1 : 0);
113
+ }
114
+
115
+ /* set the dict & open it */
116
+ int scws_set_dict(scws_t s, const char *fpath, int mode)
117
+ {
118
+ return scws_add_dict(s, fpath, mode | SCWS_XDICT_SET);
119
+ }
120
+
121
+ void scws_set_charset(scws_t s, const char *cs)
122
+ {
123
+ s->mblen = charset_table_get(cs);
124
+ }
125
+
126
+ void scws_set_rule(scws_t s, const char *fpath)
127
+ {
128
+ if (s->r != NULL)
129
+ scws_rule_free(s->r);
130
+
131
+ s->r = scws_rule_new(fpath, s->mblen);
132
+ }
133
+
134
+ /* set ignore symbol or multi segments */
135
+ void scws_set_ignore(scws_t s, int yes)
136
+ {
137
+ if (yes == SCWS_YEA)
138
+ s->mode |= SCWS_IGN_SYMBOL;
139
+
140
+ if (yes == SCWS_NA)
141
+ s->mode &= ~SCWS_IGN_SYMBOL;
142
+ }
143
+
144
+ void scws_set_multi(scws_t s, int mode)
145
+ {
146
+ s->mode &= ~SCWS_MULTI_MASK;
147
+
148
+ if (mode & SCWS_MULTI_MASK)
149
+ s->mode |= mode;
150
+ }
151
+
152
+ void scws_set_debug(scws_t s, int yes)
153
+ {
154
+ if (yes == SCWS_YEA)
155
+ s->mode |= SCWS_DEBUG;
156
+
157
+ if (yes == SCWS_NA)
158
+ s->mode &= ~SCWS_DEBUG;
159
+ }
160
+
161
+ void scws_set_duality(scws_t s, int yes)
162
+ {
163
+ if (yes == SCWS_YEA)
164
+ s->mode |= SCWS_DUALITY;
165
+
166
+ if (yes == SCWS_NA)
167
+ s->mode &= ~SCWS_DUALITY;
168
+ }
169
+
170
+ /* send the text buffer & init some others */
171
+ void scws_send_text(scws_t s, const char *text, int len)
172
+ {
173
+ s->txt = (unsigned char *) text;
174
+ s->len = len;
175
+ s->off = 0;
176
+ }
177
+
178
/*
 * SCWS_PUT_RES(o, i, l, a): append one result node to the pending result
 * list of the engine `s` (a variable of that name must be in scope).
 *
 *   o - byte offset of the word in the text
 *   i - idf value
 *   l - byte length of the word
 *   a - attribute string; at most 2 bytes are copied and the copy is
 *       always NUL-terminated
 *
 * s->res0 is the list head and s->res1 the tail. If the node cannot be
 * allocated the macro appends nothing and leaves the list unchanged.
 */
#define SCWS_PUT_RES(o,i,l,a)                                       \
do {                                                                \
    scws_res_t res;                                                 \
    res = (scws_res_t) malloc(sizeof(struct scws_result));          \
    if (res == NULL)                                                \
        break;                                                      \
    res->off = (o);                                                 \
    res->idf = (i);                                                 \
    res->len = (l);                                                 \
    strncpy(res->attr, (a), 2);                                     \
    res->attr[2] = '\0';                                            \
    res->next = NULL;                                               \
    if (s->res1 == NULL)                                            \
        s->res1 = s->res0 = res;                                    \
    else                                                            \
    {                                                               \
        s->res1->next = res;                                        \
        s->res1 = res;                                              \
    }                                                               \
} while(0)
197
+
198
+ /* single bytes segment (pure single-byte characters): state bits used by the scanners in this file */
199
+ #define PFLAG_WITH_MB 0x01 /* the segment contains a multibyte character */
200
+ #define PFLAG_ALNUM 0x02 /* every byte seen so far is an ASCII letter or digit */
201
+ #define PFLAG_VALID 0x04 /* scratch bit for the look-ahead in scws_get_result */
202
+ #define PFLAG_DIGIT 0x08 /* the current word/run is made of digits */
203
+ #define PFLAG_ADDSYM 0x10 /* a joining symbol or class change was accepted (also marks a non-alnum run in _scws_alnum_multi) */
204
+ #define PFLAG_ALPHA 0x20 /* the current run is made of letters */
205
+ #define PFLAG_LONGDIGIT 0x40 /* two adjacent digits were seen */
206
+ #define PFLAG_LONGALPHA 0x80 /* two adjacent letters were seen */
207
+
208
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * lowercase letters to uppercase. `dst` may be the same buffer as `src`.
 * The terminating NUL is not written to `dst`.
 */
static void _str_toupper(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        *dst = c;
        if (c >= 'a' && c <= 'z')
            *dst ^= 0x20;
    }
}
217
+
218
/*
 * Copy the NUL-terminated string `src` into `dst`, converting ASCII
 * uppercase letters to lowercase. `dst` may be the same buffer as `src`.
 * The terminating NUL is not written to `dst`.
 */
static void _str_tolower(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        *dst = c;
        if (c >= 'A' && c <= 'Z')
            *dst ^= 0x20;
    }
}
227
+
228
#ifdef HAVE_STRNDUP
#define _mem_ndup strndup
#else
/*
 * Fallback for platforms without strndup(): duplicate the first `len`
 * bytes of `src` into a freshly allocated, NUL-terminated buffer.
 *
 * Returns NULL when `src` is NULL, `len` is negative, or the allocation
 * fails (the same failure signal strndup() gives). The caller owns the
 * returned buffer and releases it with free().
 */
static inline void *_mem_ndup(const char *src, int len)
{
    char *dst;

    if (src == NULL || len < 0)
        return NULL;

    dst = malloc((size_t) len + 1);
    if (dst == NULL)
        return NULL;

    memcpy(dst, src, (size_t) len);
    dst[len] = '\0';
    return dst;
}
#endif
240
+
241
+ static void _scws_alnum_multi(scws_t s, int start, int wlen) /* Re-split the single-byte span txt[start, start+wlen) into runs of digits / letters / other bytes and append extra attr_en results for those runs. Nothing is emitted when the span is a single run. */
242
+ {
243
+ char chunk[SCWS_MAX_EWLEN]; /* chunk[n] = byte length of the n-th run */
244
+ int i, j, k, ch, pflag;
245
+ unsigned char *txt;
246
+ float idf;
247
+
248
+ txt = s->txt;
249
+ pflag = 0;
250
+ for (i = j = k = 0; i < wlen; i++) /* pass 1: j counts finished runs, k is the index where the current run began, pflag is the class of the current run */
251
+ {
252
+ ch = txt[start + i];
253
+ if (SCWS_IS_DIGIT(ch))
254
+ {
255
+ if (pflag & PFLAG_DIGIT)
256
+ continue;
257
+ if (pflag != 0) /* class changed: close the previous run */
258
+ {
259
+ chunk[j++] = (char) (i-k);
260
+ k = i;
261
+ }
262
+ pflag = PFLAG_DIGIT;
263
+ }
264
+ else if (SCWS_IS_ALPHA(ch))
265
+ {
266
+ if (pflag & PFLAG_ALPHA)
267
+ continue;
268
+ if (pflag != 0)
269
+ {
270
+ chunk[j++] = (char) (i-k);
271
+ k = i;
272
+ }
273
+ pflag = PFLAG_ALPHA;
274
+ }
275
+ else /* any other byte: PFLAG_ADDSYM marks a run of non-alphanumerics */
276
+ {
277
+ if (pflag & PFLAG_ADDSYM)
278
+ continue;
279
+ if (pflag != 0)
280
+ {
281
+ chunk[j++] = (char) (i-k);
282
+ k = i;
283
+ }
284
+ pflag = PFLAG_ADDSYM;
285
+ }
286
+ }
287
+
288
+ if (j > 0) /* pass 2: only when the span has more than one run */
289
+ {
290
+ chunk[j] = (char) (i-k); /* length of the last, still open run */
291
+ ch = start; /* from here on ch is the text offset of run i */
292
+ for (i = 0; i <= j; i++)
293
+ {
294
+ if (!SCWS_IS_ALNUM(txt[ch]))
295
+ {
296
+ // just skip
297
+ }
298
+ else if (chunk[i] == 1) /* a one-byte run is not emitted alone next to an alnum neighbour; it is glued to that neighbour */
299
+ {
300
+ if (i > 0 && chunk[i-1] > 1 && (i != 1 || i != j))
301
+ {
302
+ if (!SCWS_IS_ALNUM(txt[ch-1]))
303
+ {
304
+ idf = SCWS_EN_IDF(chunk[i]);
305
+ SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
306
+ }
307
+ else /* previous run + this byte */
308
+ {
309
+ idf = SCWS_EN_IDF(chunk[i-1]+1);
310
+ SCWS_PUT_RES(ch - chunk[i-1], idf, chunk[i-1]+1, attr_en);
311
+ }
312
+ }
313
+ if (i < j && (i != 0 || j != 1))
314
+ {
315
+ if (!SCWS_IS_ALNUM(txt[ch+1]))
316
+ {
317
+ idf = SCWS_EN_IDF(chunk[i]);
318
+ SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
319
+ }
320
+ else /* this byte + next run */
321
+ {
322
+ idf = SCWS_EN_IDF(chunk[i+1]+1);
323
+ SCWS_PUT_RES(ch, idf, chunk[i+1]+1, attr_en);
324
+ }
325
+ }
326
+ }
327
+ else /* alnum run longer than one byte: emit it as is */
328
+ {
329
+ idf = SCWS_EN_IDF(chunk[i]);
330
+ SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
331
+ }
332
+ ch += chunk[i];
333
+ }
334
+ }
335
+ }
336
+
337
+ static void _scws_ssegment(scws_t s, int end)
338
+ {
339
+ int start, wlen, ch, pflag, ipflag = 0;
340
+ unsigned char *txt;
341
+ float idf;
342
+
343
+ start = s->off;
344
+ wlen = end - start;
345
+
346
+ /* check special words (need strtoupper) */
347
+ if (wlen > 1)
348
+ {
349
+ txt = (char *) _mem_ndup(s->txt + start, wlen);
350
+ _str_toupper(txt, txt);
351
+ if (SCWS_IS_SPECIAL(txt, wlen))
352
+ {
353
+ SCWS_PUT_RES(start, 9.5, wlen, "nz");
354
+ free(txt);
355
+ return;
356
+ }
357
+ free(txt);
358
+ }
359
+
360
+ txt = s->txt;
361
+ /* check brief words such as S.H.E M.R. */
362
+ if (SCWS_IS_ALPHA(txt[start]) && txt[start+1] == '.')
363
+ {
364
+ for (ch = start + 2; ch < end; ch++)
365
+ {
366
+ if (!SCWS_IS_ALPHA(txt[ch])) break;
367
+ ch++;
368
+ if (ch == end || txt[ch] != '.') break;
369
+ }
370
+ if (ch == end)
371
+ {
372
+ SCWS_PUT_RES(start, 7.5, wlen, "nz");
373
+ return;
374
+ }
375
+ }
376
+
377
+ /* 取出单词及标点. 数字允许一个点且下一个为数字,不连续的. 字母允许一个不连续的' */
378
+ while (start < end)
379
+ {
380
+ ch = txt[start++];
381
+ if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
382
+ ipflag = 0;
383
+ if (SCWS_IS_ALNUM(ch))
384
+ {
385
+ pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
386
+ wlen = 1;
387
+ while (start < end)
388
+ {
389
+ ch = txt[start];
390
+ if (pflag & PFLAG_DIGIT)
391
+ {
392
+ if (!SCWS_IS_DIGIT(ch))
393
+ {
394
+ // check percent % = 0x25
395
+ if (ch == 0x25 && !SCWS_IS_DIGIT(txt[start+1]))
396
+ {
397
+ start++;
398
+ wlen++;
399
+ break;
400
+ }
401
+ if (ipflag)
402
+ break;
403
+ // special for IP address or version number? (find out all digit + dot)
404
+ if (ch == 0x2e && (pflag & PFLAG_ADDSYM))
405
+ {
406
+ ipflag = 1;
407
+ while(--wlen && txt[--start] != 0x2e);
408
+ pflag = 0;
409
+ break;
410
+ }
411
+ // wlen = 1
412
+ if (wlen == 1 && SCWS_IS_ALPHA(ch))
413
+ {
414
+ pflag ^= PFLAG_DIGIT;
415
+ pflag |= PFLAG_ADDSYM;
416
+ continue;
417
+ }
418
+ // strict must add: !$this->_is_digit(ord($this->txt[$start+1])))
419
+ if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(txt[start+1])))
420
+ break;
421
+ pflag |= PFLAG_ADDSYM;
422
+ }
423
+ }
424
+ else
425
+ {
426
+ /* hightman.110419: - 出现在字母中间允许连接(0x2d), _ 允许连接(0x5f) */
427
+ if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(txt[start+1]))
428
+ pflag |= PFLAG_ADDSYM;
429
+ else if (!SCWS_IS_ALPHA(ch))
430
+ {
431
+ if ((pflag & PFLAG_ADDSYM)
432
+ || !((ch == 0x27 && SCWS_IS_ALPHA(txt[start+1]))
433
+ || (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(txt[start+1]))))
434
+ {
435
+ break;
436
+ }
437
+ pflag |= PFLAG_ADDSYM;
438
+ }
439
+ }
440
+ start++;
441
+ wlen++;
442
+ if (wlen >= SCWS_MAX_EWLEN)
443
+ break;
444
+ }
445
+ idf = SCWS_EN_IDF(wlen);
446
+ SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
447
+ if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
448
+ _scws_alnum_multi(s, start-wlen, wlen);
449
+ }
450
+ else if (!(s->mode & SCWS_IGN_SYMBOL))
451
+ {
452
+ SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
453
+ }
454
+ }
455
+ }
456
+
457
+ /* multibyte segment */
458
+ static int _scws_mget_word(scws_t s, int i, int j)
459
+ {
460
+ int r, k;
461
+ word_t item;
462
+
463
+ if (!(s->wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
464
+ return i;
465
+
466
+ for (r=i, k=i+1; k <= j; k++)
467
+ {
468
+ item = s->wmap[i][k];
469
+ if (item && (item->flag & SCWS_WORD_FULL))
470
+ {
471
+ r = k;
472
+ if (!(item->flag & SCWS_WORD_PART))
473
+ break;
474
+ }
475
+ }
476
+ return r;
477
+ }
478
+
479
+ static void _scws_mset_word(scws_t s, int i, int j) /* Emit the word wmap[i][j] (character cells i..j) as a result, plus whatever extra results the duality and multi-segment modes ask for. */
479
+ {
480
+ word_t item;
481
+
482
+ item = s->wmap[i][j];
483
+ /* hightman.070705: added the item == NULL check to guard against unsigned char overflow on over-long words (more than 255 characters) */
484
+ if ((item == NULL) || ((s->mode & SCWS_IGN_SYMBOL) /* also drops non-english "un" items when symbols are ignored */
485
+ && !SCWS_IS_ECHAR(item->flag) && !memcmp(item->attr, attr_un, 2)))
486
+ return;
487
+
488
+ /* hightman.070701: automatic bigram (duality) aggregation of loose single characters */
489
+ if (s->mode & SCWS_DUALITY)
490
+ {
491
+ int k = s->zis; /* previous pending single-char index (may carry SCWS_ZIS_USED); negative when there is none */
492
+
493
+ if (i == j && !SCWS_IS_ECHAR(item->flag) && memcmp(item->attr, attr_un, 2)) /* a loose single, non-english, non-"un" character */
494
+ {
495
+ s->zis = i;
496
+ if (k < 0) /* no earlier single char to pair with: just remember this one */
497
+ return;
498
+
499
+ i = (k & ~SCWS_ZIS_USED);
500
+ if ((i != (j-1)) || (!(k & SCWS_ZIS_USED) && s->wend == i))
501
+ {
502
+ SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
503
+ if (i != (j-1)) /* the earlier char is not adjacent, so no pair is formed */
504
+ return;
505
+ }
506
+ s->zis |= SCWS_ZIS_USED;
507
+ }
508
+ else
509
+ {
510
+ if ((k >= 0) && (!(k & SCWS_ZIS_USED) || (j > i))) /* flush the pending single char before this word */
511
+ {
512
+ k &= ~SCWS_ZIS_USED;
513
+ SCWS_PUT_RES(s->zmap[k].start, s->wmap[k][k]->idf, (s->zmap[k].end - s->zmap[k].start), s->wmap[k][k]->attr);
514
+ }
515
+ if (j > i)
516
+ s->wend = j + 1;
517
+ s->zis = -1;
518
+ }
519
+ }
520
+
521
+ SCWS_PUT_RES(s->zmap[i].start, item->idf, (s->zmap[j].end - s->zmap[i].start), item->attr); /* the word itself (i may have been moved back by the duality branch) */
522
+
523
+ // hightman.070902: multi segment
524
+ // step1: split to short words
525
+ if ((j-i) > 1)
526
+ {
527
+ int n, k, m = i;
528
+ if (s->mode & SCWS_MULTI_SHORT)
529
+ {
530
+ while (m < j)
531
+ {
532
+ k = m;
533
+ // hightman.111223: multi short enhanced
534
+ for (n = m + 1; n <= j; n++)
535
+ {
536
+ // 3 chars at most
537
+ if ((n == j && m == i) || (n - m) > 2) break;
538
+ item = s->wmap[m][n];
539
+ if (!item) continue;
540
+ // first shortest or last longest word
541
+ if ((item->flag & SCWS_WORD_FULL) && (k == m || n == j))
542
+ k = n;
543
+ if (!(item->flag & SCWS_WORD_PART)) break;
544
+ }
545
+ // short word not found, stop to find, passed to next loop
546
+ if (k == m)
547
+ break;
548
+
549
+ // save the short word
550
+ item = s->wmap[m][k];
551
+ SCWS_PUT_RES(s->zmap[m].start, item->idf, (s->zmap[k].end - s->zmap[m].start), item->attr);
552
+ // find the next word or go to prev for duality last word
553
+ if ((m = k + 1) == j)
554
+ {
555
+ m--;
556
+ break;
557
+ }
558
+ }
559
+ }
560
+
561
+ if (s->mode & SCWS_MULTI_DUALITY) /* continues from m, i.e. after whatever the short-word pass consumed */
562
+ {
563
+ while (m < j)
564
+ {
565
+ if (SCWS_IS_ECHAR(s->wmap[m][m]->flag)) /* english cell: emitted alone */
566
+ {
567
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
568
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
569
+ }
570
+ else if (SCWS_IS_ECHAR(s->wmap[m+1][m+1]->flag)) /* next cell is english: no pair across it */
571
+ {
572
+ if (m == i)
573
+ {
574
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
575
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
576
+ }
577
+ m++;
578
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
579
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
580
+ }
581
+ else /* emit the two-cell pair m..m+1, using the idf/attr of cell m */
582
+ {
583
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m+1].end - s->zmap[m].start), s->wmap[m][m]->attr);
584
+ }
585
+ m++;
586
+ if (m == j && (SCWS_IS_ECHAR(s->wmap[m][m]->flag) || SCWS_IS_ECHAR(s->wmap[m-1][m-1]->flag)))
587
+ {
588
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
589
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
590
+ }
591
+ }
592
+ }
593
+ }
594
+
595
+ // step2, split to single char
596
+ if ((j > i) && (s->mode & (SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL)))
597
+ {
598
+ if ((j - i) == 1 && !s->wmap[i][j])
599
+ {
600
+ if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT) i++;
601
+ else s->wmap[i][i]->flag |= SCWS_ZFLAG_PUT;
602
+ s->wmap[j][j]->flag |= SCWS_ZFLAG_PUT;
603
+ }
604
+ do
605
+ {
606
+ if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT) /* already emitted as a single cell */
607
+ continue;
608
+ if (!(s->mode & SCWS_MULTI_ZALL) && !strchr("jnv", s->wmap[i][i]->attr[0])) /* ZMAIN only: keep cells whose attr starts with j, n or v */
609
+ continue;
610
+ SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
611
+ }
612
+ while (++i <= j);
613
+ }
614
+ }
616
+
617
+ static void _scws_mseg_zone(scws_t s, int f, int t)
618
+ {
619
+ unsigned char *mpath, *npath;
620
+ word_t **wmap;
621
+ int x,i,j,m,n,j2,sz;
622
+ double weight, nweight;
623
+ char attr1[3];
624
+
625
+ mpath = npath = NULL;
626
+ weight = nweight = (double) 0.0;
627
+
628
+ wmap = s->wmap;
629
+ j2 = 0;
630
+ for (x = i = f; i <= t; i++)
631
+ {
632
+ j = _scws_mget_word(s, i, (x > i ? x - 1 : t));
633
+ if (j == i) continue;
634
+ // skip NR in NR
635
+ if (j < j2 && wmap[i][j]->attr[0] == 'n' && wmap[i][j]->attr[1] == 'r') continue;
636
+ if (i > j2 && (wmap[i][j]->flag & SCWS_WORD_USED)) continue;
637
+
638
+ /* one word only */
639
+ if (i == f && j == t)
640
+ {
641
+ mpath = (unsigned char *) malloc(2);
642
+ mpath[0] = j - i;
643
+ mpath[1] = 0xff;
644
+ break;
645
+ }
646
+
647
+ if (i != f && (wmap[i][j]->flag & SCWS_WORD_RULE))
648
+ continue;
649
+
650
+ /* create the new path */
651
+ wmap[i][j]->flag |= SCWS_WORD_USED;
652
+ nweight = (double) wmap[i][j]->tf * pow(j-i,4);
653
+
654
+ if (npath == NULL)
655
+ {
656
+ npath = (unsigned char *) malloc(t-f+2);
657
+ memset(npath, 0xff, t-f+2);
658
+ }
659
+
660
+ /* lookfor backward */
661
+ x = sz = 0;
662
+ memset(attr1, 0, sizeof(attr1));
663
+ for (m = f; m < i; m = n+1)
664
+ {
665
+ n = _scws_mget_word(s, m, i-1);
666
+ nweight *= wmap[m][n]->tf;
667
+ npath[x++] = n - m;
668
+ if (n > m)
669
+ {
670
+ nweight *= pow(n-m,4);
671
+ wmap[m][n]->flag |= SCWS_WORD_USED;
672
+ }
673
+ else sz++;
674
+
675
+ if (attr1[0] != '\0')
676
+ nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
677
+ memcpy(attr1, wmap[m][n]->attr, 2);
678
+ }
679
+
680
+ /* my self */
681
+ npath[x++] = j - i;
682
+
683
+ if (attr1[0] != '\0')
684
+ nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[i][j]->attr, &npath[x-2]);
685
+ memcpy(attr1, wmap[i][j]->attr, 2);
686
+
687
+ /* lookfor forward */
688
+ for (m = j+1; m <= t; m = n+1)
689
+ {
690
+ n = _scws_mget_word(s, m, t);
691
+ nweight *= wmap[m][n]->tf;
692
+ npath[x++] = n - m;
693
+ if (n > m)
694
+ {
695
+ nweight *= pow(n-m,4);
696
+ wmap[m][n]->flag |= SCWS_WORD_USED;
697
+ }
698
+ else sz++;
699
+
700
+ nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
701
+ memcpy(attr1, wmap[m][n]->attr, 2);
702
+ }
703
+
704
+ npath[x] = 0xff;
705
+ nweight /= pow(x+sz-1,5);
706
+
707
+ /* draw the path for debug */
708
+ #ifdef DEBUG
709
+ if (s->mode & SCWS_DEBUG)
710
+ {
711
+ fprintf(stderr, "PATH by keyword = %.*s, (weight=%.4f):\n",
712
+ s->zmap[j].end - s->zmap[i].start, s->txt + s->zmap[i].start, nweight);
713
+ for (x = 0, m = f; (n = npath[x]) != 0xff; x++)
714
+ {
715
+ n += m;
716
+ fprintf(stderr, "%.*s ", s->zmap[n].end - s->zmap[m].start, s->txt + s->zmap[m].start);
717
+ m = n + 1;
718
+ }
719
+ fprintf(stderr, "\n--\n");
720
+ }
721
+ #endif
722
+
723
+ j2 = x = j;
724
+ if ((x - i) > 1) i--;
725
+ /* check better path */
726
+ if (nweight > weight)
727
+ {
728
+ unsigned char *swap;
729
+
730
+ weight = nweight;
731
+ swap = mpath;
732
+ mpath = npath;
733
+ npath = swap;
734
+ }
735
+ }
736
+
737
+ /* set the result, mpath != NULL */
738
+ if (mpath == NULL)
739
+ return;
740
+
741
+ for (x = 0, m = f; (n = mpath[x]) != 0xff; x++)
742
+ {
743
+ n += m;
744
+ _scws_mset_word(s, m, n);
745
+ m = n + 1;
746
+ }
747
+
748
+ /* 一口.070808: memory leak fixed. */
749
+ if (mpath) free(mpath);
750
+ if (npath) free(npath);
751
+ }
752
+
753
+ /* Quick loop guards for the auto-rule checks in _scws_msegment; each expands to an `if (...) break;` and relies on the locals j, zlen, wmap, zmap, txt, r1 and s of that function. CHECKER1 stops at the end of the zone or on a cell excluded by SCWS_NO_RULE2, CHECKER2 does the same towards the start (j < 0), CHECKER3 stops when cell j is rejected by scws_rule_check for rule r1. */
754
+ #define ___ZRULE_CHECKER1___ \
755
+ if (j >= zlen || SCWS_NO_RULE2(wmap[j][j]->flag)) \
756
+ break;
757
+
758
+ #define ___ZRULE_CHECKER2___ \
759
+ if (j < 0 || SCWS_NO_RULE2(wmap[j][j]->flag)) \
760
+ break;
761
+
762
+ #define ___ZRULE_CHECKER3___ \
763
+ if (!scws_rule_check(s->r, r1, txt + zmap[j].start, zmap[j].end - zmap[j].start)) \
764
+ break;
765
+
766
+ static void _scws_msegment(scws_t s, int end, int zlen) /* Segment the span txt[s->off, end), which contains multibyte characters: build the per-character map (zmap) and the word table (wmap), optionally extend it with rule-derived words, then cut it into zones and emit results. zlen is the caller's character-count estimate used to size the tables. */
767
+ {
768
+ word_t **wmap, query;
769
+ struct scws_zchar *zmap;
770
+ unsigned char *txt;
771
+ #ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just depend on dictionary */
772
+ rule_item_t r1;
773
+ #endif
774
+ int i, j, k, ch, clen, start;
775
+ pool_t p;
776
+
777
+ /* pool used to management some dynamic memory */
778
+ p = pool_new();
779
+
780
+ /* create wmap & zmap */
781
+ wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t)); /* wmap[a][b]: word covering cells a..b, or NULL */
782
+ zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar)); /* zmap[a]: byte range [start, end) of cell a */
783
+ txt = s->txt;
784
+ start = s->off;
785
+ s->zis = -1;
786
+
787
+ for (i = 0; start < end; i++) /* pass 1: one cell per multibyte character; a run of single-byte characters becomes a single cell */
788
+ {
789
+ ch = txt[start];
790
+ clen = SCWS_CHARLEN(ch);
791
+ if (clen == 1)
792
+ {
793
+ while (start++ < end) /* swallow the whole single-byte run; clen becomes its byte length */
794
+ {
795
+ ch = txt[start];
796
+ if (start == end || SCWS_CHARLEN(txt[start]) > 1)
797
+ break;
798
+ clen++;
799
+ }
800
+ wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
801
+ wmap[i][i]->tf = 0.5;
802
+ wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
803
+ strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un); /* attribute decided by the last byte of the run */
804
+ }
805
+ else
806
+ {
807
+ query = xdict_query(s->d, txt + start, clen);
808
+ wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
809
+ if (query == NULL) /* character unknown to the dictionary */
810
+ {
811
+ wmap[i][i]->tf = 0.5;
812
+ wmap[i][i]->idf = 0.0;
813
+ wmap[i][i]->flag = 0;
814
+ strcpy(wmap[i][i]->attr, attr_un);
815
+ }
816
+ else
817
+ {
818
+ ch = query->flag; /* remember the original flags (needed for the MALLOCED test below) */
819
+ query->flag = SCWS_WORD_FULL; /* NOTE(review): this writes through the pointer returned by xdict_query before copying - confirm that is intended for entries that are not SCWS_WORD_MALLOCED */
820
+ memcpy(wmap[i][i], query, sizeof(word_st));
821
+ if (query->attr[0] == '#')
822
+ wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;
823
+
824
+ if (ch & SCWS_WORD_MALLOCED)
825
+ free(query);
826
+ }
827
+ start += clen;
828
+ }
829
+
830
+ zmap[i].start = start - clen;
831
+ zmap[i].end = start;
832
+ }
833
+
834
+ /* fixed real zlength */
835
+ zlen = i;
836
+
837
+ /* create word query table */
838
+ for (i = 0; i < zlen; i++)
839
+ {
840
+ k = 0;
841
+ for (j = i+1; j < zlen; j++) /* try ever longer candidates starting at cell i */
842
+ {
843
+ query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
844
+ if (query == NULL)
845
+ break;
846
+ ch = query->flag;
847
+ if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2))
848
+ {
849
+ wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
850
+ memcpy(wmap[i][j], query, sizeof(word_st));
851
+
852
+ wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
853
+
854
+ for (k = i+1; k <= j; k++)
855
+ wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
856
+ }
857
+
858
+ if (ch & SCWS_WORD_MALLOCED)
859
+ free(query);
860
+
861
+ if (!(ch & SCWS_WORD_PART)) /* no longer word can start with this prefix */
862
+ break;
863
+ }
864
+
865
+ if (k--) /* k is non-zero only if a full word starting at i was stored; after the decrement it is the end cell of the last one */
866
+ {
867
+ /* set nr2 to some short name */
868
+ if ((k == (i+1)))
869
+ {
870
+ if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
871
+ wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
872
+ //if (wmap[i][k]->attr[0] == 'n')
873
+ //wmap[i][i]->flag |= SCWS_ZFLAG_N2;
874
+ }
875
+
876
+ /* clean the PART flag for the last word */
877
+ if (k < j)
878
+ wmap[i][k]->flag ^= SCWS_WORD_PART;
879
+ }
880
+ }
881
+
882
+ if (s->r == NULL)
883
+ goto do_segment;
884
+
885
+ #ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just depend on dictionary */
886
+ /* auto rule set for name & zone & chinese numeric */
887
+
888
+ /* one word auto rule check */
889
+ for (i = 0; i < zlen; i++)
890
+ {
891
+ if (SCWS_NO_RULE1(wmap[i][i]->flag))
892
+ continue;
893
+
894
+ r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
895
+ if (r1 == NULL)
896
+ continue;
897
+
898
+ clen = r1->zmin > 0 ? r1->zmin : 1; /* from here on clen counts cells, not bytes */
899
+ if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
900
+ {
901
+ /* prefix, check after (zmin~zmax) */
902
+ // first check that all of the first zmin characters satisfy the rule
903
+ // then, within the zmax range, collect the characters that satisfy it
904
+ // int i, j, k, ch, clen, start;
905
+ for (ch = 1; ch <= clen; ch++)
906
+ {
907
+ j = i + ch;
908
+ ___ZRULE_CHECKER1___
909
+ ___ZRULE_CHECKER3___
910
+ }
911
+
912
+ if (ch <= clen)
913
+ continue;
914
+
915
+ /* no limit znum or limit to a range */
916
+ j = i + ch;
917
+ while (1)
918
+ {
919
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
920
+ break;
921
+ ___ZRULE_CHECKER1___
922
+ ___ZRULE_CHECKER3___
923
+ clen++;
924
+ j++;
925
+ }
926
+
927
+ // note the case where an original 2-character person name is still 2 characters after recognition
928
+ if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
929
+ {
930
+ if (clen == 1)
931
+ continue;
932
+ wmap[i][i+1]->flag |= SCWS_WORD_PART;
933
+ }
934
+
935
+ /* ok, got: i & clen */
936
+ k = i + clen;
937
+ wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
938
+ wmap[i][k]->tf = r1->tf;
939
+ wmap[i][k]->idf = r1->idf;
940
+ wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
941
+ strncpy(wmap[i][k]->attr, r1->attr, 2);
942
+
943
+ wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
944
+ for (j = i+1; j <= k; j++)
945
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
946
+
947
+ if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
948
+ i = k;
949
+
950
+ continue;
951
+ }
952
+
953
+ if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
954
+ {
955
+ /* suffix, check before */
956
+ for (ch = 1; ch <= clen; ch++)
957
+ {
958
+ j = i - ch;
959
+ ___ZRULE_CHECKER2___
960
+ ___ZRULE_CHECKER3___
961
+ }
962
+
963
+ if (ch <= clen)
964
+ continue;
965
+
966
+ /* no limit znum or limit to a range */
967
+ j = i - ch;
968
+ while (1)
969
+ {
970
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
971
+ break;
972
+ ___ZRULE_CHECKER2___
973
+ ___ZRULE_CHECKER3___
974
+ clen++;
975
+ j--;
976
+ }
977
+
978
+ /* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
979
+ k = i - clen;
980
+ if (wmap[k][i] != NULL)
981
+ continue;
982
+
983
+ wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
984
+ wmap[k][i]->tf = r1->tf;
985
+ wmap[k][i]->idf = r1->idf;
986
+ wmap[k][i]->flag = SCWS_WORD_FULL;
987
+ strncpy(wmap[k][i]->attr, r1->attr, 2);
988
+
989
+ wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
990
+ for (j = k+1; j <= i; j++)
991
+ {
992
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
993
+ if ((j != i) && (wmap[k][j] != NULL))
994
+ wmap[k][j]->flag |= SCWS_WORD_PART;
995
+ }
996
+ continue;
997
+ }
998
+ }
999
+
1000
+ /* two words auto rule check (a two-character prefix followed by more characters, or characters followed by a two-character suffix) */
1001
+ for (i = zlen - 2; i >= 0; i--)
1002
+ {
1003
+ /* with value ==> must be have SCWS_WORD_FULL, so needn't check it ag. */
1004
+ if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
1005
+ continue;
1006
+
1007
+ k = i+1;
1008
+ r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
1009
+ if (r1 == NULL)
1010
+ continue;
1011
+
1012
+ clen = r1->zmin > 0 ? r1->zmin : 1;
1013
+ if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
1014
+ {
1015
+ for (ch = 1; ch <= clen; ch++)
1016
+ {
1017
+ j = k + ch;
1018
+ ___ZRULE_CHECKER1___
1019
+ ___ZRULE_CHECKER3___
1020
+ }
1021
+
1022
+ if (ch <= clen)
1023
+ continue;
1024
+
1025
+ /* no limit znum or limit to a range */
1026
+ j = k + ch;
1027
+ while (1)
1028
+ {
1029
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
1030
+ break;
1031
+ ___ZRULE_CHECKER1___
1032
+ ___ZRULE_CHECKER3___
1033
+ clen++;
1034
+ j++;
1035
+ }
1036
+
1037
+ /* ok, got: i & clen */
1038
+ k = k + clen;
1039
+ wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
1040
+ wmap[i][k]->tf = r1->tf;
1041
+ wmap[i][k]->idf = r1->idf;
1042
+ wmap[i][k]->flag = SCWS_WORD_FULL;
1043
+ strncpy(wmap[i][k]->attr, r1->attr, 2);
1044
+
1045
+ wmap[i][i+1]->flag |= SCWS_WORD_PART;
1046
+ for (j = i+2; j <= k; j++)
1047
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
1048
+
1049
+ i--;
1050
+ continue;
1051
+ }
1052
+
1053
+ if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
1054
+ {
1055
+ /* suffix, check before */
1056
+ for (ch = 1; ch <= clen; ch++)
1057
+ {
1058
+ j = i - ch;
1059
+ ___ZRULE_CHECKER2___
1060
+ ___ZRULE_CHECKER3___
1061
+ }
1062
+
1063
+ if (ch <= clen)
1064
+ continue;
1065
+
1066
+ /* no limit znum or limit to a range */
1067
+ j = i - ch;
1068
+ while (1)
1069
+ {
1070
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
1071
+ break;
1072
+ ___ZRULE_CHECKER2___
1073
+ ___ZRULE_CHECKER3___
1074
+ clen++;
1075
+ j--;
1076
+ }
1077
+
1078
+ /* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
1079
+ k = i - clen;
1080
+ i = i + 1;
1081
+ wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
1082
+ wmap[k][i]->tf = r1->tf;
1083
+ wmap[k][i]->idf = r1->idf;
1084
+ wmap[k][i]->flag = SCWS_WORD_FULL;
1085
+ strncpy(wmap[k][i]->attr, r1->attr, 2);
1086
+
1087
+ wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
1088
+ for (j = k+1; j <= i; j++)
1089
+ {
1090
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
1091
+ if (wmap[k][j] != NULL)
1092
+ wmap[k][j]->flag |= SCWS_WORD_PART;
1093
+ }
1094
+
1095
+ i -= (clen+1);
1096
+ continue;
1097
+ }
1098
+ }
1099
+ #endif
1100
+
1101
+ /* real do the segment */
1102
+ do_segment:
1103
+
1104
+ /* find the easy break point */
1105
+ for (i = 0, j = 0; i < zlen; i++) /* j is the first cell of the zone that is still open */
1106
+ {
1107
+ if (wmap[i][i]->flag & SCWS_ZFLAG_WPART) /* cell lies inside some word: the zone continues */
1108
+ continue;
1109
+
1110
+ if (i > j)
1111
+ _scws_mseg_zone(s, j, i-1);
1112
+
1113
+ j = i;
1114
+ if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD)) /* neither part nor head of a word: emit the cell on its own */
1115
+ {
1116
+ _scws_mset_word(s, i, i);
1117
+ j++;
1118
+ }
1119
+ }
1120
+
1121
+ /* the lastest zone */
1122
+ if (i > j)
1123
+ _scws_mseg_zone(s, j, i-1);
1124
+
1125
+ /* the last single for duality */
1126
+ if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))
1127
+ {
1128
+ i = s->zis;
1129
+ SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
1130
+ }
1131
+
1132
+ /* free the wmap & zmap */
1133
+ pool_free(p);
1134
+ darray_free((void **) wmap);
1135
+ }
1136
+
1137
+ scws_res_t scws_get_result(scws_t s)
1138
+ {
1139
+ int off, len, ch, clen, zlen, pflag;
1140
+ unsigned char *txt;
1141
+
1142
+ off = s->off;
1143
+ len = s->len;
1144
+ txt = s->txt;
1145
+ s->res0 = s->res1 = NULL;
1146
+ while ((off < len) && (txt[off] <= 0x20))
1147
+ {
1148
+ if (txt[off] == 0x0a || txt[off] == 0x0d)
1149
+ {
1150
+ s->off = off + 1;
1151
+ SCWS_PUT_RES(off, 0.0, 1, attr_un);
1152
+ return s->res0;
1153
+ }
1154
+ off++;
1155
+ }
1156
+
1157
+ if (off >= len)
1158
+ return NULL;
1159
+
1160
+ /* try to parse the sentence */
1161
+ s->off = off;
1162
+ ch = txt[off];
1163
+ if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
1164
+ {
1165
+ s->off++;
1166
+ SCWS_PUT_RES(off, 0.0, 1, attr_un);
1167
+ return s->res0;
1168
+ }
1169
+ clen = SCWS_CHARLEN(ch);
1170
+ zlen = 1;
1171
+ pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));
1172
+ while ((off = (off+clen)) < len)
1173
+ {
1174
+ ch = txt[off];
1175
+ if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch)) break;
1176
+ clen = SCWS_CHARLEN(ch);
1177
+ if (!(pflag & PFLAG_WITH_MB))
1178
+ {
1179
+ // pure single-byte -> multibyte (2bytes)
1180
+ if (clen == 1)
1181
+ {
1182
+ if (pflag & PFLAG_ALNUM)
1183
+ {
1184
+ if (SCWS_IS_ALPHA(ch))
1185
+ {
1186
+ if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
1187
+ pflag |= PFLAG_LONGALPHA;
1188
+ }
1189
+ else if (SCWS_IS_DIGIT(ch))
1190
+ {
1191
+ if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
1192
+ pflag |= PFLAG_LONGDIGIT;
1193
+ }
1194
+ else
1195
+ pflag ^= PFLAG_ALNUM;
1196
+ }
1197
+ }
1198
+ else
1199
+ {
1200
+ if (!(pflag & PFLAG_ALNUM) || zlen > 2)
1201
+ break;
1202
+
1203
+ pflag |= PFLAG_WITH_MB;
1204
+ /* zlen = 1; */
1205
+ }
1206
+ }
1207
+ else if ((pflag & PFLAG_WITH_MB) && clen == 1)
1208
+ {
1209
+ int i;
1210
+
1211
+ // mb + single-byte. allowd: alpha+num + 中文
1212
+ if (!SCWS_IS_ALNUM(ch))
1213
+ break;
1214
+
1215
+ pflag &= ~PFLAG_VALID;
1216
+ // 夹在中文间的英文数字最多允许 2 个字符 (超过2可以独立成词没啥问题)
1217
+ for (i = off+1; i < (off+3); i++)
1218
+ {
1219
+ ch = txt[i];
1220
+ if ((i >= len) || (ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
1221
+ {
1222
+ pflag |= PFLAG_VALID;
1223
+ break;
1224
+ }
1225
+
1226
+ if (!SCWS_IS_ALNUM(ch))
1227
+ break;
1228
+ }
1229
+
1230
+ if (!(pflag & PFLAG_VALID))
1231
+ break;
1232
+
1233
+ clen += (i - off - 1);
1234
+ }
1235
+ /* hightman.070813: add max zlen limit */
1236
+ if (++zlen >= SCWS_MAX_ZLEN)
1237
+ break;
1238
+ }
1239
+
1240
+ /* hightman.070624: 处理半个字的问题 */
1241
+ if ((ch = off) > len)
1242
+ off -= clen;
1243
+
1244
+ /* do the real segment */
1245
+ if (off <= s->off)
1246
+ return NULL;
1247
+ else if (pflag & PFLAG_WITH_MB)
1248
+ _scws_msegment(s, off, zlen);
1249
+ else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
1250
+ _scws_ssegment(s, off);
1251
+ else
1252
+ {
1253
+ zlen = off - s->off;
1254
+ if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
1255
+ _scws_alnum_multi(s, s->off, zlen);
1256
+ else
1257
+ {
1258
+ float idf;
1259
+
1260
+ idf = SCWS_EN_IDF(zlen);
1261
+ SCWS_PUT_RES(s->off, idf, zlen, attr_en);
1262
+
1263
+ /* hightman.090523: 为字母数字混合再度拆解, 纯数字, (>1 ? 纯字母 : 数字+字母) */
1264
+ if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
1265
+ _scws_alnum_multi(s, s->off, zlen);
1266
+ }
1267
+ }
1268
+
1269
+ /* return the result */
1270
+ s->off = (ch > len ? len : off);
1271
+ if (s->res0 == NULL)
1272
+ return scws_get_result(s);
1273
+
1274
+ return s->res0;
1275
+ }
1276
+
1277
+ /* free the result retunned by scws_get_result */
1278
+ void scws_free_result(scws_res_t result)
1279
+ {
1280
+ scws_res_t cur;
1281
+
1282
+ while ((cur = result) != NULL)
1283
+ {
1284
+ result = cur->next;
1285
+ free(cur);
1286
+ }
1287
+ }
1288
+
1289
+ /* top words count */
1290
+ // xattr = ~v,p,c
1291
+ // xattr = v,pn,c
1292
+
1293
+ static int _tops_cmp(a, b)
1294
+ scws_top_t *a,*b;
1295
+ {
1296
+ if ((*b)->weight > (*a)->weight)
1297
+ return 1;
1298
+ return -1;
1299
+ }
1300
+
1301
+ static void _tops_load_node(node_t node, scws_top_t *values, int *start)
1302
+ {
1303
+ int i = *start;
1304
+
1305
+ if (node == NULL)
1306
+ return;
1307
+
1308
+ values[i] = node->value;
1309
+ values[i]->word = node->key;
1310
+
1311
+ *start = ++i;
1312
+ _tops_load_node(node->left, values, start);
1313
+ _tops_load_node(node->right, values, start);
1314
+ }
1315
+
1316
+ static void _tops_load_all(xtree_t xt, scws_top_t *values)
1317
+ {
1318
+ int i, start;
1319
+
1320
+ for (i = 0, start = 0; i < xt->prime; i++)
1321
+ _tops_load_node(xt->trees[i], values, &start);
1322
+ }
1323
+
1324
/* One attribute name stored as a NUL-terminated string in a 4-byte slot. */
typedef char word_attr[4];

/*
 * Test whether attribute `a` appears in the list `at`.
 *
 * `at` is an array of word_attr slots terminated by a slot whose first byte
 * is NUL. A list whose very first slot is empty matches everything.
 * Returns 1 on a match, 0 otherwise.
 */
static inline int _attr_belong(const char *a, word_attr *at)
{
	int idx;

	/* an empty list accepts any attribute */
	if (at[0][0] == '\0')
		return 1;

	for (idx = 0; at[idx][0] != '\0'; idx++)
	{
		if (strcmp(a, at[idx]) == 0)
			return 1;
	}
	return 0;
}
1335
+
1336
/*
 * Macro: parse the attribute filter text `xattr` into `xmode` and `at`.
 *
 * It expands inside a function that declares these locals:
 *   char *xattr    - filter text such as "v,pn,c" or "~v,p,c"; the pointer is
 *                    advanced while parsing (the text itself is only read)
 *   int xmode      - set to SCWS_YEA when the text starts with '~'
 *   int cnt        - scratch index into `at`
 *   char *word     - scratch pointer to the next ','
 *   word_attr *at  - must be NULL on entry; receives a calloc()ed list of
 *                    attribute names terminated by an empty slot, which the
 *                    enclosing function must free()
 *
 * Each comma-separated name is truncated to its first two characters.
 * Empty names (",," or a leading ',') are skipped. Every stored name that is
 * followed by a comma therefore consumes at least two bytes of the text,
 * which is what makes strlen/2 + 2 slots sufficient (names + terminator).
 * `at` stays NULL when xattr is NULL or empty, when it names no attribute at
 * all, or when the allocation fails; callers treat NULL as "no filter".
 */
#define __PARSE_XATTR__ do { \
	if (xattr == NULL) break; \
	if (*xattr == '~') { xattr++; xmode = SCWS_YEA; } \
	if (*xattr == '\0') break; \
	at = (word_attr *) calloc((strlen(xattr) / 2) + 2, sizeof(word_attr)); \
	if (at == NULL) break; \
	for (cnt = 0; (word = strchr(xattr, ',')) != NULL; xattr = word + 1) { \
		if (word == xattr) continue; \
		at[cnt][0] = xattr[0]; \
		at[cnt][1] = ((word - xattr) > 1) ? xattr[1] : '\0'; \
		cnt++; \
	} \
	strncpy(at[cnt], xattr, 2); \
	if (at[0][0] == '\0') { free(at); at = NULL; } \
} while (0)
1352
+
1353
+ scws_top_t scws_get_tops(scws_t s, int limit, char *xattr)
1354
+ {
1355
+ int off, cnt, xmode = SCWS_NA;
1356
+ xtree_t xt;
1357
+ scws_res_t res, cur;
1358
+ scws_top_t top, *list, tail, base;
1359
+ char *word;
1360
+ word_attr *at = NULL;
1361
+
1362
+ if (!s || !s->txt || !(xt = xtree_new(0,1)))
1363
+ return NULL;
1364
+
1365
+ __PARSE_XATTR__;
1366
+
1367
+ // save the offset.
1368
+ off = s->off;
1369
+ s->off = cnt = 0;
1370
+ while ((cur = res = scws_get_result(s)) != NULL)
1371
+ {
1372
+ do
1373
+ {
1374
+ if (cur->idf < 0.2 || cur->attr[0] == '#')
1375
+ continue;
1376
+
1377
+ /* check attribute filter */
1378
+ if (at != NULL)
1379
+ {
1380
+ if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1381
+ continue;
1382
+
1383
+ if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1384
+ continue;
1385
+ }
1386
+
1387
+ /* check stopwords */
1388
+ if (!strncmp(cur->attr, attr_en, 2) && cur->len > 6)
1389
+ {
1390
+ word = _mem_ndup(s->txt + cur->off, cur->len);
1391
+ _str_tolower(word, word);
1392
+ if (SCWS_IS_NOSTATS(word, cur->len))
1393
+ {
1394
+ free(word);
1395
+ continue;
1396
+ }
1397
+ free(word);
1398
+ }
1399
+
1400
+ /* put to the stats */
1401
+ if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1402
+ {
1403
+ top = (scws_top_t) pmalloc_z(xt->p, sizeof(struct scws_topword));
1404
+ top->weight = cur->idf;
1405
+ top->times = 1;
1406
+ strncpy(top->attr, cur->attr, 2);
1407
+ xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1408
+ cnt++;
1409
+ }
1410
+ else
1411
+ {
1412
+ top->weight += cur->idf;
1413
+ top->times++;
1414
+ }
1415
+ }
1416
+ while ((cur = cur->next) != NULL);
1417
+ scws_free_result(res);
1418
+ }
1419
+
1420
+ // free at
1421
+ if (at != NULL)
1422
+ free(at);
1423
+ top = NULL;
1424
+ if (cnt > 0)
1425
+ {
1426
+ /* sort the list */
1427
+ list = (scws_top_t *) malloc(sizeof(scws_top_t) * cnt);
1428
+ _tops_load_all(xt, list);
1429
+ qsort(list, cnt, sizeof(scws_top_t), _tops_cmp);
1430
+
1431
+ /* save to return pointer */
1432
+ if (!limit || limit > cnt)
1433
+ limit = cnt;
1434
+
1435
+ top = tail = (scws_top_t) malloc(sizeof(struct scws_topword));
1436
+ memcpy(top, list[0], sizeof(struct scws_topword));
1437
+ top->word = strdup(list[0]->word);
1438
+ top->next = NULL;
1439
+
1440
+ for (cnt = 1; cnt < limit; cnt++)
1441
+ {
1442
+ base = (scws_top_t) malloc(sizeof(struct scws_topword));
1443
+ memcpy(base, list[cnt], sizeof(struct scws_topword));
1444
+ base->word = strdup(list[cnt]->word);
1445
+ base->next = NULL;
1446
+ tail->next = base;
1447
+ tail = base;
1448
+ }
1449
+ free(list);
1450
+ }
1451
+
1452
+ // restore the offset
1453
+ s->off = off;
1454
+ xtree_free(xt);
1455
+ return top;
1456
+ }
1457
+
1458
+ // word check by attr.
1459
+ int scws_has_word(scws_t s, char *xattr)
1460
+ {
1461
+ int off, cnt, xmode = SCWS_NA;
1462
+ scws_res_t res, cur;
1463
+ char *word;
1464
+ word_attr *at = NULL;
1465
+
1466
+ if (!s || !s->txt)
1467
+ return 0;
1468
+
1469
+ __PARSE_XATTR__;
1470
+
1471
+ // save the offset. (cnt -> return_value)
1472
+ off = s->off;
1473
+ cnt = s->off = 0;
1474
+ while (!cnt && (cur = res = scws_get_result(s)) != NULL)
1475
+ {
1476
+ do
1477
+ {
1478
+ /* check attribute filter */
1479
+ if (at != NULL)
1480
+ {
1481
+ if ((xmode == SCWS_NA) && _attr_belong(cur->attr, at))
1482
+ cnt = 1;
1483
+
1484
+ if ((xmode == SCWS_YEA) && !_attr_belong(cur->attr, at))
1485
+ cnt = 1;
1486
+ }
1487
+ }
1488
+ while (!cnt && (cur = cur->next) != NULL);
1489
+ scws_free_result(res);
1490
+ }
1491
+ // memory leak fixed, thanks to lauxinz
1492
+ if (at != NULL)
1493
+ free(at);
1494
+ s->off = off;
1495
+ return cnt;
1496
+ }
1497
+
1498
+ // get words by attr (rand order)
1499
+ scws_top_t scws_get_words(scws_t s, char *xattr)
1500
+ {
1501
+ int off, cnt, xmode = SCWS_NA;
1502
+ xtree_t xt;
1503
+ scws_res_t res, cur;
1504
+ scws_top_t top, tail, base;
1505
+ char *word;
1506
+ word_attr *at = NULL;
1507
+
1508
+ if (!s || !s->txt || !(xt = xtree_new(0,1)))
1509
+ return NULL;
1510
+
1511
+ __PARSE_XATTR__;
1512
+
1513
+ // save the offset.
1514
+ off = s->off;
1515
+ s->off = 0;
1516
+ base = tail = NULL;
1517
+ while ((cur = res = scws_get_result(s)) != NULL)
1518
+ {
1519
+ do
1520
+ {
1521
+ /* check attribute filter */
1522
+ if (at != NULL)
1523
+ {
1524
+ if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1525
+ continue;
1526
+
1527
+ if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1528
+ continue;
1529
+ }
1530
+
1531
+ /* put to the stats */
1532
+ if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1533
+ {
1534
+ top = (scws_top_t) malloc(sizeof(struct scws_topword));
1535
+ top->weight = cur->idf;
1536
+ top->times = 1;
1537
+ top->next = NULL;
1538
+ top->word = (char *)_mem_ndup(s->txt + cur->off, cur->len);
1539
+ strncpy(top->attr, cur->attr, 2);
1540
+ // add to the chain
1541
+ if (tail == NULL)
1542
+ base = tail = top;
1543
+ else
1544
+ {
1545
+ tail->next = top;
1546
+ tail = top;
1547
+ }
1548
+ xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1549
+ }
1550
+ else
1551
+ {
1552
+ top->weight += cur->idf;
1553
+ top->times++;
1554
+ }
1555
+ }
1556
+ while ((cur = cur->next) != NULL);
1557
+ scws_free_result(res);
1558
+ }
1559
+
1560
+ // free at & xtree
1561
+ if (at != NULL)
1562
+ free(at);
1563
+ xtree_free(xt);
1564
+
1565
+ // restore the offset
1566
+ s->off = off;
1567
+ return base;
1568
+ }
1569
+
1570
+ void scws_free_tops(scws_top_t tops)
1571
+ {
1572
+ scws_top_t cur;
1573
+
1574
+ while ((cur = tops) != NULL)
1575
+ {
1576
+ tops = cur->next;
1577
+ if (cur->word)
1578
+ free(cur->word);
1579
+ free(cur);
1580
+ }
1581
+ }