scws4r 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/scws4r/scws.c ADDED
@@ -0,0 +1,1581 @@
1
+ /*
2
+ * @file scws.c (core segment functions)
3
+ * @author Hightman Mar
4
+ * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5
+ * $Id $
6
+ */
7
+
8
+ #ifdef HAVE_CONFIG_H
9
+ # include "config.h"
10
+ #endif
11
+
12
+ #ifdef WIN32
13
+ # include "config_win32.h"
14
+ #endif
15
+
16
+ #include "scws.h"
17
+ #include "xdict.h"
18
+ #include "rule.h"
19
+ #include "charset.h"
20
+ #include "darray.h"
21
+ #include "xtree.h"
22
+ #include <stdio.h>
23
+ #include <math.h>
24
+ #include <stdlib.h>
25
+ #include <string.h>
26
+
/*
 * Shorthand macros used throughout this file.
 *
 * SCWS_IS_SPECIAL, SCWS_IS_NOSTATS and SCWS_CHARLEN expand to expressions
 * that use a variable named `s` (the engine) from the calling scope.
 * The character-class macros evaluate their argument more than once, so
 * never pass an expression with side effects.
 */
#define SCWS_IS_SPECIAL(x,l)    scws_rule_checkbit(s->r, x, l, SCWS_RULE_SPECIAL)
#define SCWS_IS_NOSTATS(x,l)    scws_rule_checkbit(s->r, x, l, SCWS_RULE_NOSTATS)
#define SCWS_CHARLEN(x)         (s->mblen[(x)])

/* byte classes: 0x30-0x39 digits, 0x41-0x5a upper case, 0x61-0x7a lower case */
#define SCWS_IS_DIGIT(x)        ((x) >= 0x30 && (x) <= 0x39)
#define SCWS_IS_UALPHA(x)       ((x) >= 0x41 && (x) <= 0x5a)
#define SCWS_IS_ALPHA(x)        (SCWS_IS_UALPHA(x) || ((x) >= 0x61 && (x) <= 0x7a))
#define SCWS_IS_ALNUM(x)        (SCWS_IS_DIGIT(x) || SCWS_IS_ALPHA(x))

/* tests on the per-character flag word */
#define SCWS_IS_WHEAD(x)        ((x) & SCWS_ZFLAG_WHEAD)
#define SCWS_IS_ECHAR(x)        ((x) & SCWS_ZFLAG_ENGLISH)

/*
 * True when flag word x carries SYMBOL or ENGLISH, or carries WHEAD
 * without NR2.  A separate rule-2 variant keyed on SCWS_ZFLAG_N2 is
 * disabled; rule 2 is an alias of rule 1.
 */
#define SCWS_NO_RULE1(x)        (((x) & (SCWS_ZFLAG_SYMBOL | SCWS_ZFLAG_ENGLISH)) \
    || (((x) & (SCWS_ZFLAG_WHEAD | SCWS_ZFLAG_NR2)) == SCWS_ZFLAG_WHEAD))
#define SCWS_NO_RULE2           SCWS_NO_RULE1

/* length limit applied to single-byte (alphanumeric) words */
#define SCWS_MAX_EWLEN          33

/* hightman.070706: characters that always form a token of their own */
#define SCWS_CHAR_TOKEN(x)      ((x) == '(' || (x) == ')' || (x) == '[' || (x) == ']' \
    || (x) == '{' || (x) == '}' || (x) == ':' || (x) == '"')

/* hightman.070814: max zlen = ?? (4 * zlen * zlen = ??) */
#define SCWS_MAX_ZLEN           128

/* idf assigned to an alphanumeric word, derived from its length x */
#define SCWS_EN_IDF(x)          ((float) (2.5 * logf(x)))

/* two-letter attribute tags used by this file */
static const char *attr_en = "en";
static const char *attr_un = "un";
static const char *attr_nr = "nr";
static const char *attr_na = "!";
51
+
52
+ /* create scws engine */
53
+ scws_t scws_new()
54
+ {
55
+ scws_t s;
56
+ s = (scws_t) malloc(sizeof(scws_st));
57
+ if (s == NULL)
58
+ return s;
59
+ memset(s, 0, sizeof(scws_st));
60
+ s->mblen = charset_table_get(NULL);
61
+ s->off = s->len = 0;
62
+ s->wend = -1;
63
+
64
+ return s;
65
+ }
66
+
67
+ /* hightman.110320: fork scws */
68
+ scws_t scws_fork(scws_t p)
69
+ {
70
+ scws_t s = scws_new();
71
+
72
+ if (p != NULL && s != NULL)
73
+ {
74
+ s->mblen = p->mblen;
75
+ s->mode = p->mode;
76
+ // fork dict/rules
77
+ s->r = scws_rule_fork(p->r);
78
+ s->d = xdict_fork(p->d);
79
+ }
80
+
81
+ return s;
82
+ }
83
+
84
+ /* close & free the engine */
85
+ void scws_free(scws_t s)
86
+ {
87
+ if (s->d)
88
+ {
89
+ xdict_close(s->d);
90
+ s->d = NULL;
91
+ }
92
+ if (s->r)
93
+ {
94
+ scws_rule_free(s->r);
95
+ s->r = NULL;
96
+ }
97
+ free(s);
98
+ }
99
+
100
+ /* add a dict into scws */
101
+ int scws_add_dict(scws_t s, const char *fpath, int mode)
102
+ {
103
+ xdict_t xx;
104
+ if (mode & SCWS_XDICT_SET)
105
+ {
106
+ xdict_close(s->d);
107
+ mode ^= SCWS_XDICT_SET;
108
+ s->d = NULL;
109
+ }
110
+ xx = s->d;
111
+ s->d = xdict_add(s->d, fpath, mode, s->mblen);
112
+ return (xx == s->d ? -1 : 0);
113
+ }
114
+
115
+ /* set the dict & open it */
116
+ int scws_set_dict(scws_t s, const char *fpath, int mode)
117
+ {
118
+ return scws_add_dict(s, fpath, mode | SCWS_XDICT_SET);
119
+ }
120
+
121
+ void scws_set_charset(scws_t s, const char *cs)
122
+ { /* Select the per-byte length table for charset name `cs`: stores whatever charset_table_get(cs) returns in s->mblen.
+  * NOTE(review): scws_set_rule() passes s->mblen to scws_rule_new(), so presumably the charset
+  * should be chosen before rules are loaded -- confirm against scws_rule_new(). */
123
+ s->mblen = charset_table_get(cs); /* scws_new() performs the same lookup with a NULL name */
124
+ }
125
+
126
+ void scws_set_rule(scws_t s, const char *fpath)
127
+ {
128
+ if (s->r != NULL)
129
+ scws_rule_free(s->r);
130
+
131
+ s->r = scws_rule_new(fpath, s->mblen);
132
+ }
133
+
134
+ /* set ignore symbol or multi segments */
135
+ void scws_set_ignore(scws_t s, int yes)
136
+ {
137
+ if (yes == SCWS_YEA)
138
+ s->mode |= SCWS_IGN_SYMBOL;
139
+
140
+ if (yes == SCWS_NA)
141
+ s->mode &= ~SCWS_IGN_SYMBOL;
142
+ }
143
+
144
+ void scws_set_multi(scws_t s, int mode)
145
+ {
146
+ s->mode &= ~SCWS_MULTI_MASK;
147
+
148
+ if (mode & SCWS_MULTI_MASK)
149
+ s->mode |= mode;
150
+ }
151
+
152
+ void scws_set_debug(scws_t s, int yes)
153
+ {
154
+ if (yes == SCWS_YEA)
155
+ s->mode |= SCWS_DEBUG;
156
+
157
+ if (yes == SCWS_NA)
158
+ s->mode &= ~SCWS_DEBUG;
159
+ }
160
+
161
+ void scws_set_duality(scws_t s, int yes)
162
+ {
163
+ if (yes == SCWS_YEA)
164
+ s->mode |= SCWS_DUALITY;
165
+
166
+ if (yes == SCWS_NA)
167
+ s->mode &= ~SCWS_DUALITY;
168
+ }
169
+
170
+ /* send the text buffer & init some others */
171
+ void scws_send_text(scws_t s, const char *text, int len)
172
+ {
173
+ s->txt = (unsigned char *) text;
174
+ s->len = len;
175
+ s->off = 0;
176
+ }
177
+
/*
 * Append one result node to the engine's result list.
 *
 *   o  byte offset of the word
 *   i  idf value
 *   l  byte length of the word
 *   a  attribute string; at most 2 characters are copied
 *
 * Expects a variable named `s` (the engine) in the calling scope; s->res0
 * is the list head and s->res1 the tail.  If the node cannot be allocated
 * the result is dropped and the list is left unchanged.
 */
#define SCWS_PUT_RES(o,i,l,a)                                           \
do {                                                                    \
    scws_res_t res;                                                     \
    res = (scws_res_t) malloc(sizeof(struct scws_result));              \
    if (res != NULL)                                                    \
    {                                                                   \
        res->off = o;                                                   \
        res->idf = i;                                                   \
        res->len = l;                                                   \
        strncpy(res->attr, a, 2);                                       \
        res->attr[2] = '\0';                                            \
        res->next = NULL;                                               \
        if (s->res1 == NULL)                                            \
            s->res1 = s->res0 = res;                                    \
        else                                                            \
        {                                                               \
            s->res1->next = res;                                        \
            s->res1 = res;                                              \
        }                                                               \
    }                                                                   \
} while(0)
197
+
/* single bytes segment (pure single-byte characters): parser state bits */
#define PFLAG_WITH_MB       (1 << 0)
#define PFLAG_ALNUM         (1 << 1)
#define PFLAG_VALID         (1 << 2)
#define PFLAG_DIGIT         (1 << 3)
#define PFLAG_ADDSYM        (1 << 4)
#define PFLAG_ALPHA         (1 << 5)
#define PFLAG_LONGDIGIT     (1 << 6)
#define PFLAG_LONGALPHA     (1 << 7)
207
+
/*
 * Copy the NUL-terminated string `src` to `dst`, turning 'a'..'z' into
 * upper case.  `dst` may be the same buffer as `src`.  The terminating
 * NUL is not written to `dst`.
 */
static void _str_toupper(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        if (c >= 'a' && c <= 'z')
            c ^= 0x20;
        *dst = c;
    }
}
217
+
/*
 * Copy the NUL-terminated string `src` to `dst`, turning 'A'..'Z' into
 * lower case.  `dst` may be the same buffer as `src`.  The terminating
 * NUL is not written to `dst`.
 */
static void _str_tolower(char *src, char *dst)
{
    for (; *src != '\0'; src++, dst++)
    {
        char c = *src;

        if (c >= 'A' && c <= 'Z')
            c ^= 0x20;
        *dst = c;
    }
}
227
+
/*
 * _mem_ndup(src, len): heap copy of the first `len` bytes of `src`,
 * followed by a NUL byte.  The caller frees the result.
 *
 * With HAVE_STRNDUP the name maps to strndup().  The fallback below
 * returns NULL when `src` is NULL, `len` is negative, or the allocation
 * fails.
 */
#ifdef HAVE_STRNDUP
#define _mem_ndup strndup
#else
static inline void *_mem_ndup(const char *src, int len)
{
    char *dst;

    if (src == NULL || len < 0)
        return NULL;

    dst = malloc((size_t) len + 1);
    if (dst == NULL)
        return NULL;

    memcpy(dst, src, (size_t) len);
    dst[len] = '\0';
    return dst;
}
#endif
240
+
241
+ static void _scws_alnum_multi(scws_t s, int start, int wlen)
242
+ {
243
+ char chunk[SCWS_MAX_EWLEN];
244
+ int i, j, k, ch, pflag;
245
+ unsigned char *txt;
246
+ float idf;
247
+
248
+ txt = s->txt;
249
+ pflag = 0;
250
+ for (i = j = k = 0; i < wlen; i++)
251
+ {
252
+ ch = txt[start + i];
253
+ if (SCWS_IS_DIGIT(ch))
254
+ {
255
+ if (pflag & PFLAG_DIGIT)
256
+ continue;
257
+ if (pflag != 0)
258
+ {
259
+ chunk[j++] = (char) (i-k);
260
+ k = i;
261
+ }
262
+ pflag = PFLAG_DIGIT;
263
+ }
264
+ else if (SCWS_IS_ALPHA(ch))
265
+ {
266
+ if (pflag & PFLAG_ALPHA)
267
+ continue;
268
+ if (pflag != 0)
269
+ {
270
+ chunk[j++] = (char) (i-k);
271
+ k = i;
272
+ }
273
+ pflag = PFLAG_ALPHA;
274
+ }
275
+ else
276
+ {
277
+ if (pflag & PFLAG_ADDSYM)
278
+ continue;
279
+ if (pflag != 0)
280
+ {
281
+ chunk[j++] = (char) (i-k);
282
+ k = i;
283
+ }
284
+ pflag = PFLAG_ADDSYM;
285
+ }
286
+ }
287
+
288
+ if (j > 0)
289
+ {
290
+ chunk[j] = (char) (i-k);
291
+ ch = start;
292
+ for (i = 0; i <= j; i++)
293
+ {
294
+ if (!SCWS_IS_ALNUM(txt[ch]))
295
+ {
296
+ // just skip
297
+ }
298
+ else if (chunk[i] == 1)
299
+ {
300
+ if (i > 0 && chunk[i-1] > 1 && (i != 1 || i != j))
301
+ {
302
+ if (!SCWS_IS_ALNUM(txt[ch-1]))
303
+ {
304
+ idf = SCWS_EN_IDF(chunk[i]);
305
+ SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
306
+ }
307
+ else
308
+ {
309
+ idf = SCWS_EN_IDF(chunk[i-1]+1);
310
+ SCWS_PUT_RES(ch - chunk[i-1], idf, chunk[i-1]+1, attr_en);
311
+ }
312
+ }
313
+ if (i < j && (i != 0 || j != 1))
314
+ {
315
+ if (!SCWS_IS_ALNUM(txt[ch+1]))
316
+ {
317
+ idf = SCWS_EN_IDF(chunk[i]);
318
+ SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
319
+ }
320
+ else
321
+ {
322
+ idf = SCWS_EN_IDF(chunk[i+1]+1);
323
+ SCWS_PUT_RES(ch, idf, chunk[i+1]+1, attr_en);
324
+ }
325
+ }
326
+ }
327
+ else
328
+ {
329
+ idf = SCWS_EN_IDF(chunk[i]);
330
+ SCWS_PUT_RES(ch, idf, chunk[i], attr_en);
331
+ }
332
+ ch += chunk[i];
333
+ }
334
+ }
335
+ }
336
+
337
+ static void _scws_ssegment(scws_t s, int end)
338
+ {
339
+ int start, wlen, ch, pflag, ipflag = 0;
340
+ unsigned char *txt;
341
+ float idf;
342
+
343
+ start = s->off;
344
+ wlen = end - start;
345
+
346
+ /* check special words (need strtoupper) */
347
+ if (wlen > 1)
348
+ {
349
+ txt = (char *) _mem_ndup(s->txt + start, wlen);
350
+ _str_toupper(txt, txt);
351
+ if (SCWS_IS_SPECIAL(txt, wlen))
352
+ {
353
+ SCWS_PUT_RES(start, 9.5, wlen, "nz");
354
+ free(txt);
355
+ return;
356
+ }
357
+ free(txt);
358
+ }
359
+
360
+ txt = s->txt;
361
+ /* check brief words such as S.H.E M.R. */
362
+ if (SCWS_IS_ALPHA(txt[start]) && txt[start+1] == '.')
363
+ {
364
+ for (ch = start + 2; ch < end; ch++)
365
+ {
366
+ if (!SCWS_IS_ALPHA(txt[ch])) break;
367
+ ch++;
368
+ if (ch == end || txt[ch] != '.') break;
369
+ }
370
+ if (ch == end)
371
+ {
372
+ SCWS_PUT_RES(start, 7.5, wlen, "nz");
373
+ return;
374
+ }
375
+ }
376
+
377
+ /* 取出单词及标点. 数字允许一个点且下一个为数字,不连续的. 字母允许一个不连续的' */
378
+ while (start < end)
379
+ {
380
+ ch = txt[start++];
381
+ if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
382
+ ipflag = 0;
383
+ if (SCWS_IS_ALNUM(ch))
384
+ {
385
+ pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
386
+ wlen = 1;
387
+ while (start < end)
388
+ {
389
+ ch = txt[start];
390
+ if (pflag & PFLAG_DIGIT)
391
+ {
392
+ if (!SCWS_IS_DIGIT(ch))
393
+ {
394
+ // check percent % = 0x25
395
+ if (ch == 0x25 && !SCWS_IS_DIGIT(txt[start+1]))
396
+ {
397
+ start++;
398
+ wlen++;
399
+ break;
400
+ }
401
+ if (ipflag)
402
+ break;
403
+ // special for IP address or version number? (find out all digit + dot)
404
+ if (ch == 0x2e && (pflag & PFLAG_ADDSYM))
405
+ {
406
+ ipflag = 1;
407
+ while(--wlen && txt[--start] != 0x2e);
408
+ pflag = 0;
409
+ break;
410
+ }
411
+ // wlen = 1
412
+ if (wlen == 1 && SCWS_IS_ALPHA(ch))
413
+ {
414
+ pflag ^= PFLAG_DIGIT;
415
+ pflag |= PFLAG_ADDSYM;
416
+ continue;
417
+ }
418
+ // strict must add: !$this->_is_digit(ord($this->txt[$start+1])))
419
+ if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(txt[start+1])))
420
+ break;
421
+ pflag |= PFLAG_ADDSYM;
422
+ }
423
+ }
424
+ else
425
+ {
426
+ /* hightman.110419: - 出现在字母中间允许连接(0x2d), _ 允许连接(0x5f) */
427
+ if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(txt[start+1]))
428
+ pflag |= PFLAG_ADDSYM;
429
+ else if (!SCWS_IS_ALPHA(ch))
430
+ {
431
+ if ((pflag & PFLAG_ADDSYM)
432
+ || !((ch == 0x27 && SCWS_IS_ALPHA(txt[start+1]))
433
+ || (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(txt[start+1]))))
434
+ {
435
+ break;
436
+ }
437
+ pflag |= PFLAG_ADDSYM;
438
+ }
439
+ }
440
+ start++;
441
+ wlen++;
442
+ if (wlen >= SCWS_MAX_EWLEN)
443
+ break;
444
+ }
445
+ idf = SCWS_EN_IDF(wlen);
446
+ SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
447
+ if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
448
+ _scws_alnum_multi(s, start-wlen, wlen);
449
+ }
450
+ else if (!(s->mode & SCWS_IGN_SYMBOL))
451
+ {
452
+ SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
453
+ }
454
+ }
455
+ }
456
+
457
+ /* multibyte segment */
458
+ static int _scws_mget_word(scws_t s, int i, int j)
459
+ {
460
+ int r, k;
461
+ word_t item;
462
+
463
+ if (!(s->wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
464
+ return i;
465
+
466
+ for (r=i, k=i+1; k <= j; k++)
467
+ {
468
+ item = s->wmap[i][k];
469
+ if (item && (item->flag & SCWS_WORD_FULL))
470
+ {
471
+ r = k;
472
+ if (!(item->flag & SCWS_WORD_PART))
473
+ break;
474
+ }
475
+ }
476
+ return r;
477
+ }
478
+
479
+ static void _scws_mset_word(scws_t s, int i, int j)
480
+ { /* Append the word spanning characters i..j (indexes into s->zmap / s->wmap) to the result list,
+  * then, depending on s->mode, append extra sub-results: short words (SCWS_MULTI_SHORT),
+  * pairs (SCWS_MULTI_DUALITY) and single characters (SCWS_MULTI_ZMAIN / SCWS_MULTI_ZALL).
+  * Returns without emitting anything when wmap[i][j] is NULL, or when symbols are ignored and
+  * the item is a non-English item whose attr is "un". */
481
+ word_t item;
482
+
483
+ item = s->wmap[i][j];
484
+ /* hightman.070705: added the item == NULL check to guard against unsigned char overflow on over-long words (more than 255 characters) */
485
+ if ((item == NULL) || ((s->mode & SCWS_IGN_SYMBOL)
486
+ && !SCWS_IS_ECHAR(item->flag) && !memcmp(item->attr, attr_un, 2)))
487
+ return;
488
+
489
+ /* hightman.070701: automatic bigram aggregation of loose single characters */
490
+ if (s->mode & SCWS_DUALITY)
491
+ {
492
+ int k = s->zis; /* previously remembered single character (may carry SCWS_ZIS_USED), negative when none */
493
+
494
+ if (i == j && !SCWS_IS_ECHAR(item->flag) && memcmp(item->attr, attr_un, 2)) /* single non-English character whose attr is not "un" */
495
+ {
496
+ s->zis = i; /* remember this character for the next call */
497
+ if (k < 0)
498
+ return; /* nothing remembered yet: defer emitting */
499
+
500
+ i = (k & ~SCWS_ZIS_USED); /* from here on i is the remembered character's index */
501
+ if ((i != (j-1)) || (!(k & SCWS_ZIS_USED) && s->wend == i))
502
+ {
503
+ SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr); /* emit the remembered character alone */
504
+ if (i != (j-1))
505
+ return; /* not adjacent to the current character: no pair */
506
+ }
507
+ s->zis |= SCWS_ZIS_USED; /* falls through: the SCWS_PUT_RES below emits the pair i..j with the current item's idf/attr */
508
+ }
509
+ else
510
+ {
511
+ if ((k >= 0) && (!(k & SCWS_ZIS_USED) || (j > i)))
512
+ {
513
+ k &= ~SCWS_ZIS_USED;
514
+ SCWS_PUT_RES(s->zmap[k].start, s->wmap[k][k]->idf, (s->zmap[k].end - s->zmap[k].start), s->wmap[k][k]->attr); /* flush the remembered character before this word */
515
+ }
516
+ if (j > i)
517
+ s->wend = j + 1; /* index just past this multi-character word */
518
+ s->zis = -1; /* nothing remembered any more */
519
+ }
520
+ }
521
+
522
+ SCWS_PUT_RES(s->zmap[i].start, item->idf, (s->zmap[j].end - s->zmap[i].start), item->attr); /* the main result for i..j */
523
+
524
+ // hightman.070902: multi segment
525
+ // step1: split to short words
526
+ if ((j-i) > 1) /* only words of three or more characters */
527
+ {
528
+ int n, k, m = i; /* m: start of the next sub-word, k: chosen end, n: candidate end */
529
+ if (s->mode & SCWS_MULTI_SHORT)
530
+ {
531
+ while (m < j)
532
+ {
533
+ k = m;
534
+ // hightman.111223: multi short enhanced
535
+ for (n = m + 1; n <= j; n++)
536
+ {
537
+ // 3 chars at most
538
+ if ((n == j && m == i) || (n - m) > 2) break; /* also never re-emits the whole word i..j */
539
+ item = s->wmap[m][n];
540
+ if (!item) continue;
541
+ // first shortest or last longest word
542
+ if ((item->flag & SCWS_WORD_FULL) && (k == m || n == j))
543
+ k = n;
544
+ if (!(item->flag & SCWS_WORD_PART)) break;
545
+ }
546
+ // short word not found, stop to find, passed to next loop
547
+ if (k == m)
548
+ break;
549
+
550
+ // save the short word
551
+ item = s->wmap[m][k];
552
+ SCWS_PUT_RES(s->zmap[m].start, item->idf, (s->zmap[k].end - s->zmap[m].start), item->attr);
553
+ // find the next word or go to prev for duality last word
554
+ if ((m = k + 1) == j)
555
+ {
556
+ m--;
557
+ break;
558
+ }
559
+ }
560
+ }
561
+
562
+ if (s->mode & SCWS_MULTI_DUALITY) /* continues from wherever the short-word pass left m */
563
+ {
564
+ while (m < j)
565
+ {
566
+ if (SCWS_IS_ECHAR(s->wmap[m][m]->flag)) /* English cell: emit it alone and mark it as already put */
567
+ {
568
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
569
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
570
+ }
571
+ else if (SCWS_IS_ECHAR(s->wmap[m+1][m+1]->flag)) /* next cell is English: no pair across the boundary */
572
+ {
573
+ if (m == i)
574
+ {
575
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
576
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
577
+ }
578
+ m++;
579
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
580
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
581
+ }
582
+ else
583
+ {
584
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m+1].end - s->zmap[m].start), s->wmap[m][m]->attr); /* pair m..m+1 with character m's idf/attr */
585
+ }
586
+ m++;
587
+ if (m == j && (SCWS_IS_ECHAR(s->wmap[m][m]->flag) || SCWS_IS_ECHAR(s->wmap[m-1][m-1]->flag)))
588
+ {
589
+ SCWS_PUT_RES(s->zmap[m].start, s->wmap[m][m]->idf, (s->zmap[m].end - s->zmap[m].start), s->wmap[m][m]->attr);
590
+ s->wmap[m][m]->flag |= SCWS_ZFLAG_PUT;
591
+ }
592
+ }
593
+ }
594
+ }
595
+
596
+ // step2, split to single char
597
+ if ((j > i) && (s->mode & (SCWS_MULTI_ZMAIN|SCWS_MULTI_ZALL)))
598
+ {
599
+ if ((j - i) == 1 && !s->wmap[i][j]) /* i may have been rewritten by the duality branch above */
600
+ {
601
+ if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT) i++;
602
+ else s->wmap[i][i]->flag |= SCWS_ZFLAG_PUT;
603
+ s->wmap[j][j]->flag |= SCWS_ZFLAG_PUT;
604
+ }
605
+ do
606
+ {
607
+ if (s->wmap[i][i]->flag & SCWS_ZFLAG_PUT)
608
+ continue; /* already emitted as a single character */
609
+ if (!(s->mode & SCWS_MULTI_ZALL) && !strchr("jnv", s->wmap[i][i]->attr[0]))
610
+ continue; /* without ZALL only attrs starting with j, n or v are emitted (strchr also matches an empty attr) */
611
+ SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
612
+ }
613
+ while (++i <= j);
614
+ }
615
+ }
616
+
617
+ static void _scws_mseg_zone(scws_t s, int f, int t)
618
+ {
619
+ unsigned char *mpath, *npath;
620
+ word_t **wmap;
621
+ int x,i,j,m,n,j2,sz;
622
+ double weight, nweight;
623
+ char attr1[3];
624
+
625
+ mpath = npath = NULL;
626
+ weight = nweight = (double) 0.0;
627
+
628
+ wmap = s->wmap;
629
+ j2 = 0;
630
+ for (x = i = f; i <= t; i++)
631
+ {
632
+ j = _scws_mget_word(s, i, (x > i ? x - 1 : t));
633
+ if (j == i) continue;
634
+ // skip NR in NR
635
+ if (j < j2 && wmap[i][j]->attr[0] == 'n' && wmap[i][j]->attr[1] == 'r') continue;
636
+ if (i > j2 && (wmap[i][j]->flag & SCWS_WORD_USED)) continue;
637
+
638
+ /* one word only */
639
+ if (i == f && j == t)
640
+ {
641
+ mpath = (unsigned char *) malloc(2);
642
+ mpath[0] = j - i;
643
+ mpath[1] = 0xff;
644
+ break;
645
+ }
646
+
647
+ if (i != f && (wmap[i][j]->flag & SCWS_WORD_RULE))
648
+ continue;
649
+
650
+ /* create the new path */
651
+ wmap[i][j]->flag |= SCWS_WORD_USED;
652
+ nweight = (double) wmap[i][j]->tf * pow(j-i,4);
653
+
654
+ if (npath == NULL)
655
+ {
656
+ npath = (unsigned char *) malloc(t-f+2);
657
+ memset(npath, 0xff, t-f+2);
658
+ }
659
+
660
+ /* lookfor backward */
661
+ x = sz = 0;
662
+ memset(attr1, 0, sizeof(attr1));
663
+ for (m = f; m < i; m = n+1)
664
+ {
665
+ n = _scws_mget_word(s, m, i-1);
666
+ nweight *= wmap[m][n]->tf;
667
+ npath[x++] = n - m;
668
+ if (n > m)
669
+ {
670
+ nweight *= pow(n-m,4);
671
+ wmap[m][n]->flag |= SCWS_WORD_USED;
672
+ }
673
+ else sz++;
674
+
675
+ if (attr1[0] != '\0')
676
+ nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
677
+ memcpy(attr1, wmap[m][n]->attr, 2);
678
+ }
679
+
680
+ /* my self */
681
+ npath[x++] = j - i;
682
+
683
+ if (attr1[0] != '\0')
684
+ nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[i][j]->attr, &npath[x-2]);
685
+ memcpy(attr1, wmap[i][j]->attr, 2);
686
+
687
+ /* lookfor forward */
688
+ for (m = j+1; m <= t; m = n+1)
689
+ {
690
+ n = _scws_mget_word(s, m, t);
691
+ nweight *= wmap[m][n]->tf;
692
+ npath[x++] = n - m;
693
+ if (n > m)
694
+ {
695
+ nweight *= pow(n-m,4);
696
+ wmap[m][n]->flag |= SCWS_WORD_USED;
697
+ }
698
+ else sz++;
699
+
700
+ nweight *= scws_rule_attr_ratio(s->r, attr1, wmap[m][n]->attr, &npath[x-2]);
701
+ memcpy(attr1, wmap[m][n]->attr, 2);
702
+ }
703
+
704
+ npath[x] = 0xff;
705
+ nweight /= pow(x+sz-1,5);
706
+
707
+ /* draw the path for debug */
708
+ #ifdef DEBUG
709
+ if (s->mode & SCWS_DEBUG)
710
+ {
711
+ fprintf(stderr, "PATH by keyword = %.*s, (weight=%.4f):\n",
712
+ s->zmap[j].end - s->zmap[i].start, s->txt + s->zmap[i].start, nweight);
713
+ for (x = 0, m = f; (n = npath[x]) != 0xff; x++)
714
+ {
715
+ n += m;
716
+ fprintf(stderr, "%.*s ", s->zmap[n].end - s->zmap[m].start, s->txt + s->zmap[m].start);
717
+ m = n + 1;
718
+ }
719
+ fprintf(stderr, "\n--\n");
720
+ }
721
+ #endif
722
+
723
+ j2 = x = j;
724
+ if ((x - i) > 1) i--;
725
+ /* check better path */
726
+ if (nweight > weight)
727
+ {
728
+ unsigned char *swap;
729
+
730
+ weight = nweight;
731
+ swap = mpath;
732
+ mpath = npath;
733
+ npath = swap;
734
+ }
735
+ }
736
+
737
+ /* set the result, mpath != NULL */
738
+ if (mpath == NULL)
739
+ return;
740
+
741
+ for (x = 0, m = f; (n = mpath[x]) != 0xff; x++)
742
+ {
743
+ n += m;
744
+ _scws_mset_word(s, m, n);
745
+ m = n + 1;
746
+ }
747
+
748
+ /* 一口.070808: memory leak fixed. */
749
+ if (mpath) free(mpath);
750
+ if (npath) free(npath);
751
+ }
752
+
753
+ /* quick define for zrule_checker in loop
+  *
+  * Each macro expands to a complete `if (...) break;` statement, so it is only usable inside a loop
+  * and is written without a trailing semicolon at the call site.  The macros read the caller's
+  * locals by name:
+  *   CHECKER1 (j, zlen, wmap): stop when j is past the last character or SCWS_NO_RULE2 holds for character j.
+  *   CHECKER2 (j, wmap):       stop when j is before the first character or SCWS_NO_RULE2 holds for character j.
+  *   CHECKER3 (s, r1, txt, zmap, j): stop when scws_rule_check() returns zero for the bytes of character j.
+  */
754
+ #define ___ZRULE_CHECKER1___ \
755
+ if (j >= zlen || SCWS_NO_RULE2(wmap[j][j]->flag)) \
756
+ break;
757
+
758
+ #define ___ZRULE_CHECKER2___ \
759
+ if (j < 0 || SCWS_NO_RULE2(wmap[j][j]->flag)) \
760
+ break;
761
+
762
+ #define ___ZRULE_CHECKER3___ \
763
+ if (!scws_rule_check(s->r, r1, txt + zmap[j].start, zmap[j].end - zmap[j].start)) \
764
+ break;
765
+
766
+ static void _scws_msegment(scws_t s, int end, int zlen)
767
+ { /* Segment the span txt[s->off .. end), which contains multi-byte characters, and append the words to the result list.
+  * `zlen` is the caller's count of logical characters and sizes the tables; it is replaced by the real count below.
+  * Steps: (1) build zmap (byte range of each logical character) and the wmap diagonal (per-character info);
+  * (2) fill wmap[i][j] with dictionary words spanning characters i..j; (3) optionally add rule-based words
+  * (only compiled with HAVE_NAME_RULE); (4) cut the span into zones and segment each one. */
768
+ word_t **wmap, query;
769
+ struct scws_zchar *zmap;
770
+ unsigned char *txt;
771
+ #ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just depend on dictionary */
772
+ rule_item_t r1;
773
+ #endif
774
+ int i, j, k, ch, clen, start;
775
+ pool_t p;
776
+
777
+ /* pool used to management some dynamic memory */
778
+ p = pool_new();
779
+
780
+ /* create wmap & zmap */
781
+ wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t));
782
+ zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar));
783
+ txt = s->txt;
784
+ start = s->off;
785
+ s->zis = -1; /* no single character remembered for duality yet */
786
+
787
+ for (i = 0; start < end; i++) /* one iteration per logical character */
788
+ {
789
+ ch = txt[start];
790
+ clen = SCWS_CHARLEN(ch);
791
+ if (clen == 1)
792
+ {
793
+ while (start++ < end) /* a run of consecutive single-byte characters becomes one logical character */
794
+ {
795
+ ch = txt[start]; /* NOTE(review): read happens before the start == end test below, so txt[end] is touched */
796
+ if (start == end || SCWS_CHARLEN(txt[start]) > 1)
797
+ break;
798
+ clen++;
799
+ }
800
+ wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
801
+ wmap[i][i]->tf = 0.5;
802
+ wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
803
+ strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un); /* attr depends on the last byte of the run */
804
+ }
805
+ else
806
+ {
807
+ query = xdict_query(s->d, txt + start, clen);
808
+ wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
809
+ if (query == NULL) /* character unknown to the dictionary: defaults */
810
+ {
811
+ wmap[i][i]->tf = 0.5;
812
+ wmap[i][i]->idf = 0.0;
813
+ wmap[i][i]->flag = 0;
814
+ strcpy(wmap[i][i]->attr, attr_un);
815
+ }
816
+ else
817
+ {
818
+ ch = query->flag; /* keep the original flags for the MALLOCED test below */
819
+ query->flag = SCWS_WORD_FULL; /* written into the query item itself before it is copied */
820
+ memcpy(wmap[i][i], query, sizeof(word_st));
821
+ if (query->attr[0] == '#')
822
+ wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;
823
+
824
+ if (ch & SCWS_WORD_MALLOCED)
825
+ free(query);
826
+ }
827
+ start += clen;
828
+ }
829
+
830
+ zmap[i].start = start - clen;
831
+ zmap[i].end = start;
832
+ }
833
+
834
+ /* fixed real zlength */
835
+ zlen = i;
836
+
837
+ /* create word query table */
838
+ for (i = 0; i < zlen; i++)
839
+ {
840
+ k = 0; /* stays 0 unless a word starting at i is stored */
841
+ for (j = i+1; j < zlen; j++)
842
+ {
843
+ query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
844
+ if (query == NULL)
845
+ break;
846
+ ch = query->flag;
847
+ if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2)) /* full word whose attr is not "!" */
848
+ {
849
+ wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
850
+ memcpy(wmap[i][j], query, sizeof(word_st));
851
+
852
+ wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
853
+
854
+ for (k = i+1; k <= j; k++) /* leaves k == j + 1 */
855
+ wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
856
+ }
857
+
858
+ if (ch & SCWS_WORD_MALLOCED)
859
+ free(query);
860
+
861
+ if (!(ch & SCWS_WORD_PART)) /* not a prefix of any longer entry: stop extending */
862
+ break;
863
+ }
864
+
865
+ if (k--) /* a word was stored; after the decrement k is the end index of the longest one */
866
+ {
867
+ /* set nr2 to some short name */
868
+ if ((k == (i+1)))
869
+ {
870
+ if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
871
+ wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
872
+ //if (wmap[i][k]->attr[0] == 'n')
873
+ //wmap[i][i]->flag |= SCWS_ZFLAG_N2;
874
+ }
875
+
876
+ /* clean the PART flag for the last word */
877
+ if (k < j)
878
+ wmap[i][k]->flag ^= SCWS_WORD_PART; /* NOTE(review): XOR toggles, so this clears PART only if it is currently set */
879
+ }
880
+ }
881
+
882
+ if (s->r == NULL)
883
+ goto do_segment;
884
+
885
+ #ifdef HAVE_NAME_RULE /* 20150403: Remove rules, just depend on dictionary */
886
+ /* auto rule set for name & zone & chinese numeric */
887
+
888
+ /* one word auto rule check */
889
+ for (i = 0; i < zlen; i++)
890
+ {
891
+ if (SCWS_NO_RULE1(wmap[i][i]->flag))
892
+ continue;
893
+
894
+ r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
895
+ if (r1 == NULL)
896
+ continue;
897
+
898
+ clen = r1->zmin > 0 ? r1->zmin : 1; /* minimum number of characters the rule must absorb */
899
+ if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
900
+ {
901
+ /* prefix, check after (zmin~zmax) */
902
+ // first check that every character within zmin satisfies the rule
903
+ // then collect the qualifying characters within the zmax range
904
+ // int i, j, k, ch, clen, start;
905
+ for (ch = 1; ch <= clen; ch++)
906
+ {
907
+ j = i + ch;
908
+ ___ZRULE_CHECKER1___
909
+ ___ZRULE_CHECKER3___
910
+ }
911
+
912
+ if (ch <= clen) /* the loop broke early: minimum not met */
913
+ continue;
914
+
915
+ /* no limit znum or limit to a range */
916
+ j = i + ch;
917
+ while (1)
918
+ {
919
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
920
+ break;
921
+ ___ZRULE_CHECKER1___
922
+ ___ZRULE_CHECKER3___
923
+ clen++;
924
+ j++;
925
+ }
926
+
927
+ // note the case where an existing 2-character name is still 2 characters after recognition
928
+ if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
929
+ {
930
+ if (clen == 1)
931
+ continue;
932
+ wmap[i][i+1]->flag |= SCWS_WORD_PART;
933
+ }
934
+
935
+ /* ok, got: i & clen */
936
+ k = i + clen;
937
+ wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
938
+ wmap[i][k]->tf = r1->tf;
939
+ wmap[i][k]->idf = r1->idf;
940
+ wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
941
+ strncpy(wmap[i][k]->attr, r1->attr, 2); /* NOTE(review): pmalloc() memory and only 2 bytes copied, so attr[2] is not set here */
942
+
943
+ wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
944
+ for (j = i+1; j <= k; j++)
945
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
946
+
947
+ if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
948
+ i = k; /* skip the characters the rule word just absorbed */
949
+
950
+ continue;
951
+ }
952
+
953
+ if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
954
+ {
955
+ /* suffix, check before */
956
+ for (ch = 1; ch <= clen; ch++)
957
+ {
958
+ j = i - ch;
959
+ ___ZRULE_CHECKER2___
960
+ ___ZRULE_CHECKER3___
961
+ }
962
+
963
+ if (ch <= clen)
964
+ continue;
965
+
966
+ /* no limit znum or limit to a range */
967
+ j = i - ch;
968
+ while (1)
969
+ {
970
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
971
+ break;
972
+ ___ZRULE_CHECKER2___
973
+ ___ZRULE_CHECKER3___
974
+ clen++;
975
+ j--;
976
+ }
977
+
978
+ /* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
979
+ k = i - clen;
980
+ if (wmap[k][i] != NULL)
981
+ continue;
982
+
983
+ wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
984
+ wmap[k][i]->tf = r1->tf;
985
+ wmap[k][i]->idf = r1->idf;
986
+ wmap[k][i]->flag = SCWS_WORD_FULL;
987
+ strncpy(wmap[k][i]->attr, r1->attr, 2);
988
+
989
+ wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
990
+ for (j = k+1; j <= i; j++)
991
+ {
992
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
993
+ if ((j != i) && (wmap[k][j] != NULL))
994
+ wmap[k][j]->flag |= SCWS_WORD_PART;
995
+ }
996
+ continue;
997
+ }
998
+ }
999
+
1000
+ /* two words auto rule check (e.g. a two-character prefix as in 欧阳** , or a two-character suffix as in **西路) */
1001
+ for (i = zlen - 2; i >= 0; i--)
1002
+ {
1003
+ /* with value ==> must be have SCWS_WORD_FULL, so needn't check it ag. */
1004
+ if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
1005
+ continue;
1006
+
1007
+ k = i+1;
1008
+ r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
1009
+ if (r1 == NULL)
1010
+ continue;
1011
+
1012
+ clen = r1->zmin > 0 ? r1->zmin : 1;
1013
+ if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
1014
+ {
1015
+ for (ch = 1; ch <= clen; ch++)
1016
+ {
1017
+ j = k + ch;
1018
+ ___ZRULE_CHECKER1___
1019
+ ___ZRULE_CHECKER3___
1020
+ }
1021
+
1022
+ if (ch <= clen)
1023
+ continue;
1024
+
1025
+ /* no limit znum or limit to a range */
1026
+ j = k + ch;
1027
+ while (1)
1028
+ {
1029
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
1030
+ break;
1031
+ ___ZRULE_CHECKER1___
1032
+ ___ZRULE_CHECKER3___
1033
+ clen++;
1034
+ j++;
1035
+ }
1036
+
1037
+ /* ok, got: i & clen */
1038
+ k = k + clen;
1039
+ wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
1040
+ wmap[i][k]->tf = r1->tf;
1041
+ wmap[i][k]->idf = r1->idf;
1042
+ wmap[i][k]->flag = SCWS_WORD_FULL;
1043
+ strncpy(wmap[i][k]->attr, r1->attr, 2);
1044
+
1045
+ wmap[i][i+1]->flag |= SCWS_WORD_PART;
1046
+ for (j = i+2; j <= k; j++)
1047
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
1048
+
1049
+ i--;
1050
+ continue;
1051
+ }
1052
+
1053
+ if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
1054
+ {
1055
+ /* suffix, check before */
1056
+ for (ch = 1; ch <= clen; ch++)
1057
+ {
1058
+ j = i - ch;
1059
+ ___ZRULE_CHECKER2___
1060
+ ___ZRULE_CHECKER3___
1061
+ }
1062
+
1063
+ if (ch <= clen)
1064
+ continue;
1065
+
1066
+ /* no limit znum or limit to a range */
1067
+ j = i - ch;
1068
+ while (1)
1069
+ {
1070
+ if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
1071
+ break;
1072
+ ___ZRULE_CHECKER2___
1073
+ ___ZRULE_CHECKER3___
1074
+ clen++;
1075
+ j--;
1076
+ }
1077
+
1078
+ /* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
1079
+ k = i - clen;
1080
+ i = i + 1; /* i now points at the second character of the two-character suffix */
1081
+ wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
1082
+ wmap[k][i]->tf = r1->tf;
1083
+ wmap[k][i]->idf = r1->idf;
1084
+ wmap[k][i]->flag = SCWS_WORD_FULL;
1085
+ strncpy(wmap[k][i]->attr, r1->attr, 2);
1086
+
1087
+ wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
1088
+ for (j = k+1; j <= i; j++)
1089
+ {
1090
+ wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
1091
+ if (wmap[k][j] != NULL)
1092
+ wmap[k][j]->flag |= SCWS_WORD_PART;
1093
+ }
1094
+
1095
+ i -= (clen+1);
1096
+ continue;
1097
+ }
1098
+ }
1099
+ #endif
1100
+
1101
+ /* real do the segment */
1102
+ do_segment:
1103
+
1104
+ /* find the easy break point */
1105
+ for (i = 0, j = 0; i < zlen; i++) /* j is the first character of the zone being collected */
1106
+ {
1107
+ if (wmap[i][i]->flag & SCWS_ZFLAG_WPART)
1108
+ continue; /* inside some word: cannot break here */
1109
+
1110
+ if (i > j)
1111
+ _scws_mseg_zone(s, j, i-1);
1112
+
1113
+ j = i;
1114
+ if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD)) /* stand-alone character: emit it directly */
1115
+ {
1116
+ _scws_mset_word(s, i, i);
1117
+ j++;
1118
+ }
1119
+ }
1120
+
1121
+ /* the lastest zone */
1122
+ if (i > j)
1123
+ _scws_mseg_zone(s, j, i-1);
1124
+
1125
+ /* the last single for duality */
1126
+ if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))
1127
+ {
1128
+ i = s->zis;
1129
+ SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
1130
+ }
1131
+
1132
+ /* free the wmap & zmap */
1133
+ pool_free(p);
1134
+ darray_free((void **) wmap);
1135
+ }
1136
+
1137
+ scws_res_t scws_get_result(scws_t s)
1138
+ {
1139
+ int off, len, ch, clen, zlen, pflag;
1140
+ unsigned char *txt;
1141
+
1142
+ off = s->off;
1143
+ len = s->len;
1144
+ txt = s->txt;
1145
+ s->res0 = s->res1 = NULL;
1146
+ while ((off < len) && (txt[off] <= 0x20))
1147
+ {
1148
+ if (txt[off] == 0x0a || txt[off] == 0x0d)
1149
+ {
1150
+ s->off = off + 1;
1151
+ SCWS_PUT_RES(off, 0.0, 1, attr_un);
1152
+ return s->res0;
1153
+ }
1154
+ off++;
1155
+ }
1156
+
1157
+ if (off >= len)
1158
+ return NULL;
1159
+
1160
+ /* try to parse the sentence */
1161
+ s->off = off;
1162
+ ch = txt[off];
1163
+ if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
1164
+ {
1165
+ s->off++;
1166
+ SCWS_PUT_RES(off, 0.0, 1, attr_un);
1167
+ return s->res0;
1168
+ }
1169
+ clen = SCWS_CHARLEN(ch);
1170
+ zlen = 1;
1171
+ pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));
1172
+ while ((off = (off+clen)) < len)
1173
+ {
1174
+ ch = txt[off];
1175
+ if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch)) break;
1176
+ clen = SCWS_CHARLEN(ch);
1177
+ if (!(pflag & PFLAG_WITH_MB))
1178
+ {
1179
+ // pure single-byte -> multibyte (2bytes)
1180
+ if (clen == 1)
1181
+ {
1182
+ if (pflag & PFLAG_ALNUM)
1183
+ {
1184
+ if (SCWS_IS_ALPHA(ch))
1185
+ {
1186
+ if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
1187
+ pflag |= PFLAG_LONGALPHA;
1188
+ }
1189
+ else if (SCWS_IS_DIGIT(ch))
1190
+ {
1191
+ if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
1192
+ pflag |= PFLAG_LONGDIGIT;
1193
+ }
1194
+ else
1195
+ pflag ^= PFLAG_ALNUM;
1196
+ }
1197
+ }
1198
+ else
1199
+ {
1200
+ if (!(pflag & PFLAG_ALNUM) || zlen > 2)
1201
+ break;
1202
+
1203
+ pflag |= PFLAG_WITH_MB;
1204
+ /* zlen = 1; */
1205
+ }
1206
+ }
1207
+ else if ((pflag & PFLAG_WITH_MB) && clen == 1)
1208
+ {
1209
+ int i;
1210
+
1211
+ // mb + single-byte. allowd: alpha+num + 中文
1212
+ if (!SCWS_IS_ALNUM(ch))
1213
+ break;
1214
+
1215
+ pflag &= ~PFLAG_VALID;
1216
+ // 夹在中文间的英文数字最多允许 2 个字符 (超过2可以独立成词没啥问题)
1217
+ for (i = off+1; i < (off+3); i++)
1218
+ {
1219
+ ch = txt[i];
1220
+ if ((i >= len) || (ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
1221
+ {
1222
+ pflag |= PFLAG_VALID;
1223
+ break;
1224
+ }
1225
+
1226
+ if (!SCWS_IS_ALNUM(ch))
1227
+ break;
1228
+ }
1229
+
1230
+ if (!(pflag & PFLAG_VALID))
1231
+ break;
1232
+
1233
+ clen += (i - off - 1);
1234
+ }
1235
+ /* hightman.070813: add max zlen limit */
1236
+ if (++zlen >= SCWS_MAX_ZLEN)
1237
+ break;
1238
+ }
1239
+
1240
+ /* hightman.070624: 处理半个字的问题 */
1241
+ if ((ch = off) > len)
1242
+ off -= clen;
1243
+
1244
+ /* do the real segment */
1245
+ if (off <= s->off)
1246
+ return NULL;
1247
+ else if (pflag & PFLAG_WITH_MB)
1248
+ _scws_msegment(s, off, zlen);
1249
+ else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
1250
+ _scws_ssegment(s, off);
1251
+ else
1252
+ {
1253
+ zlen = off - s->off;
1254
+ if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
1255
+ _scws_alnum_multi(s, s->off, zlen);
1256
+ else
1257
+ {
1258
+ float idf;
1259
+
1260
+ idf = SCWS_EN_IDF(zlen);
1261
+ SCWS_PUT_RES(s->off, idf, zlen, attr_en);
1262
+
1263
+ /* hightman.090523: 为字母数字混合再度拆解, 纯数字, (>1 ? 纯字母 : 数字+字母) */
1264
+ if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
1265
+ _scws_alnum_multi(s, s->off, zlen);
1266
+ }
1267
+ }
1268
+
1269
+ /* return the result */
1270
+ s->off = (ch > len ? len : off);
1271
+ if (s->res0 == NULL)
1272
+ return scws_get_result(s);
1273
+
1274
+ return s->res0;
1275
+ }
1276
+
1277
+ /* free the result retunned by scws_get_result */
1278
+ void scws_free_result(scws_res_t result)
1279
+ {
1280
+ scws_res_t cur;
1281
+
1282
+ while ((cur = result) != NULL)
1283
+ {
1284
+ result = cur->next;
1285
+ free(cur);
1286
+ }
1287
+ }
1288
+
1289
+ /* top words count */
1290
+ // xattr = ~v,p,c
1291
+ // xattr = v,pn,c
1292
+
1293
+ static int _tops_cmp(a, b)
1294
+ scws_top_t *a,*b;
1295
+ {
1296
+ if ((*b)->weight > (*a)->weight)
1297
+ return 1;
1298
+ return -1;
1299
+ }
1300
+
1301
+ static void _tops_load_node(node_t node, scws_top_t *values, int *start)
1302
+ {
1303
+ int i = *start;
1304
+
1305
+ if (node == NULL)
1306
+ return;
1307
+
1308
+ values[i] = node->value;
1309
+ values[i]->word = node->key;
1310
+
1311
+ *start = ++i;
1312
+ _tops_load_node(node->left, values, start);
1313
+ _tops_load_node(node->right, values, start);
1314
+ }
1315
+
1316
+ static void _tops_load_all(xtree_t xt, scws_top_t *values)
1317
+ {
1318
+ int i, start;
1319
+
1320
+ for (i = 0, start = 0; i < xt->prime; i++)
1321
+ _tops_load_node(xt->trees[i], values, &start);
1322
+ }
1323
+
1324
/* one attribute name, stored as a short NUL-terminated string */
typedef char word_attr[4];

/*
 * Tell whether attribute `a` is listed in the table `at`.
 *
 * `at` is an array of word_attr entries terminated by an entry whose first
 * byte is '\0'.  A table whose very first entry is empty matches anything.
 *
 * Returns 1 on a match (or for an empty table), 0 otherwise.
 */
static inline int _attr_belong(const char *a, word_attr *at)
{
    word_attr *entry;

    if ((*at)[0] == '\0')
        return 1;

    for (entry = at; (*entry)[0] != '\0'; entry++)
    {
        if (strcmp(a, *entry) == 0)
            return 1;
    }
    return 0;
}
1335
+
1336
/*
 * Macro to parse xattr -> xmode, at.
 *
 * Expects these variables in the expanding scope:
 *   char *xattr      comma separated attribute list, optionally prefixed
 *                    with '~'; advanced while parsing
 *   int xmode        set to SCWS_YEA when the '~' prefix is present
 *   word_attr *at    receives a heap table of the attributes (at most two
 *                    bytes are kept per attribute), terminated by an
 *                    all-zero entry; the caller must free() it
 *   int cnt, char *word   scratch variables
 *
 * `at` is left untouched when xattr is NULL, when it is empty after the
 * optional '~', or when the allocation fails.
 *
 * The table gets strlen(xattr) + 2 zeroed slots: the number of entries
 * written can never exceed the number of bytes in xattr plus one, so even a
 * filter made of empty tokens (e.g. ",,,") cannot write past the end, and a
 * zero terminator entry always remains.
 */
#define __PARSE_XATTR__ do { \
    if (xattr == NULL) break; \
    if (*xattr == '~') { xattr++; xmode = SCWS_YEA; } \
    if (*xattr == '\0') break; \
    at = (word_attr *) calloc(strlen(xattr) + 2, sizeof(word_attr)); \
    if (at == NULL) break; \
    for (cnt = 0; (word = strchr(xattr, ',')); cnt++) { \
        at[cnt][0] = *xattr++; \
        at[cnt][1] = xattr == word ? '\0' : *xattr; \
        xattr = word + 1; \
    } \
    strncpy(at[cnt], xattr, 2); \
} while (0)
1352
+
1353
+ scws_top_t scws_get_tops(scws_t s, int limit, char *xattr)
1354
+ {
1355
+ int off, cnt, xmode = SCWS_NA;
1356
+ xtree_t xt;
1357
+ scws_res_t res, cur;
1358
+ scws_top_t top, *list, tail, base;
1359
+ char *word;
1360
+ word_attr *at = NULL;
1361
+
1362
+ if (!s || !s->txt || !(xt = xtree_new(0,1)))
1363
+ return NULL;
1364
+
1365
+ __PARSE_XATTR__;
1366
+
1367
+ // save the offset.
1368
+ off = s->off;
1369
+ s->off = cnt = 0;
1370
+ while ((cur = res = scws_get_result(s)) != NULL)
1371
+ {
1372
+ do
1373
+ {
1374
+ if (cur->idf < 0.2 || cur->attr[0] == '#')
1375
+ continue;
1376
+
1377
+ /* check attribute filter */
1378
+ if (at != NULL)
1379
+ {
1380
+ if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1381
+ continue;
1382
+
1383
+ if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1384
+ continue;
1385
+ }
1386
+
1387
+ /* check stopwords */
1388
+ if (!strncmp(cur->attr, attr_en, 2) && cur->len > 6)
1389
+ {
1390
+ word = _mem_ndup(s->txt + cur->off, cur->len);
1391
+ _str_tolower(word, word);
1392
+ if (SCWS_IS_NOSTATS(word, cur->len))
1393
+ {
1394
+ free(word);
1395
+ continue;
1396
+ }
1397
+ free(word);
1398
+ }
1399
+
1400
+ /* put to the stats */
1401
+ if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1402
+ {
1403
+ top = (scws_top_t) pmalloc_z(xt->p, sizeof(struct scws_topword));
1404
+ top->weight = cur->idf;
1405
+ top->times = 1;
1406
+ strncpy(top->attr, cur->attr, 2);
1407
+ xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1408
+ cnt++;
1409
+ }
1410
+ else
1411
+ {
1412
+ top->weight += cur->idf;
1413
+ top->times++;
1414
+ }
1415
+ }
1416
+ while ((cur = cur->next) != NULL);
1417
+ scws_free_result(res);
1418
+ }
1419
+
1420
+ // free at
1421
+ if (at != NULL)
1422
+ free(at);
1423
+ top = NULL;
1424
+ if (cnt > 0)
1425
+ {
1426
+ /* sort the list */
1427
+ list = (scws_top_t *) malloc(sizeof(scws_top_t) * cnt);
1428
+ _tops_load_all(xt, list);
1429
+ qsort(list, cnt, sizeof(scws_top_t), _tops_cmp);
1430
+
1431
+ /* save to return pointer */
1432
+ if (!limit || limit > cnt)
1433
+ limit = cnt;
1434
+
1435
+ top = tail = (scws_top_t) malloc(sizeof(struct scws_topword));
1436
+ memcpy(top, list[0], sizeof(struct scws_topword));
1437
+ top->word = strdup(list[0]->word);
1438
+ top->next = NULL;
1439
+
1440
+ for (cnt = 1; cnt < limit; cnt++)
1441
+ {
1442
+ base = (scws_top_t) malloc(sizeof(struct scws_topword));
1443
+ memcpy(base, list[cnt], sizeof(struct scws_topword));
1444
+ base->word = strdup(list[cnt]->word);
1445
+ base->next = NULL;
1446
+ tail->next = base;
1447
+ tail = base;
1448
+ }
1449
+ free(list);
1450
+ }
1451
+
1452
+ // restore the offset
1453
+ s->off = off;
1454
+ xtree_free(xt);
1455
+ return top;
1456
+ }
1457
+
1458
+ // word check by attr.
1459
+ int scws_has_word(scws_t s, char *xattr)
1460
+ {
1461
+ int off, cnt, xmode = SCWS_NA;
1462
+ scws_res_t res, cur;
1463
+ char *word;
1464
+ word_attr *at = NULL;
1465
+
1466
+ if (!s || !s->txt)
1467
+ return 0;
1468
+
1469
+ __PARSE_XATTR__;
1470
+
1471
+ // save the offset. (cnt -> return_value)
1472
+ off = s->off;
1473
+ cnt = s->off = 0;
1474
+ while (!cnt && (cur = res = scws_get_result(s)) != NULL)
1475
+ {
1476
+ do
1477
+ {
1478
+ /* check attribute filter */
1479
+ if (at != NULL)
1480
+ {
1481
+ if ((xmode == SCWS_NA) && _attr_belong(cur->attr, at))
1482
+ cnt = 1;
1483
+
1484
+ if ((xmode == SCWS_YEA) && !_attr_belong(cur->attr, at))
1485
+ cnt = 1;
1486
+ }
1487
+ }
1488
+ while (!cnt && (cur = cur->next) != NULL);
1489
+ scws_free_result(res);
1490
+ }
1491
+ // memory leak fixed, thanks to lauxinz
1492
+ if (at != NULL)
1493
+ free(at);
1494
+ s->off = off;
1495
+ return cnt;
1496
+ }
1497
+
1498
+ // get words by attr (rand order)
1499
+ scws_top_t scws_get_words(scws_t s, char *xattr)
1500
+ {
1501
+ int off, cnt, xmode = SCWS_NA;
1502
+ xtree_t xt;
1503
+ scws_res_t res, cur;
1504
+ scws_top_t top, tail, base;
1505
+ char *word;
1506
+ word_attr *at = NULL;
1507
+
1508
+ if (!s || !s->txt || !(xt = xtree_new(0,1)))
1509
+ return NULL;
1510
+
1511
+ __PARSE_XATTR__;
1512
+
1513
+ // save the offset.
1514
+ off = s->off;
1515
+ s->off = 0;
1516
+ base = tail = NULL;
1517
+ while ((cur = res = scws_get_result(s)) != NULL)
1518
+ {
1519
+ do
1520
+ {
1521
+ /* check attribute filter */
1522
+ if (at != NULL)
1523
+ {
1524
+ if ((xmode == SCWS_NA) && !_attr_belong(cur->attr, at))
1525
+ continue;
1526
+
1527
+ if ((xmode == SCWS_YEA) && _attr_belong(cur->attr, at))
1528
+ continue;
1529
+ }
1530
+
1531
+ /* put to the stats */
1532
+ if (!(top = xtree_nget(xt, s->txt + cur->off, cur->len, NULL)))
1533
+ {
1534
+ top = (scws_top_t) malloc(sizeof(struct scws_topword));
1535
+ top->weight = cur->idf;
1536
+ top->times = 1;
1537
+ top->next = NULL;
1538
+ top->word = (char *)_mem_ndup(s->txt + cur->off, cur->len);
1539
+ strncpy(top->attr, cur->attr, 2);
1540
+ // add to the chain
1541
+ if (tail == NULL)
1542
+ base = tail = top;
1543
+ else
1544
+ {
1545
+ tail->next = top;
1546
+ tail = top;
1547
+ }
1548
+ xtree_nput(xt, top, sizeof(struct scws_topword), s->txt + cur->off, cur->len);
1549
+ }
1550
+ else
1551
+ {
1552
+ top->weight += cur->idf;
1553
+ top->times++;
1554
+ }
1555
+ }
1556
+ while ((cur = cur->next) != NULL);
1557
+ scws_free_result(res);
1558
+ }
1559
+
1560
+ // free at & xtree
1561
+ if (at != NULL)
1562
+ free(at);
1563
+ xtree_free(xt);
1564
+
1565
+ // restore the offset
1566
+ s->off = off;
1567
+ return base;
1568
+ }
1569
+
1570
+ void scws_free_tops(scws_top_t tops)
1571
+ {
1572
+ scws_top_t cur;
1573
+
1574
+ while ((cur = tops) != NULL)
1575
+ {
1576
+ tops = cur->next;
1577
+ if (cur->word)
1578
+ free(cur->word);
1579
+ free(cur);
1580
+ }
1581
+ }