isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,1644 @@
1
+ #include <string.h>
2
+ #include <ctype.h>
3
+ #include <wctype.h>
4
+ #include <wchar.h>
5
+ #include "frt_analysis.h"
6
+ #include "frt_hash.h"
7
+ #include "libstemmer.h"
8
+ #include "frt_scanner.h"
9
+
10
+ /****************************************************************************
11
+ *
12
+ * Token
13
+ *
14
+ ****************************************************************************/
15
+
16
+ FrtToken *frt_tk_set(FrtToken *tk,
17
+ char *text, int tlen, off_t start, off_t end, int pos_inc)
18
+ {
19
+ if (tlen >= FRT_MAX_WORD_SIZE) {
20
+ tlen = FRT_MAX_WORD_SIZE - 1;
21
+ }
22
+ memcpy(tk->text, text, sizeof(char) * tlen);
23
+ tk->text[tlen] = '\0';
24
+ tk->len = tlen;
25
+ tk->start = start;
26
+ tk->end = end;
27
+ tk->pos_inc = pos_inc;
28
+ return tk;
29
+ }
30
+
31
+ static FrtToken *frt_tk_set_ts(FrtToken *tk, char *start, char *end,
32
+ char *text, int pos_inc)
33
+ {
34
+ return frt_tk_set(tk, start, (int)(end - start),
35
+ (off_t)(start - text), (off_t)(end - text), pos_inc);
36
+ }
37
+
38
+ FrtToken *frt_tk_set_no_len(FrtToken *tk,
39
+ char *text, off_t start, off_t end, int pos_inc)
40
+ {
41
+ return frt_tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
42
+ }
43
+
44
+ static FrtToken *w_tk_set(FrtToken *tk, wchar_t *text, off_t start,
45
+ off_t end, int pos_inc)
46
+ {
47
+ int len = wcstombs(tk->text, text, FRT_MAX_WORD_SIZE - 1);
48
+ tk->text[len] = '\0';
49
+ tk->len = len;
50
+ tk->start = start;
51
+ tk->end = end;
52
+ tk->pos_inc = pos_inc;
53
+ return tk;
54
+ }
55
+
56
+ int frt_tk_eq(FrtToken *tk1, FrtToken *tk2)
57
+ {
58
+ return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
59
+ tk1->start == tk2->start && tk1->end == tk2->end &&
60
+ tk1->pos_inc == tk2->pos_inc);
61
+ }
62
+
63
+ int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2)
64
+ {
65
+ int cmp;
66
+ if (tk1->start > tk2->start) {
67
+ cmp = 1;
68
+ }
69
+ else if (tk1->start < tk2->start) {
70
+ cmp = -1;
71
+ }
72
+ else {
73
+ if (tk1->end > tk2->end) {
74
+ cmp = 1;
75
+ }
76
+ else if (tk1->end < tk2->end) {
77
+ cmp = -1;
78
+ }
79
+ else {
80
+ cmp = strcmp((char *)tk1->text, (char *)tk2->text);
81
+ }
82
+ }
83
+ return cmp;
84
+ }
85
+
86
/* Releases a token by handing +p+ to free(). */
void frt_tk_destroy(void *p)
{
    void *token_mem = p;

    free(token_mem);
}
90
+
91
+ FrtToken *frt_tk_new()
92
+ {
93
+ return FRT_ALLOC(FrtToken);
94
+ }
95
+ /****************************************************************************
96
+ *
97
+ * TokenStream
98
+ *
99
+ ****************************************************************************/
100
+
101
+ void frt_ts_deref(FrtTokenStream *ts)
102
+ {
103
+ if (--ts->ref_cnt <= 0) {
104
+ ts->destroy_i(ts);
105
+ }
106
+ }
107
+
108
+ static FrtTokenStream *ts_reset(FrtTokenStream *ts, char *text)
109
+ {
110
+ ts->t = ts->text = text;
111
+ return ts;
112
+ }
113
+
114
+ FrtTokenStream *frt_ts_clone_size(FrtTokenStream *orig_ts, size_t size)
115
+ {
116
+ FrtTokenStream *ts = (FrtTokenStream *)frt_ecalloc(size);
117
+ memcpy(ts, orig_ts, size);
118
+ ts->ref_cnt = 1;
119
+ return ts;
120
+ }
121
+
122
+ FrtTokenStream *frt_ts_new_i(size_t size)
123
+ {
124
+ FrtTokenStream *ts = (FrtTokenStream *)frt_ecalloc(size);
125
+
126
+ ts->destroy_i = (void (*)(FrtTokenStream *))&free;
127
+ ts->reset = &ts_reset;
128
+ ts->ref_cnt = 1;
129
+
130
+ return ts;
131
+ }
132
+
133
+ /****************************************************************************
134
+ * CachedTokenStream
135
+ ****************************************************************************/
136
+
137
/* View a token stream pointer as a FrtCachedTokenStream pointer. */
#define CTS(ts_ptr) ((FrtCachedTokenStream *)(ts_ptr))
138
+
139
+ static FrtTokenStream *cts_clone_i(FrtTokenStream *orig_ts)
140
+ {
141
+ return frt_ts_clone_size(orig_ts, sizeof(FrtCachedTokenStream));
142
+ }
143
+
144
+ static FrtTokenStream *cts_new()
145
+ {
146
+ FrtTokenStream *ts = frt_ts_new(FrtCachedTokenStream);
147
+ ts->clone_i = &cts_clone_i;
148
+ return ts;
149
+ }
150
+
151
+ /* * Multi-byte TokenStream * */
152
+
153
/* View a token stream pointer as a FrtMultiByteTokenStream pointer. */
#define MBTS(ts_ptr) ((FrtMultiByteTokenStream *)(ts_ptr))
154
+
155
+ static int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
156
+ {
157
+ int num_bytes;
158
+ if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
159
+ const char *t = s;
160
+ do {
161
+ t++;
162
+ FRT_ZEROSET(state, mbstate_t);
163
+ num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
164
+ } while ((num_bytes < 0) && (*t != 0));
165
+ num_bytes = t - s;
166
+ if (*t == 0) *wchr = 0;
167
+ }
168
+ return num_bytes;
169
+ }
170
+
171
+ static FrtTokenStream *mb_ts_reset(FrtTokenStream *ts, char *text)
172
+ {
173
+ FRT_ZEROSET(&(MBTS(ts)->state), mbstate_t);
174
+ ts_reset(ts, text);
175
+ return ts;
176
+ }
177
+
178
+ static FrtTokenStream *mb_ts_clone_i(FrtTokenStream *orig_ts)
179
+ {
180
+ return frt_ts_clone_size(orig_ts, sizeof(FrtMultiByteTokenStream));
181
+ }
182
+
183
+ static FrtTokenStream *mb_ts_new()
184
+ {
185
+ FrtTokenStream *ts = frt_ts_new(FrtMultiByteTokenStream);
186
+ ts->reset = &mb_ts_reset;
187
+ ts->clone_i = &mb_ts_clone_i;
188
+ ts->ref_cnt = 1;
189
+ return ts;
190
+ }
191
+
192
+ /****************************************************************************
193
+ *
194
+ * Analyzer
195
+ *
196
+ ****************************************************************************/
197
+
198
+ void frt_a_deref(FrtAnalyzer *a)
199
+ {
200
+ if (--a->ref_cnt <= 0) {
201
+ a->destroy_i(a);
202
+ }
203
+ }
204
+
205
+ static void frt_a_standard_destroy_i(FrtAnalyzer *a)
206
+ {
207
+ if (a->current_ts) {
208
+ frt_ts_deref(a->current_ts);
209
+ }
210
+ free(a);
211
+ }
212
+
213
+ static FrtTokenStream *a_standard_get_ts(FrtAnalyzer *a,
214
+ FrtSymbol field,
215
+ char *text)
216
+ {
217
+ FrtTokenStream *ts;
218
+ (void)field;
219
+ ts = frt_ts_clone(a->current_ts);
220
+ return ts->reset(ts, text);
221
+ }
222
+
223
+ FrtAnalyzer *frt_analyzer_new(FrtTokenStream *ts,
224
+ void (*destroy_i)(FrtAnalyzer *a),
225
+ FrtTokenStream *(*get_ts)(FrtAnalyzer *a,
226
+ FrtSymbol field,
227
+ char *text))
228
+ {
229
+ FrtAnalyzer *a = FRT_ALLOC(FrtAnalyzer);
230
+ a->current_ts = ts;
231
+ a->destroy_i = (destroy_i ? destroy_i : &frt_a_standard_destroy_i);
232
+ a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
233
+ a->ref_cnt = 1;
234
+ return a;
235
+ }
236
+
237
+ /****************************************************************************
238
+ *
239
+ * Non
240
+ *
241
+ ****************************************************************************/
242
+
243
+ /*
244
+ * NonTokenizer
245
+ */
246
+ static FrtToken *nt_next(FrtTokenStream *ts)
247
+ {
248
+ if (ts->t) {
249
+ size_t len = strlen(ts->t);
250
+ ts->t = NULL;
251
+
252
+ return frt_tk_set(&(CTS(ts)->token), ts->text, len, 0, len, 1);
253
+ }
254
+ else {
255
+ return NULL;
256
+ }
257
+ }
258
+
259
+ FrtTokenStream *frt_non_tokenizer_new()
260
+ {
261
+ FrtTokenStream *ts = cts_new();
262
+ ts->next = &nt_next;
263
+ return ts;
264
+ }
265
+
266
+ /*
267
+ * NonAnalyzer
268
+ */
269
+ FrtAnalyzer *frt_non_analyzer_new()
270
+ {
271
+ return frt_analyzer_new(frt_non_tokenizer_new(), NULL, NULL);
272
+ }
273
+
274
+ /****************************************************************************
275
+ *
276
+ * Whitespace
277
+ *
278
+ ****************************************************************************/
279
+
280
+ /*
281
+ * WhitespaceTokenizer
282
+ */
283
+ static FrtToken *wst_next(FrtTokenStream *ts)
284
+ {
285
+ char *t = ts->t;
286
+ char *start;
287
+
288
+ while (*t != '\0' && isspace(*t)) {
289
+ t++;
290
+ }
291
+
292
+ if (*t == '\0') {
293
+ return NULL;
294
+ }
295
+
296
+ start = t;
297
+ while (*t != '\0' && !isspace(*t)) {
298
+ t++;
299
+ }
300
+
301
+ ts->t = t;
302
+ return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
303
+ }
304
+
305
+ FrtTokenStream *frt_whitespace_tokenizer_new()
306
+ {
307
+ FrtTokenStream *ts = cts_new();
308
+ ts->next = &wst_next;
309
+ return ts;
310
+ }
311
+
312
+ /*
313
+ * Multi-byte WhitespaceTokenizer
314
+ */
315
+ static FrtToken *mb_wst_next(FrtTokenStream *ts)
316
+ {
317
+ int i;
318
+ char *start;
319
+ char *t = ts->t;
320
+ wchar_t wchr;
321
+ mbstate_t *state = &(MBTS(ts)->state);
322
+
323
+ i = mb_next_char(&wchr, t, state);
324
+ while (wchr != 0 && iswspace(wchr)) {
325
+ t += i;
326
+ i = mb_next_char(&wchr, t, state);
327
+ }
328
+ if (wchr == 0) {
329
+ return NULL;
330
+ }
331
+
332
+ start = t;
333
+ t += i;
334
+ i = mb_next_char(&wchr, t, state);
335
+ while (wchr != 0 && !iswspace(wchr)) {
336
+ t += i;
337
+ i = mb_next_char(&wchr, t, state);
338
+ }
339
+ ts->t = t;
340
+ return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
341
+ }
342
+
343
+ /*
344
+ * Lowercasing Multi-byte WhitespaceTokenizer
345
+ */
346
+ static FrtToken *mb_wst_next_lc(FrtTokenStream *ts)
347
+ {
348
+ int i;
349
+ char *start;
350
+ char *t = ts->t;
351
+ wchar_t wchr;
352
+ wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *w, *w_end;
353
+ mbstate_t *state = &(MBTS(ts)->state);
354
+
355
+ w = wbuf;
356
+ w_end = &wbuf[FRT_MAX_WORD_SIZE];
357
+
358
+ i = mb_next_char(&wchr, t, state);
359
+ while (wchr != 0 && iswspace(wchr)) {
360
+ t += i;
361
+ i = mb_next_char(&wchr, t, state);
362
+ }
363
+ if (wchr == 0) {
364
+ return NULL;
365
+ }
366
+
367
+ start = t;
368
+ t += i;
369
+ *w++ = towlower(wchr);
370
+ i = mb_next_char(&wchr, t, state);
371
+ while (wchr != 0 && !iswspace(wchr)) {
372
+ if (w < w_end) {
373
+ *w++ = towlower(wchr);
374
+ }
375
+ t += i;
376
+ i = mb_next_char(&wchr, t, state);
377
+ }
378
+ *w = 0;
379
+ ts->t = t;
380
+ return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
381
+ (off_t)(t - ts->text), 1);
382
+ }
383
+
384
+ FrtTokenStream *frt_mb_whitespace_tokenizer_new(bool lowercase)
385
+ {
386
+ FrtTokenStream *ts = mb_ts_new();
387
+ ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
388
+ return ts;
389
+ }
390
+
391
+ /*
392
+ * WhitespaceAnalyzers
393
+ */
394
+ FrtAnalyzer *frt_whitespace_analyzer_new(bool lowercase)
395
+ {
396
+ FrtTokenStream *ts;
397
+ if (lowercase) {
398
+ ts = frt_lowercase_filter_new(frt_whitespace_tokenizer_new());
399
+ }
400
+ else {
401
+ ts = frt_whitespace_tokenizer_new();
402
+ }
403
+ return frt_analyzer_new(ts, NULL, NULL);
404
+ }
405
+
406
+ FrtAnalyzer *frt_mb_whitespace_analyzer_new(bool lowercase)
407
+ {
408
+ return frt_analyzer_new(frt_mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
409
+ }
410
+
411
+ /****************************************************************************
412
+ *
413
+ * Letter
414
+ *
415
+ ****************************************************************************/
416
+
417
+ /*
418
+ * LetterTokenizer
419
+ */
420
+ static FrtToken *lt_next(FrtTokenStream *ts)
421
+ {
422
+ char *start;
423
+ char *t = ts->t;
424
+
425
+ while (*t != '\0' && !isalpha(*t)) {
426
+ t++;
427
+ }
428
+
429
+ if (*t == '\0') {
430
+ return NULL;
431
+ }
432
+
433
+ start = t;
434
+ while (*t != '\0' && isalpha(*t)) {
435
+ t++;
436
+ }
437
+
438
+ ts->t = t;
439
+ return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
440
+ }
441
+
442
+ FrtTokenStream *frt_letter_tokenizer_new()
443
+ {
444
+ FrtTokenStream *ts = cts_new();
445
+ ts->next = &lt_next;
446
+ return ts;
447
+ }
448
+
449
+ /*
450
+ * Multi-byte LetterTokenizer
451
+ */
452
+ static FrtToken *mb_lt_next(FrtTokenStream *ts)
453
+ {
454
+ int i;
455
+ char *start;
456
+ char *t = ts->t;
457
+ wchar_t wchr;
458
+ mbstate_t *state = &(MBTS(ts)->state);
459
+
460
+ i = mb_next_char(&wchr, t, state);
461
+ while (wchr != 0 && !iswalpha(wchr)) {
462
+ t += i;
463
+ i = mb_next_char(&wchr, t, state);
464
+ }
465
+
466
+ if (wchr == 0) {
467
+ return NULL;
468
+ }
469
+
470
+ start = t;
471
+ t += i;
472
+ i = mb_next_char(&wchr, t, state);
473
+ while (wchr != 0 && iswalpha(wchr)) {
474
+ t += i;
475
+ i = mb_next_char(&wchr, t, state);
476
+ }
477
+ ts->t = t;
478
+ return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
479
+ }
480
+
481
+ /*
482
+ * Lowercasing Multi-byte LetterTokenizer
483
+ */
484
+ static FrtToken *mb_lt_next_lc(FrtTokenStream *ts)
485
+ {
486
+ int i;
487
+ char *start;
488
+ char *t = ts->t;
489
+ wchar_t wchr;
490
+ wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *w, *w_end;
491
+ mbstate_t *state = &(MBTS(ts)->state);
492
+
493
+ w = wbuf;
494
+ w_end = &wbuf[FRT_MAX_WORD_SIZE];
495
+
496
+ i = mb_next_char(&wchr, t, state);
497
+ while (wchr != 0 && !iswalpha(wchr)) {
498
+ t += i;
499
+ i = mb_next_char(&wchr, t, state);
500
+ }
501
+ if (wchr == 0) {
502
+ return NULL;
503
+ }
504
+
505
+ start = t;
506
+ t += i;
507
+ *w++ = towlower(wchr);
508
+ i = mb_next_char(&wchr, t, state);
509
+ while (wchr != 0 && iswalpha(wchr)) {
510
+ if (w < w_end) {
511
+ *w++ = towlower(wchr);
512
+ }
513
+ t += i;
514
+ i = mb_next_char(&wchr, t, state);
515
+ }
516
+ *w = 0;
517
+ ts->t = t;
518
+ return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
519
+ (off_t)(t - ts->text), 1);
520
+ }
521
+
522
+ FrtTokenStream *frt_mb_letter_tokenizer_new(bool lowercase)
523
+ {
524
+ FrtTokenStream *ts = mb_ts_new();
525
+ ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
526
+ return ts;
527
+ }
528
+
529
+ /*
530
+ * LetterAnalyzers
531
+ */
532
+ FrtAnalyzer *frt_letter_analyzer_new(bool lowercase)
533
+ {
534
+ FrtTokenStream *ts;
535
+ if (lowercase) {
536
+ ts = frt_lowercase_filter_new(frt_letter_tokenizer_new());
537
+ }
538
+ else {
539
+ ts = frt_letter_tokenizer_new();
540
+ }
541
+ return frt_analyzer_new(ts, NULL, NULL);
542
+ }
543
+
544
+ FrtAnalyzer *frt_mb_letter_analyzer_new(bool lowercase)
545
+ {
546
+ return frt_analyzer_new(frt_mb_letter_tokenizer_new(lowercase), NULL, NULL);
547
+ }
548
+
549
+ /****************************************************************************
550
+ *
551
+ * Standard
552
+ *
553
+ ****************************************************************************/
554
+
555
/* View a token stream pointer as a FrtStandardTokenizer pointer. */
#define STDTS(ts_ptr) ((FrtStandardTokenizer *)(ts_ptr))
556
+
557
+ /*
558
+ * FrtStandardTokenizer
559
+ */
560
+ static FrtToken *std_next(FrtTokenStream *ts)
561
+ {
562
+ FrtStandardTokenizer *std_tz = STDTS(ts);
563
+ const char *start = NULL;
564
+ const char *end = NULL;
565
+ int len;
566
+ FrtToken *tk = &(CTS(ts)->token);
567
+
568
+ switch (std_tz->type) {
569
+ case FRT_STT_ASCII:
570
+ frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1,
571
+ &start, &end, &len);
572
+ break;
573
+ case FRT_STT_MB:
574
+ frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1,
575
+ &start, &end, &len);
576
+ break;
577
+ case FRT_STT_UTF8:
578
+ frt_std_scan_utf8(ts->t, tk->text, sizeof(tk->text) - 1,
579
+ &start, &end, &len);
580
+ break;
581
+ }
582
+
583
+ if (len == 0)
584
+ return NULL;
585
+
586
+ ts->t = (char *)end;
587
+ tk->len = len;
588
+ tk->start = start - ts->text;
589
+ tk->end = end - ts->text;
590
+ tk->pos_inc = 1;
591
+ return &(CTS(ts)->token);
592
+ }
593
+
594
+ static FrtTokenStream *std_ts_clone_i(FrtTokenStream *orig_ts)
595
+ {
596
+ return frt_ts_clone_size(orig_ts, sizeof(FrtStandardTokenizer));
597
+ }
598
+
599
+ static FrtTokenStream *std_ts_new()
600
+ {
601
+ FrtTokenStream *ts = frt_ts_new(FrtStandardTokenizer);
602
+
603
+ ts->clone_i = &std_ts_clone_i;
604
+ ts->next = &std_next;
605
+
606
+ return ts;
607
+ }
608
+
609
+ FrtTokenStream *frt_standard_tokenizer_new()
610
+ {
611
+ FrtTokenStream *ts = std_ts_new();
612
+ STDTS(ts)->type = FRT_STT_ASCII;
613
+ return ts;
614
+ }
615
+
616
+ FrtTokenStream *frt_mb_standard_tokenizer_new()
617
+ {
618
+ FrtTokenStream *ts = std_ts_new();
619
+ STDTS(ts)->type = FRT_STT_MB;
620
+ return ts;
621
+ }
622
+
623
+ FrtTokenStream *frt_utf8_standard_tokenizer_new()
624
+ {
625
+ FrtTokenStream *ts = std_ts_new();
626
+ STDTS(ts)->type = FRT_STT_UTF8;
627
+ return ts;
628
+ }
629
+
630
+ /****************************************************************************
631
+ *
632
+ * LegacyStandard
633
+ *
634
+ ****************************************************************************/
635
+
636
/* View a token stream pointer as a FrtLegacyStandardTokenizer pointer. */
#define LSTDTS(ts_ptr) ((FrtLegacyStandardTokenizer *)(ts_ptr))
637
+
638
+ /*
639
+ * LegacyStandardTokenizer
640
+ */
641
+ static int legacy_std_get_alpha(FrtTokenStream *ts, char *token)
642
+ {
643
+ int i = 0;
644
+ char *t = ts->t;
645
+ while (t[i] != '\0' && isalnum(t[i])) {
646
+ if (i < FRT_MAX_WORD_SIZE) {
647
+ token[i] = t[i];
648
+ }
649
+ i++;
650
+ }
651
+ return i;
652
+ }
653
+
654
+ static int mb_legacy_std_get_alpha(FrtTokenStream *ts, char *token)
655
+ {
656
+ char *t = ts->t;
657
+ wchar_t wchr;
658
+ int i;
659
+ mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
660
+
661
+ i = mb_next_char(&wchr, t, &state);
662
+
663
+ while (wchr != 0 && iswalnum(wchr)) {
664
+ t += i;
665
+ i = mb_next_char(&wchr, t, &state);
666
+ }
667
+
668
+ i = (int)(t - ts->t);
669
+ if (i > FRT_MAX_WORD_SIZE) {
670
+ i = FRT_MAX_WORD_SIZE - 1;
671
+ }
672
+ memcpy(token, ts->t, i);
673
+ return i;
674
+ }
675
+
676
/*
 * Returns 1 when +c+ is one of the punctuation characters allowed inside a
 * number token: . , \ / _ -   Returns 0 otherwise.
 */
static int isnumpunc(char c)
{
    switch (c) {
        case '.':
        case ',':
        case '\\':
        case '/':
        case '_':
        case '-':
            return 1;
        default:
            return 0;
    }
}
681
+
682
/*
 * Wide-character variant of isnumpunc(): returns 1 when +c+ is one of
 * . , \ / _ -   and 0 otherwise.
 */
static int w_isnumpunc(wchar_t c)
{
    switch (c) {
        case L'.':
        case L',':
        case L'\\':
        case L'/':
        case L'_':
        case L'-':
            return 1;
        default:
            return 0;
    }
}
687
+
688
/* Returns 1 when +c+ is one of the URL punctuation characters . / - _ */
static int isurlpunc(char c)
{
    switch (c) {
        case '.':
        case '/':
        case '-':
        case '_':
            return 1;
        default:
            return 0;
    }
}
692
+
693
/*
 * Returns 1 when +c+ may appear in a URL token: an alphanumeric character or
 * one of . / - _   Returns 0 otherwise.
 *
 * +c+ is cast to unsigned char before being passed to isalnum(): plain char
 * may be signed, and passing a negative value other than EOF to the
 * <ctype.h> classifiers is undefined behaviour.
 */
static int isurlc(char c)
{
    return (c == '.' || c == '/' || c == '-' || c == '_'
            || isalnum((unsigned char)c));
}
697
+
698
/* Returns 1 when +c+ is URL punctuation or an at-sign: . / - _ @ */
static int isurlxatpunc(char c)
{
    switch (c) {
        case '.':
        case '/':
        case '-':
        case '_':
        case '@':
            return 1;
        default:
            return 0;
    }
}
702
+
703
/*
 * Returns 1 when +c+ is an alphanumeric character or one of . / - _ @
 * Returns 0 otherwise.
 *
 * +c+ is cast to unsigned char before being passed to isalnum(): plain char
 * may be signed, and passing a negative value other than EOF to the
 * <ctype.h> classifiers is undefined behaviour.
 */
static int isurlxatc(char c)
{
    return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
            || isalnum((unsigned char)c));
}
708
+
709
/*
 * Returns true when the byte at +c+ can be part of a legacy standard token:
 * an alphanumeric character, number punctuation (see isnumpunc()), or one of
 * & @ ' :   Whitespace and everything else returns false.
 *
 * The byte is converted to unsigned char before being passed to isspace()
 * and isalnum(): plain char may be signed, and passing a negative value
 * other than EOF to the <ctype.h> classifiers is undefined behaviour.
 */
static bool legacy_std_is_tok_char(char *c)
{
    const unsigned char uc = (unsigned char)*c;

    if (isspace(uc)) {
        return false; /* most common so check first. */
    }
    if (isalnum(uc) || isnumpunc(*c) || *c == '&' ||
        *c == '@' || *c == '\'' || *c == ':') {
        return true;
    }
    return false;
}
720
+
721
+ static bool mb_legacy_std_is_tok_char(char *t)
722
+ {
723
+ wchar_t c;
724
+ mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
725
+
726
+ if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
727
+ /* error which we can handle next time round. For now just return
728
+ * false so that we can return a token */
729
+ return false;
730
+ }
731
+ if (iswspace(c)) {
732
+ return false; /* most common so check first. */
733
+ }
734
+ if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
735
+ || c == L':') {
736
+ return true;
737
+ }
738
+ return false;
739
+ }
740
+
741
/* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
 * least one digit.
 * (alnum) = [a-zA-Z0-9]
 * (punc) = [_\/.,-]
 *
 * Returns the length in bytes of the number-like prefix of input, or 0 when
 * no digit was seen at all.
 *
 * last_seen_digit is a countdown that is reset to 2 whenever a digit shows up
 * and decremented after every alnum run; scanning stops once it drops below
 * zero or the next byte pair is not (punc)(alnum).
 *
 * Bytes are converted to unsigned char before being handed to <ctype.h>
 * functions; passing a negative plain char there is undefined behaviour.
 */
static int legacy_std_get_number(char *input)
{
    int i = 0;
    int count = 0;
    int last_seen_digit = 2;
    int seen_digit = false;

    while (last_seen_digit >= 0) {
        /* consume one run of alphanumerics, noting any digit in it */
        while ((input[i] != '\0') && isalnum((unsigned char)input[i])) {
            if (isdigit((unsigned char)input[i])) {
                if (last_seen_digit < 2) {
                    last_seen_digit = 2;
                }
                seen_digit = true;
            }
            i++;
        }
        last_seen_digit--;

        /* input[i + 1] is only read when input[i] is punctuation, i.e. not
         * the terminator, so this never looks beyond the NUL */
        if (!isnumpunc(input[i]) || !isalnum((unsigned char)input[i + 1])) {
            if (last_seen_digit >= 0) {
                count = i;
            }
            break;
        }
        count = i;
        i++; /* step over the punctuation byte */
    }

    return seen_digit ? count : 0;
}
781
+
782
/*
 * Returns the length in bytes of the leading run of letters and apostrophes
 * in input (0 when input starts with anything else).
 *
 * Bytes are converted to unsigned char before being handed to isalpha();
 * passing a negative plain char there is undefined behaviour.
 */
static int legacy_std_get_apostrophe(char *input)
{
    char *t = input;

    while (isalpha((unsigned char)*t) || *t == '\'') {
        t++;
    }

    return (int)(t - input);
}
792
+
793
+ static int mb_legacy_std_get_apostrophe(char *input)
794
+ {
795
+ char *t = input;
796
+ wchar_t wchr;
797
+ int i;
798
+ mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
799
+
800
+ i = mb_next_char(&wchr, t, &state);
801
+
802
+ while (iswalpha(wchr) || wchr == L'\'') {
803
+ t += i;
804
+ i = mb_next_char(&wchr, t, &state);
805
+ }
806
+ return (int)(t - input);
807
+ }
808
+
809
+ static char *std_get_url(char *input, char *token, int i, int *len)
810
+ {
811
+ char *next = NULL;
812
+ while (isurlc(input[i])) {
813
+ if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
814
+ break; /* can't have two puncs in a row */
815
+ }
816
+ if (i < FRT_MAX_WORD_SIZE) {
817
+ token[i] = input[i];
818
+ }
819
+ i++;
820
+ }
821
+ next = input + i;
822
+
823
+ /* We don't want to index past the end of the token capacity) */
824
+ if (i >= FRT_MAX_WORD_SIZE) {
825
+ i = FRT_MAX_WORD_SIZE - 1;
826
+ }
827
+
828
+ /* strip trailing puncs */
829
+ while (isurlpunc(input[i - 1])) {
830
+ i--;
831
+ }
832
+ *len = i;
833
+ token[i] = '\0';
834
+
835
+ return next;
836
+ }
837
+
838
/*
 * Company names can contain '@' and '&' like AT&T and Excite@Home.
 *
 * Returns the length in bytes of the leading run of letters, '@' and '&' in
 * input (0 when input starts with anything else).
 *
 * Bytes are converted to unsigned char before being handed to isalpha();
 * passing a negative plain char there is undefined behaviour.
 */
static int legacy_std_get_company_name(char *input)
{
    int i = 0;

    while (isalpha((unsigned char)input[i])
           || input[i] == '@' || input[i] == '&') {
        i++;
    }

    return i;
}
849
+
850
+ static bool legacy_std_advance_to_start(FrtTokenStream *ts)
851
+ {
852
+ char *t = ts->t;
853
+ while (*t != '\0' && !isalnum(*t)) {
854
+ if (isnumpunc(*t) && isdigit(t[1])) break;
855
+ t++;
856
+ }
857
+
858
+ ts->t = t;
859
+
860
+ return (*t != '\0');
861
+ }
862
+
863
+ static bool mb_legacy_std_advance_to_start(FrtTokenStream *ts)
864
+ {
865
+ int i;
866
+ wchar_t wchr;
867
+ mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
868
+
869
+ i = mb_next_char(&wchr, ts->t, &state);
870
+
871
+ while (wchr != 0 && !iswalnum(wchr)) {
872
+ if (isnumpunc(*ts->t) && isdigit(ts->t[1])) break;
873
+ ts->t += i;
874
+ i = mb_next_char(&wchr, ts->t, &state);
875
+ }
876
+
877
+ return (wchr != 0);
878
+ }
879
+
880
+ static FrtToken *legacy_std_next(FrtTokenStream *ts)
881
+ {
882
+ FrtLegacyStandardTokenizer *std_tz = LSTDTS(ts);
883
+ char *s;
884
+ char *t;
885
+ char *start = NULL;
886
+ char *num_end = NULL;
887
+ char token[FRT_MAX_WORD_SIZE + 1];
888
+ int token_i = 0;
889
+ int len;
890
+ bool is_acronym;
891
+ bool seen_at_symbol;
892
+
893
+
894
+ if (!std_tz->advance_to_start(ts)) {
895
+ return NULL;
896
+ }
897
+
898
+ start = t = ts->t;
899
+ token_i = std_tz->get_alpha(ts, token);
900
+ t += token_i;
901
+
902
+ if (!std_tz->is_tok_char(t)) {
903
+ /* very common case, ie a plain word, so check and return */
904
+ ts->t = t;
905
+ return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
906
+ }
907
+
908
+ if (*t == '\'') { /* apostrophe case. */
909
+ t += std_tz->get_apostrophe(t);
910
+ ts->t = t;
911
+ len = (int)(t - start);
912
+ /* strip possesive */
913
+ if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
914
+ t -= 2;
915
+ frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
916
+ CTS(ts)->token.end += 2;
917
+ }
918
+ else if (t[-1] == '\'') {
919
+ t -= 1;
920
+ frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
921
+ CTS(ts)->token.end += 1;
922
+ }
923
+ else {
924
+ frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
925
+ }
926
+
927
+ return &(CTS(ts)->token);
928
+ }
929
+
930
+ if (*t == '&') { /* apostrophe case. */
931
+ t += legacy_std_get_company_name(t);
932
+ ts->t = t;
933
+ return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
934
+ }
935
+
936
+ if ((isdigit(*start) || isnumpunc(*start)) /* possibly a number */
937
+ && ((len = legacy_std_get_number(start)) > 0)) {
938
+ num_end = start + len;
939
+ if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
940
+ ts->t = num_end;
941
+ return frt_tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
942
+ }
943
+ /* else there may be a longer token so check */
944
+ }
945
+
946
+ if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
947
+ /* check for a known url start */
948
+ token[token_i] = '\0';
949
+ t += 3;
950
+ token_i += 3;
951
+ while (*t == '/') {
952
+ t++;
953
+ }
954
+ if (isalpha(*t) &&
955
+ (memcmp(token, "ftp", 3) == 0 ||
956
+ memcmp(token, "http", 4) == 0 ||
957
+ memcmp(token, "https", 5) == 0 ||
958
+ memcmp(token, "file", 4) == 0)) {
959
+ ts->t = std_get_url(t, token, 0, &len); /* dispose of first part of the URL */
960
+ }
961
+ else { /* still treat as url but keep the first part */
962
+ token_i = (int)(t - start);
963
+ memcpy(token, start, token_i * sizeof(char));
964
+ ts->t = std_get_url(start, token, token_i, &len); /* keep start */
965
+ }
966
+ return frt_tk_set(&(CTS(ts)->token), token, len,
967
+ (off_t)(start - ts->text),
968
+ (off_t)(ts->t - ts->text), 1);
969
+ }
970
+
971
+ /* now see how long a url we can find. */
972
+ is_acronym = true;
973
+ seen_at_symbol = false;
974
+ while (isurlxatc(*t)) {
975
+ if (is_acronym && !isalpha(*t) && (*t != '.')) {
976
+ is_acronym = false;
977
+ }
978
+ if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
979
+ break; /* can't have two punctuation characters in a row */
980
+ }
981
+ if (*t == '@') {
982
+ if (seen_at_symbol) {
983
+ break; /* we can only have one @ symbol */
984
+ }
985
+ else {
986
+ seen_at_symbol = true;
987
+ }
988
+ }
989
+ t++;
990
+ }
991
+ while (isurlxatpunc(t[-1]) && t > ts->t) {
992
+ t--; /* strip trailing punctuation */
993
+ }
994
+
995
+ if (t < ts->t || (num_end != NULL && num_end < ts->t)) {
996
+ fprintf(stderr, "Warning: encoding error. Please check that you are using the correct locale for your input");
997
+ return NULL;
998
+ } else if (num_end == NULL || t > num_end) {
999
+ ts->t = t;
1000
+
1001
+ if (is_acronym) { /* check it is one letter followed by one '.' */
1002
+ for (s = start; s < t - 1; s++) {
1003
+ if (isalpha(*s) && (s[1] != '.'))
1004
+ is_acronym = false;
1005
+ }
1006
+ }
1007
+ if (is_acronym) { /* strip '.'s */
1008
+ for (s = start + token_i; s < t; s++) {
1009
+ if (*s != '.') {
1010
+ token[token_i] = *s;
1011
+ token_i++;
1012
+ }
1013
+ }
1014
+ frt_tk_set(&(CTS(ts)->token), token, token_i,
1015
+ (off_t)(start - ts->text),
1016
+ (off_t)(t - ts->text), 1);
1017
+ }
1018
+ else { /* just return the url as is */
1019
+ frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
1020
+ }
1021
+ }
1022
+ else { /* return the number */
1023
+ ts->t = num_end;
1024
+ frt_tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
1025
+ }
1026
+
1027
+ return &(CTS(ts)->token);
1028
+ }
1029
+
1030
+ static FrtTokenStream *legacy_std_ts_clone_i(FrtTokenStream *orig_ts)
1031
+ {
1032
+ return frt_ts_clone_size(orig_ts, sizeof(FrtLegacyStandardTokenizer));
1033
+ }
1034
+
1035
+ static FrtTokenStream *legacy_std_ts_new()
1036
+ {
1037
+ FrtTokenStream *ts = frt_ts_new(FrtLegacyStandardTokenizer);
1038
+
1039
+ ts->clone_i = &legacy_std_ts_clone_i;
1040
+ ts->next = &legacy_std_next;
1041
+
1042
+ return ts;
1043
+ }
1044
+
1045
+ FrtTokenStream *frt_legacy_standard_tokenizer_new()
1046
+ {
1047
+ FrtTokenStream *ts = legacy_std_ts_new();
1048
+
1049
+ LSTDTS(ts)->advance_to_start = &legacy_std_advance_to_start;
1050
+ LSTDTS(ts)->get_alpha = &legacy_std_get_alpha;
1051
+ LSTDTS(ts)->is_tok_char = &legacy_std_is_tok_char;
1052
+ LSTDTS(ts)->get_apostrophe = &legacy_std_get_apostrophe;
1053
+
1054
+ return ts;
1055
+ }
1056
+
1057
+ FrtTokenStream *frt_mb_legacy_standard_tokenizer_new()
1058
+ {
1059
+ FrtTokenStream *ts = legacy_std_ts_new();
1060
+
1061
+ LSTDTS(ts)->advance_to_start = &mb_legacy_std_advance_to_start;
1062
+ LSTDTS(ts)->get_alpha = &mb_legacy_std_get_alpha;
1063
+ LSTDTS(ts)->is_tok_char = &mb_legacy_std_is_tok_char;
1064
+ LSTDTS(ts)->get_apostrophe = &mb_legacy_std_get_apostrophe;
1065
+
1066
+ return ts;
1067
+ }
1068
+
1069
+ /****************************************************************************
1070
+ *
1071
+ * Filters
1072
+ *
1073
+ ****************************************************************************/
1074
+
1075
+ #define TkFilt(filter) ((FrtTokenFilter *)(filter))
1076
+
1077
+ FrtTokenStream *frt_filter_clone_size(FrtTokenStream *ts, size_t size)
1078
+ {
1079
+ FrtTokenStream *ts_new = frt_ts_clone_size(ts, size);
1080
+ TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
1081
+ return ts_new;
1082
+ }
1083
+
1084
+ static FrtTokenStream *filter_clone_i(FrtTokenStream *ts)
1085
+ {
1086
+ return frt_filter_clone_size(ts, sizeof(FrtTokenFilter));
1087
+ }
1088
+
1089
+ static FrtTokenStream *filter_reset(FrtTokenStream *ts, char *text)
1090
+ {
1091
+ TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
1092
+ return ts;
1093
+ }
1094
+
1095
+ static void filter_destroy_i(FrtTokenStream *ts)
1096
+ {
1097
+ frt_ts_deref(TkFilt(ts)->sub_ts);
1098
+ free(ts);
1099
+ }
1100
+
1101
+ FrtTokenStream *frt_tf_new_i(size_t size, FrtTokenStream *sub_ts)
1102
+ {
1103
+ FrtTokenStream *ts = (FrtTokenStream *)frt_ecalloc(size);
1104
+
1105
+ TkFilt(ts)->sub_ts = sub_ts;
1106
+
1107
+ ts->clone_i = &filter_clone_i;
1108
+ ts->destroy_i = &filter_destroy_i;
1109
+ ts->reset = &filter_reset;
1110
+ ts->ref_cnt = 1;
1111
+
1112
+ return ts;
1113
+ }
1114
+
1115
+ /****************************************************************************
1116
+ * FrtStopFilter
1117
+ ****************************************************************************/
1118
+
1119
+ #define StopFilt(filter) ((FrtStopFilter *)(filter))
1120
+
1121
+ static void sf_destroy_i(FrtTokenStream *ts)
1122
+ {
1123
+ frt_h_destroy(StopFilt(ts)->words);
1124
+ filter_destroy_i(ts);
1125
+ }
1126
+
1127
+ static FrtTokenStream *sf_clone_i(FrtTokenStream *orig_ts)
1128
+ {
1129
+ FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtMappingFilter));
1130
+ FRT_REF(StopFilt(new_ts)->words);
1131
+ return new_ts;
1132
+ }
1133
+
1134
+ static FrtToken *sf_next(FrtTokenStream *ts)
1135
+ {
1136
+ int pos_inc = 0;
1137
+ FrtHash *words = StopFilt(ts)->words;
1138
+ FrtTokenFilter *tf = TkFilt(ts);
1139
+ FrtToken *tk = tf->sub_ts->next(tf->sub_ts);
1140
+
1141
+ while ((tk != NULL) && (frt_h_get(words, tk->text) != NULL)) {
1142
+ pos_inc += tk->pos_inc;
1143
+ tk = tf->sub_ts->next(tf->sub_ts);
1144
+ }
1145
+
1146
+ if (tk != NULL) {
1147
+ tk->pos_inc += pos_inc;
1148
+ }
1149
+
1150
+ return tk;
1151
+ }
1152
+
1153
+ FrtTokenStream *frt_stop_filter_new_with_words_len(FrtTokenStream *sub_ts,
1154
+ const char **words, int len)
1155
+ {
1156
+ int i;
1157
+ char *word;
1158
+ FrtHash *word_table = frt_h_new_str(&free, (frt_free_ft) NULL);
1159
+ FrtTokenStream *ts = tf_new(FrtStopFilter, sub_ts);
1160
+
1161
+ for (i = 0; i < len; i++) {
1162
+ word = frt_estrdup(words[i]);
1163
+ frt_h_set(word_table, word, word);
1164
+ }
1165
+ StopFilt(ts)->words = word_table;
1166
+ ts->next = &sf_next;
1167
+ ts->destroy_i = &sf_destroy_i;
1168
+ ts->clone_i = &sf_clone_i;
1169
+ return ts;
1170
+ }
1171
+
1172
+ FrtTokenStream *frt_stop_filter_new_with_words(FrtTokenStream *sub_ts,
1173
+ const char **words)
1174
+ {
1175
+ char *word;
1176
+ FrtHash *word_table = frt_h_new_str(&free, (frt_free_ft) NULL);
1177
+ FrtTokenStream *ts = tf_new(FrtStopFilter, sub_ts);
1178
+
1179
+ while (*words) {
1180
+ word = frt_estrdup(*words);
1181
+ frt_h_set(word_table, word, word);
1182
+ words++;
1183
+ }
1184
+
1185
+ StopFilt(ts)->words = word_table;
1186
+ ts->next = &sf_next;
1187
+ ts->destroy_i = &sf_destroy_i;
1188
+ ts->clone_i = &sf_clone_i;
1189
+ return ts;
1190
+ }
1191
+
1192
+ FrtTokenStream *frt_stop_filter_new(FrtTokenStream *ts)
1193
+ {
1194
+ return frt_stop_filter_new_with_words(ts, FRT_FULL_ENGLISH_STOP_WORDS);
1195
+ }
1196
+
1197
+ /****************************************************************************
1198
+ * MappingFilter
1199
+ ****************************************************************************/
1200
+
1201
+ #define MFilt(filter) ((FrtMappingFilter *)(filter))
1202
+
1203
+ static void mf_destroy_i(FrtTokenStream *ts)
1204
+ {
1205
+ frt_mulmap_destroy(MFilt(ts)->mapper);
1206
+ filter_destroy_i(ts);
1207
+ }
1208
+
1209
+ static FrtTokenStream *mf_clone_i(FrtTokenStream *orig_ts)
1210
+ {
1211
+ FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtMappingFilter));
1212
+ FRT_REF(MFilt(new_ts)->mapper);
1213
+ return new_ts;
1214
+ }
1215
+
1216
+ static FrtToken *mf_next(FrtTokenStream *ts)
1217
+ {
1218
+ char buf[FRT_MAX_WORD_SIZE + 1];
1219
+ FrtMultiMapper *mapper = MFilt(ts)->mapper;
1220
+ FrtTokenFilter *tf = TkFilt(ts);
1221
+ FrtToken *tk = tf->sub_ts->next(tf->sub_ts);
1222
+ if (tk != NULL) {
1223
+ tk->len = frt_mulmap_map_len(mapper, buf, tk->text, FRT_MAX_WORD_SIZE);
1224
+ memcpy(tk->text, buf, tk->len + 1);
1225
+ }
1226
+ return tk;
1227
+ }
1228
+
1229
+ static FrtTokenStream *mf_reset(FrtTokenStream *ts, char *text)
1230
+ {
1231
+ FrtMultiMapper *mm = MFilt(ts)->mapper;
1232
+ if (mm->d_size == 0) {
1233
+ frt_mulmap_compile(MFilt(ts)->mapper);
1234
+ }
1235
+ filter_reset(ts, text);
1236
+ return ts;
1237
+ }
1238
+
1239
+ FrtTokenStream *frt_mapping_filter_new(FrtTokenStream *sub_ts)
1240
+ {
1241
+ FrtTokenStream *ts = tf_new(FrtMappingFilter, sub_ts);
1242
+ MFilt(ts)->mapper = frt_mulmap_new();
1243
+ ts->next = &mf_next;
1244
+ ts->destroy_i = &mf_destroy_i;
1245
+ ts->clone_i = &mf_clone_i;
1246
+ ts->reset = &mf_reset;
1247
+ return ts;
1248
+ }
1249
+
1250
+ FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern,
1251
+ const char *replacement)
1252
+ {
1253
+ frt_mulmap_add_mapping(MFilt(ts)->mapper, pattern, replacement);
1254
+ return ts;
1255
+ }
1256
+
1257
+ /****************************************************************************
1258
+ * HyphenFilter
1259
+ ****************************************************************************/
1260
+
1261
+ #define HyphenFilt(filter) ((FrtHyphenFilter *)(filter))
1262
+
1263
+ static FrtTokenStream *hf_clone_i(FrtTokenStream *orig_ts)
1264
+ {
1265
+ FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtHyphenFilter));
1266
+ return new_ts;
1267
+ }
1268
+
1269
+ static FrtToken *hf_next(FrtTokenStream *ts)
1270
+ {
1271
+ FrtHyphenFilter *hf = HyphenFilt(ts);
1272
+ FrtTokenFilter *tf = TkFilt(ts);
1273
+ FrtToken *tk = hf->tk;
1274
+
1275
+ if (hf->pos < hf->len) {
1276
+ const int pos = hf->pos;
1277
+ const int text_len = strlen(hf->text + pos);
1278
+ strcpy(tk->text, hf->text + pos);
1279
+ tk->pos_inc = ((pos != 0) ? 1 : 0);
1280
+ tk->start = hf->start + pos;
1281
+ tk->end = tk->start + text_len;
1282
+ hf->pos += text_len + 1;
1283
+ tk->len = text_len;
1284
+ return tk;
1285
+ }
1286
+ else {
1287
+ char *p;
1288
+ bool seen_hyphen = false;
1289
+ bool seen_other_punc = false;
1290
+ hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
1291
+ if (NULL == tk) return NULL;
1292
+ p = tk->text + 1;
1293
+ while (*p) {
1294
+ if (*p == '-') {
1295
+ seen_hyphen = true;
1296
+ }
1297
+ else if (!isalpha(*p)) {
1298
+ seen_other_punc = true;
1299
+ break;
1300
+ }
1301
+ p++;
1302
+ }
1303
+ if (seen_hyphen && !seen_other_punc) {
1304
+ char *q = hf->text;
1305
+ char *r = tk->text;
1306
+ p = tk->text;
1307
+ while (*p) {
1308
+ if (*p == '-') {
1309
+ *q = '\0';
1310
+ }
1311
+ else {
1312
+ *r = *q = *p;
1313
+ r++;
1314
+ }
1315
+ q++;
1316
+ p++;
1317
+ }
1318
+ *r = *q = '\0';
1319
+ hf->start = tk->start;
1320
+ hf->pos = 0;
1321
+ hf->len = q - hf->text;
1322
+ tk->len = r - tk->text;
1323
+ }
1324
+ }
1325
+ return tk;
1326
+ }
1327
+
1328
+ FrtTokenStream *frt_hyphen_filter_new(FrtTokenStream *sub_ts)
1329
+ {
1330
+ FrtTokenStream *ts = tf_new(FrtHyphenFilter, sub_ts);
1331
+ ts->next = &hf_next;
1332
+ ts->clone_i = &hf_clone_i;
1333
+ return ts;
1334
+ }
1335
+
1336
+ /****************************************************************************
1337
+ * LowerCaseFilter
1338
+ ****************************************************************************/
1339
+
1340
+
1341
+ static FrtToken *mb_lcf_next(FrtTokenStream *ts)
1342
+ {
1343
+ wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *wchr;
1344
+ FrtToken *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
1345
+ int x;
1346
+ wbuf[FRT_MAX_WORD_SIZE] = 0;
1347
+
1348
+ if (tk == NULL) {
1349
+ return tk;
1350
+ }
1351
+
1352
+ if ((x=mbstowcs(wbuf, tk->text, FRT_MAX_WORD_SIZE)) <= 0) return tk;
1353
+ wchr = wbuf;
1354
+ while (*wchr != 0) {
1355
+ *wchr = towlower(*wchr);
1356
+ wchr++;
1357
+ }
1358
+ tk->len = wcstombs(tk->text, wbuf, FRT_MAX_WORD_SIZE);
1359
+ if (tk->len <= 0) {
1360
+ strcpy(tk->text, "BAD_DATA");
1361
+ tk->len = 8;
1362
+ }
1363
+ tk->text[tk->len] = '\0';
1364
+ return tk;
1365
+ }
1366
+
1367
+ FrtTokenStream *frt_mb_lowercase_filter_new(FrtTokenStream *sub_ts)
1368
+ {
1369
+ FrtTokenStream *ts = tf_new(FrtTokenFilter, sub_ts);
1370
+ ts->next = &mb_lcf_next;
1371
+ return ts;
1372
+ }
1373
+
1374
+ static FrtToken *lcf_next(FrtTokenStream *ts)
1375
+ {
1376
+ int i = 0;
1377
+ FrtToken *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
1378
+ if (tk == NULL) {
1379
+ return tk;
1380
+ }
1381
+ while (tk->text[i] != '\0') {
1382
+ tk->text[i] = tolower(tk->text[i]);
1383
+ i++;
1384
+ }
1385
+ return tk;
1386
+ }
1387
+
1388
+ FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *sub_ts)
1389
+ {
1390
+ FrtTokenStream *ts = tf_new(FrtTokenFilter, sub_ts);
1391
+ ts->next = &lcf_next;
1392
+ return ts;
1393
+ }
1394
+
1395
+ /****************************************************************************
1396
+ * FrtStemFilter
1397
+ ****************************************************************************/
1398
+
1399
+ #define StemFilt(filter) ((FrtStemFilter *)(filter))
1400
+
1401
+ static void stemf_destroy_i(FrtTokenStream *ts)
1402
+ {
1403
+ sb_stemmer_delete(StemFilt(ts)->stemmer);
1404
+ free(StemFilt(ts)->algorithm);
1405
+ free(StemFilt(ts)->charenc);
1406
+ filter_destroy_i(ts);
1407
+ }
1408
+
1409
+ static FrtToken *stemf_next(FrtTokenStream *ts)
1410
+ {
1411
+ int len;
1412
+ const sb_symbol *stemmed;
1413
+ struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
1414
+ FrtTokenFilter *tf = TkFilt(ts);
1415
+ FrtToken *tk = tf->sub_ts->next(tf->sub_ts);
1416
+ if (tk == NULL) {
1417
+ return tk;
1418
+ }
1419
+ stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
1420
+ len = sb_stemmer_length(stemmer);
1421
+ if (len >= FRT_MAX_WORD_SIZE) {
1422
+ len = FRT_MAX_WORD_SIZE - 1;
1423
+ }
1424
+
1425
+ memcpy(tk->text, stemmed, len);
1426
+ tk->text[len] = '\0';
1427
+ tk->len = len;
1428
+ return tk;
1429
+ }
1430
+
1431
+ static FrtTokenStream *stemf_clone_i(FrtTokenStream *orig_ts)
1432
+ {
1433
+ FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtStemFilter));
1434
+ FrtStemFilter *stemf = StemFilt(new_ts);
1435
+ FrtStemFilter *orig_stemf = StemFilt(orig_ts);
1436
+ stemf->stemmer =
1437
+ sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
1438
+ stemf->algorithm =
1439
+ orig_stemf->algorithm ? frt_estrdup(orig_stemf->algorithm) : NULL;
1440
+ stemf->charenc =
1441
+ orig_stemf->charenc ? frt_estrdup(orig_stemf->charenc) : NULL;
1442
+ return new_ts;
1443
+ }
1444
+
1445
+ FrtTokenStream *frt_stem_filter_new(FrtTokenStream *ts, const char *algorithm,
1446
+ const char *charenc)
1447
+ {
1448
+ FrtTokenStream *tf = tf_new(FrtStemFilter, ts);
1449
+ char *my_algorithm = NULL;
1450
+ char *my_charenc = NULL;
1451
+ char *s = NULL;
1452
+
1453
+ if (algorithm) {
1454
+ my_algorithm = frt_estrdup(algorithm);
1455
+
1456
+ /* algorithms are lowercase */
1457
+ s = my_algorithm;
1458
+ while (*s) {
1459
+ *s = tolower(*s);
1460
+ s++;
1461
+ }
1462
+ StemFilt(tf)->algorithm = my_algorithm;
1463
+ }
1464
+
1465
+ if (charenc) {
1466
+ my_charenc = frt_estrdup(charenc);
1467
+
1468
+ /* encodings are uppercase and use '_' instead of '-' */
1469
+ s = my_charenc;
1470
+ while (*s) {
1471
+ *s = (*s == '-') ? '_' : toupper(*s);
1472
+ s++;
1473
+ }
1474
+ StemFilt(tf)->charenc = my_charenc;
1475
+ }
1476
+
1477
+ StemFilt(tf)->stemmer = sb_stemmer_new(my_algorithm, my_charenc);
1478
+
1479
+ tf->next = &stemf_next;
1480
+ tf->destroy_i = &stemf_destroy_i;
1481
+ tf->clone_i = &stemf_clone_i;
1482
+ return tf;
1483
+ }
1484
+
1485
+ /****************************************************************************
1486
+ *
1487
+ * Analyzers
1488
+ *
1489
+ ****************************************************************************/
1490
+
1491
+ /****************************************************************************
1492
+ * Standard
1493
+ ****************************************************************************/
1494
+
1495
+ FrtAnalyzer *frt_standard_analyzer_new_with_words_len(const char **words, int len,
1496
+ bool lowercase)
1497
+ {
1498
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
1499
+ if (lowercase) {
1500
+ ts = frt_lowercase_filter_new(ts);
1501
+ }
1502
+ ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words_len(ts, words, len));
1503
+ return frt_analyzer_new(ts, NULL, NULL);
1504
+ }
1505
+
1506
+ FrtAnalyzer *frt_standard_analyzer_new_with_words(const char **words,
1507
+ bool lowercase)
1508
+ {
1509
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
1510
+ if (lowercase) {
1511
+ ts = frt_lowercase_filter_new(ts);
1512
+ }
1513
+ ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1514
+ return frt_analyzer_new(ts, NULL, NULL);
1515
+ }
1516
+
1517
+ FrtAnalyzer *frt_mb_standard_analyzer_new_with_words(const char **words,
1518
+ bool lowercase)
1519
+ {
1520
+ FrtTokenStream *ts = frt_mb_standard_tokenizer_new();
1521
+ if (lowercase) {
1522
+ ts = frt_mb_lowercase_filter_new(ts);
1523
+ }
1524
+ ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1525
+ return frt_analyzer_new(ts, NULL, NULL);
1526
+ }
1527
+
1528
+ FrtAnalyzer *frt_utf8_standard_analyzer_new_with_words(const char **words,
1529
+ bool lowercase)
1530
+ {
1531
+ FrtTokenStream *ts = frt_utf8_standard_tokenizer_new();
1532
+ if (lowercase) {
1533
+ ts = frt_mb_lowercase_filter_new(ts);
1534
+ }
1535
+ ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1536
+ return frt_analyzer_new(ts, NULL, NULL);
1537
+ }
1538
+
1539
+ FrtAnalyzer *frt_standard_analyzer_new(bool lowercase)
1540
+ {
1541
+ return frt_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1542
+ lowercase);
1543
+ }
1544
+
1545
+ FrtAnalyzer *frt_mb_standard_analyzer_new(bool lowercase)
1546
+ {
1547
+ return frt_mb_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1548
+ lowercase);
1549
+ }
1550
+
1551
+ FrtAnalyzer *frt_utf8_standard_analyzer_new(bool lowercase)
1552
+ {
1553
+ return frt_utf8_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1554
+ lowercase);
1555
+ }
1556
+
1557
+ /****************************************************************************
1558
+ * Legacy
1559
+ ****************************************************************************/
1560
+
1561
+ FrtAnalyzer *frt_legacy_standard_analyzer_new_with_words(const char **words,
1562
+ bool lowercase)
1563
+ {
1564
+ FrtTokenStream *ts = frt_legacy_standard_tokenizer_new();
1565
+ if (lowercase) {
1566
+ ts = frt_lowercase_filter_new(ts);
1567
+ }
1568
+ ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1569
+ return frt_analyzer_new(ts, NULL, NULL);
1570
+ }
1571
+
1572
+ FrtAnalyzer *frt_mb_legacy_standard_analyzer_new_with_words(const char **words,
1573
+ bool lowercase)
1574
+ {
1575
+ FrtTokenStream *ts = frt_mb_legacy_standard_tokenizer_new();
1576
+ if (lowercase) {
1577
+ ts = frt_mb_lowercase_filter_new(ts);
1578
+ }
1579
+ ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1580
+ return frt_analyzer_new(ts, NULL, NULL);
1581
+ }
1582
+
1583
+ FrtAnalyzer *frt_legacy_standard_analyzer_new(bool lowercase)
1584
+ {
1585
+ return frt_legacy_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1586
+ lowercase);
1587
+ }
1588
+
1589
+ FrtAnalyzer *frt_mb_legacy_standard_analyzer_new(bool lowercase)
1590
+ {
1591
+ return frt_mb_legacy_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1592
+ lowercase);
1593
+ }
1594
+
1595
+ /****************************************************************************
1596
+ *
1597
+ * PerFieldAnalyzer
1598
+ *
1599
+ ****************************************************************************/
1600
+
1601
+ static void pfa_destroy_i(FrtAnalyzer *self)
1602
+ {
1603
+ frt_h_destroy(PFA(self)->dict);
1604
+
1605
+ frt_a_deref(PFA(self)->default_a);
1606
+ free(self);
1607
+ }
1608
+
1609
+ static FrtTokenStream *pfa_get_ts(FrtAnalyzer *self,
1610
+ FrtSymbol field, char *text)
1611
+ {
1612
+ FrtAnalyzer *a = (FrtAnalyzer *)frt_h_get(PFA(self)->dict, (void *)field);
1613
+ if (a == NULL) {
1614
+ a = PFA(self)->default_a;
1615
+ }
1616
+ return frt_a_get_ts(a, field, text);
1617
+ }
1618
+
1619
+ static void pfa_sub_a_destroy_i(void *p)
1620
+ {
1621
+ FrtAnalyzer *a = (FrtAnalyzer *) p;
1622
+ frt_a_deref(a);
1623
+ }
1624
+
1625
+ void frt_pfa_add_field(FrtAnalyzer *self,
1626
+ FrtSymbol field,
1627
+ FrtAnalyzer *analyzer)
1628
+ {
1629
+ frt_h_set(PFA(self)->dict, (void *)field, analyzer);
1630
+ }
1631
+
1632
+ FrtAnalyzer *frt_per_field_analyzer_new(FrtAnalyzer *default_a)
1633
+ {
1634
+ FrtAnalyzer *a = (FrtAnalyzer *)frt_ecalloc(sizeof(FrtPerFieldAnalyzer));
1635
+
1636
+ PFA(a)->default_a = default_a;
1637
+ PFA(a)->dict = frt_h_new_ptr(&pfa_sub_a_destroy_i);
1638
+
1639
+ a->destroy_i = &pfa_destroy_i;
1640
+ a->get_ts = pfa_get_ts;
1641
+ a->ref_cnt = 1;
1642
+
1643
+ return a;
1644
+ }