ferret 0.11.6 → 0.11.8.4

Files changed (185)
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
data/ext/{utilities.c → STEMMER_utilities.c}:

@@ -25,8 +25,8 @@ extern void lose_s(symbol * p) {
  }
 
  /*
- new_p = X_skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
- if n +ve, or n characters backwards from p +c - 1 if n -ve. new_p is the new
+ new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
+ if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
  position, or 0 on failure.
 
  -- used to implement hop and next in the utf8 case.
@@ -76,7 +76,7 @@ static int get_utf8(const symbol * p, int c, int l, int * slot) {
  if (b0 < 0xE0 || c == l) { /* 1110 0000 */
  * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
  }
- * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3;
+ * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
  }
 
  static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
@@ -90,94 +90,126 @@ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
  if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
  * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
  }
- * slot = (*p & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
+ * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
  }
 
- extern int in_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- int w = get_utf8(z->p, z->c, z->l, & ch);
- unless (w) return 0;
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c += w; return 1;
+ extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ int w = get_utf8(z->p, z->c, z->l, & ch);
+ unless (w) return -1;
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return w;
+ z->c += w;
+ } while (repeat);
+ return 0;
  }
 
- extern int in_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- int w = get_b_utf8(z->p, z->c, z->lb, & ch);
- unless (w) return 0;
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c -= w; return 1;
+ extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+ unless (w) return -1;
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return w;
+ z->c -= w;
+ } while (repeat);
+ return 0;
  }
 
- extern int out_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- int w = get_utf8(z->p, z->c, z->l, & ch);
- unless (w) return 0;
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c += w; return 1;
+ extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ int w = get_utf8(z->p, z->c, z->l, & ch);
+ unless (w) return -1;
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return w;
+ z->c += w;
+ } while (repeat);
+ return 0;
  }
 
- extern int out_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- int w = get_b_utf8(z->p, z->c, z->lb, & ch);
- unless (w) return 0;
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c -= w; return 1;
+ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+ unless (w) return -1;
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return w;
+ z->c -= w;
+ } while (repeat);
+ return 0;
  }
 
  /* Code for character groupings: non-utf8 cases */
 
- extern int in_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- if (z->c >= z->l) return 0;
- ch = z->p[z->c];
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c++; return 1;
+ extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ if (z->c >= z->l) return -1;
+ ch = z->p[z->c];
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c++;
+ } while (repeat);
+ return 0;
  }
 
- extern int in_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- if (z->c <= z->lb) return 0;
- ch = z->p[z->c - 1];
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c--; return 1;
+ extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ if (z->c <= z->lb) return -1;
+ ch = z->p[z->c - 1];
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c--;
+ } while (repeat);
+ return 0;
  }
 
- extern int out_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- if (z->c >= z->l) return 0;
- ch = z->p[z->c];
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c++; return 1;
+ extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ if (z->c >= z->l) return -1;
+ ch = z->p[z->c];
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c++;
+ } while (repeat);
+ return 0;
  }
 
- extern int out_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
- int ch;
- if (z->c <= z->lb) return 0;
- ch = z->p[z->c - 1];
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
- z->c--; return 1;
+ extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+ do {
+ int ch;
+ if (z->c <= z->lb) return -1;
+ ch = z->p[z->c - 1];
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c--;
+ } while (repeat);
+ return 0;
  }
 
- extern int eq_s(struct SN_env * z, int s_size, symbol * s) {
+ extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
  if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
  z->c += s_size; return 1;
  }
 
- extern int eq_s_b(struct SN_env * z, int s_size, symbol * s) {
+ extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
  if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
  z->c -= s_size; return 1;
  }
 
- extern int eq_v(struct SN_env * z, symbol * p) {
+ extern int eq_v(struct SN_env * z, const symbol * p) {
  return eq_s(z, SIZE(p), p);
  }
 
- extern int eq_v_b(struct SN_env * z, symbol * p) {
+ extern int eq_v_b(struct SN_env * z, const symbol * p) {
  return eq_s_b(z, SIZE(p), p);
  }
 
- extern int find_among(struct SN_env * z, struct among * v, int v_size) {
+ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
 
  int i = 0;
  int j = v_size;
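
Illustrative sketch (not part of the upstream diff): the grouping helpers above change both signature and contract. Each now takes a repeat flag and returns -1 when the cursor is already at the limit, the width of the first character that fails the grouping test, or 0 after a single successful step. A minimal, hedged caller written against the declarations in this tree's header.h/api.h; skip_grouping_run is a hypothetical name:

    /* Consume a run of grouping members (e.g. a stemmer's vowel table s,
     * whose characters lie in [min, max]) using repeat = 1.  Returns 0 if
     * the run ended at a non-member character, -1 if the end of the string
     * was reached first. */
    static int skip_grouping_run(struct SN_env * z, const unsigned char * s,
                                 int min, int max)
    {
        int ret = in_grouping_U(z, s, min, max, 1);
        return (ret > 0) ? 0 : ret;   /* ret > 0 is the width of the stopper */
    }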
@@ -185,7 +217,7 @@ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
  int c = z->c; int l = z->l;
  symbol * q = z->p + c;
 
- struct among * w;
+ const struct among * w;
 
  int common_i = 0;
  int common_j = 0;
@@ -198,9 +230,9 @@ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
  int common = common_i < common_j ? common_i : common_j; /* smaller */
  w = v + k;
  {
- int i; for (i = common; i < w->s_size; i++) {
+ int i2; for (i2 = common; i2 < w->s_size; i2++) {
  if (c + common == l) { diff = -1; break; }
- diff = q[common] - w->s[i];
+ diff = q[common] - w->s[i2];
  if (diff != 0) break;
  common++;
  }
@@ -237,7 +269,7 @@ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
 
  /* find_among_b is for backwards processing. Same comments apply */
 
- extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
+ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
 
  int i = 0;
  int j = v_size;
@@ -245,7 +277,7 @@ extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
  int c = z->c; int lb = z->lb;
  symbol * q = z->p + c - 1;
 
- struct among * w;
+ const struct among * w;
 
  int common_i = 0;
  int common_j = 0;
@@ -258,9 +290,9 @@ extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
  int common = common_i < common_j ? common_i : common_j;
  w = v + k;
  {
- int i; for (i = w->s_size - 1 - common; i >= 0; i--) {
+ int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
  if (c - common == lb) { diff = -1; break; }
- diff = q[- common] - w->s[i];
+ diff = q[- common] - w->s[i2];
  if (diff != 0) break;
  common++;
  }
@@ -362,12 +394,12 @@ static int slice_check(struct SN_env * z) {
  return 0;
  }
 
- extern int slice_from_s(struct SN_env * z, int s_size, symbol * s) {
+ extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
  if (slice_check(z)) return -1;
  return replace_s(z, z->bra, z->ket, s_size, s, NULL);
  }
 
- extern int slice_from_v(struct SN_env * z, symbol * p) {
+ extern int slice_from_v(struct SN_env * z, const symbol * p) {
  return slice_from_s(z, SIZE(p), p);
  }
 
@@ -375,7 +407,7 @@ extern int slice_del(struct SN_env * z) {
  return slice_from_s(z, 0, 0);
  }
 
- extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s) {
+ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
  int adjustment;
  if (replace_s(z, bra, ket, s_size, s, &adjustment))
  return -1;
@@ -384,7 +416,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s)
  return 0;
  }
 
- extern int insert_v(struct SN_env * z, int bra, int ket, symbol * p) {
+ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
  int adjustment;
  if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
  return -1;
data/ext/analysis.c:

@@ -5,6 +5,8 @@
  #include <ctype.h>
  #include <wctype.h>
  #include <wchar.h>
+ #include "internal.h"
+ #include "scanner.h"
 
  /****************************************************************************
  *
@@ -27,8 +29,8 @@ INLINE Token *tk_set(Token *tk,
  return tk;
  }
 
- INLINE Token *tk_set_ts(Token *tk,
- char *start, char *end, char *text, int pos_inc)
+ static INLINE Token *tk_set_ts(Token *tk, char *start, char *end,
+ char *text, int pos_inc)
  {
  return tk_set(tk, start, (int)(end - start),
  (off_t)(start - text), (off_t)(end - text), pos_inc);
@@ -40,8 +42,8 @@ INLINE Token *tk_set_no_len(Token *tk,
  return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
  }
 
- INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start, off_t end,
- int pos_inc)
+ static INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start,
+ off_t end, int pos_inc)
  {
  int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
  tk->text[len] = '\0';
@@ -121,7 +123,7 @@ TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
 
  TokenStream *ts_new_i(size_t size)
  {
- TokenStream *ts = ecalloc(size);
+ TokenStream *ts = (TokenStream *)ecalloc(size);
 
  ts->destroy_i = (void (*)(TokenStream *))&free;
  ts->reset = &ts_reset;
@@ -152,7 +154,7 @@ static TokenStream *cts_new()
 
  #define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
 
- INLINE int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
+ static INLINE int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
  {
  int num_bytes;
  if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
@@ -180,7 +182,7 @@ static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
  return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
  }
 
- TokenStream *mb_ts_new()
+ static TokenStream *mb_ts_new()
  {
  TokenStream *ts = ts_new(MultiByteTokenStream);
  ts->reset = &mb_ts_reset;
@@ -210,7 +212,9 @@ static void a_standard_destroy_i(Analyzer *a)
  free(a);
  }
 
- static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+ static TokenStream *a_standard_get_ts(Analyzer *a,
+ Symbol field,
+ char *text)
  {
  TokenStream *ts;
  (void)field;
@@ -220,7 +224,8 @@ static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
 
  Analyzer *analyzer_new(TokenStream *ts,
  void (*destroy_i)(Analyzer *a),
- TokenStream *(*get_ts)(Analyzer *a, char *field,
+ TokenStream *(*get_ts)(Analyzer *a,
+ Symbol field,
  char *text))
  {
  Analyzer *a = ALLOC(Analyzer);
@@ -414,7 +419,7 @@ Analyzer *mb_whitespace_analyzer_new(bool lowercase)
  /*
  * LetterTokenizer
  */
- Token *lt_next(TokenStream *ts)
+ static Token *lt_next(TokenStream *ts)
  {
  char *start;
  char *t = ts->t;
@@ -446,7 +451,7 @@ TokenStream *letter_tokenizer_new()
  /*
  * Multi-byte LetterTokenizer
  */
- Token *mb_lt_next(TokenStream *ts)
+ static Token *mb_lt_next(TokenStream *ts)
  {
  int i;
  char *start;
@@ -478,7 +483,7 @@ Token *mb_lt_next(TokenStream *ts)
  /*
  * Lowercasing Multi-byte LetterTokenizer
  */
- Token *mb_lt_next_lc(TokenStream *ts)
+ static Token *mb_lt_next_lc(TokenStream *ts)
  {
  int i;
  char *start;
@@ -554,43 +559,88 @@ Analyzer *mb_letter_analyzer_new(bool lowercase)
  /*
  * StandardTokenizer
  */
- static int std_get_alpha(TokenStream *ts, char *token)
+ static Token *std_next(TokenStream *ts)
  {
- int i = 0;
- char *t = ts->t;
- while (t[i] != '\0' && isalnum(t[i])) {
- if (i < MAX_WORD_SIZE) {
- token[i] = t[i];
- }
- i++;
+ StandardTokenizer *std_tz = STDTS(ts);
+ const char *start = NULL;
+ const char *end = NULL;
+ int len;
+ Token *tk = &(CTS(ts)->token);
+
+ switch (std_tz->type) {
+ case STT_ASCII:
+ frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1,
+ &start, &end, &len);
+ break;
+ case STT_MB:
+ frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1,
+ &start, &end, &len);
+ break;
+ case STT_UTF8:
+ frt_std_scan_utf8(ts->t, tk->text, sizeof(tk->text) - 1,
+ &start, &end, &len);
+ break;
  }
- return i;
+
+ if (len == 0)
+ return NULL;
+
+ ts->t = (char *)end;
+ tk->len = len;
+ tk->start = start - ts->text;
+ tk->end = end - ts->text;
+ tk->pos_inc = 1;
+ return &(CTS(ts)->token);
  }
 
- static int mb_std_get_alpha(TokenStream *ts, char *token)
+ static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
  {
- char *t = ts->t;
- wchar_t wchr;
- int i;
- mbstate_t state; ZEROSET(&state, mbstate_t);
+ return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
+ }
 
- i = mb_next_char(&wchr, t, &state);
+ static TokenStream *std_ts_new()
+ {
+ TokenStream *ts = ts_new(StandardTokenizer);
 
- while (wchr != 0 && iswalnum(wchr)) {
- t += i;
- i = mb_next_char(&wchr, t, &state);
- }
+ ts->clone_i = &std_ts_clone_i;
+ ts->next = &std_next;
 
- i = (int)(t - ts->t);
- if (i > MAX_WORD_SIZE) {
- i = MAX_WORD_SIZE - 1;
- }
- memcpy(token, ts->t, i);
- return i;
+ return ts;
+ }
+
+ TokenStream *standard_tokenizer_new()
+ {
+ TokenStream *ts = std_ts_new();
+ STDTS(ts)->type = STT_ASCII;
+ return ts;
+ }
+
+ TokenStream *mb_standard_tokenizer_new()
+ {
+ TokenStream *ts = std_ts_new();
+ STDTS(ts)->type = STT_MB;
+ return ts;
  }
 
+ TokenStream *utf8_standard_tokenizer_new()
+ {
+ TokenStream *ts = std_ts_new();
+ STDTS(ts)->type = STT_UTF8;
+ return ts;
+ }
+
+ /****************************************************************************
+ *
+ * LegacyStandard
+ *
+ ****************************************************************************/
+
+ #define LSTDTS(token_stream) ((LegacyStandardTokenizer *)(token_stream))
+
  /*
- static int std_get_alnum(TokenStream *ts, char *token)
+ * LegacyStandardTokenizer
+ */
+ static int legacy_std_get_alpha(TokenStream *ts, char *token)
  {
  int i = 0;
  char *t = ts->t;
@@ -603,7 +653,7 @@ static int std_get_alnum(TokenStream *ts, char *token)
  return i;
  }
 
- static int mb_std_get_alnum(TokenStream *ts, char *token)
+ static int mb_legacy_std_get_alpha(TokenStream *ts, char *token)
  {
  char *t = ts->t;
  wchar_t wchr;
@@ -624,7 +674,6 @@ static int mb_std_get_alnum(TokenStream *ts, char *token)
  memcpy(token, ts->t, i);
  return i;
  }
- */
 
  static int isnumpunc(char c)
  {
@@ -659,7 +708,7 @@ static int isurlxatc(char c)
  || isalnum(c));
  }
 
- static bool std_is_tok_char(char *c)
+ static bool legacy_std_is_tok_char(char *c)
  {
  if (isspace(*c)) {
  return false; /* most common so check first. */
@@ -671,11 +720,11 @@ static bool std_is_tok_char(char *c)
  return false;
  }
 
- static bool mb_std_is_tok_char(char *t)
+ static bool mb_legacy_std_is_tok_char(char *t)
  {
  wchar_t c;
  mbstate_t state; ZEROSET(&state, mbstate_t);
-
+
  if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
  /* error which we can handle next time round. For now just return
  * false so that we can return a token */
@@ -696,7 +745,7 @@ static bool mb_std_is_tok_char(char *t)
  * (alnum) = [a-zA-Z0-9]
  * (punc) = [_\/.,-]
  */
- static int std_get_number(char *input)
+ static int legacy_std_get_number(char *input)
  {
  int i = 0;
  int count = 0;
@@ -732,7 +781,7 @@ static int std_get_number(char *input)
  }
  }
 
- static int std_get_apostrophe(char *input)
+ static int legacy_std_get_apostrophe(char *input)
  {
  char *t = input;
 
@@ -743,7 +792,7 @@ static int std_get_apostrophe(char *input)
  return (int)(t - input);
  }
 
- static int mb_std_get_apostrophe(char *input)
+ static int mb_legacy_std_get_apostrophe(char *input)
  {
  char *t = input;
  wchar_t wchr;
@@ -759,8 +808,9 @@ static int mb_std_get_apostrophe(char *input)
  return (int)(t - input);
  }
 
- static int std_get_url(char *input, char *token, int i)
+ static char *std_get_url(char *input, char *token, int i, int *len)
  {
+ char *next = NULL;
  while (isurlc(input[i])) {
  if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
  break; /* can't have two puncs in a row */
@@ -770,18 +820,26 @@ static int std_get_url(char *input, char *token, int i)
  }
  i++;
  }
+ next = input + i;
+
+ /* We don't want to index past the end of the token capacity) */
+ if (i >= MAX_WORD_SIZE) {
+ i = MAX_WORD_SIZE - 1;
+ }
 
  /* strip trailing puncs */
  while (isurlpunc(input[i - 1])) {
  i--;
  }
+ *len = i;
+ token[i] = '\0';
 
- return i;
+ return next;
  }
 
  /* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
  */
- static int std_get_company_name(char *input)
+ static int legacy_std_get_company_name(char *input)
  {
  int i = 0;
  while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
@@ -791,25 +849,7 @@ static int std_get_company_name(char *input)
  return i;
  }
 
- /*
- static int mb_std_get_company_name(char *input, TokenStream *ts)
- {
- char *t = input;
- wchar_t wchr;
- int i;
- mbstate_t state; ZEROSET(&state, mbstate_t);
-
- i = mb_next_char(&wchr, t, &state);
- while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
- t += i;
- i = mb_next_char(&wchr, t, &state);
- }
-
- return (int)(t - input);
- }
- */
-
- static bool std_advance_to_start(TokenStream *ts)
+ static bool legacy_std_advance_to_start(TokenStream *ts)
  {
  char *t = ts->t;
  while (*t != '\0' && !isalnum(*t)) {
@@ -822,7 +862,7 @@ static bool std_advance_to_start(TokenStream *ts)
  return (*t != '\0');
  }
 
- static bool mb_std_advance_to_start(TokenStream *ts)
+ static bool mb_legacy_std_advance_to_start(TokenStream *ts)
  {
  int i;
  wchar_t wchr;
@@ -839,9 +879,9 @@ static bool mb_std_advance_to_start(TokenStream *ts)
  return (wchr != 0);
  }
 
- static Token *std_next(TokenStream *ts)
+ static Token *legacy_std_next(TokenStream *ts)
  {
- StandardTokenizer *std_tz = STDTS(ts);
+ LegacyStandardTokenizer *std_tz = LSTDTS(ts);
  char *s;
  char *t;
  char *start = NULL;
@@ -890,13 +930,13 @@ static Token *std_next(TokenStream *ts)
  }
 
  if (*t == '&') { /* apostrophe case. */
- t += std_get_company_name(t);
+ t += legacy_std_get_company_name(t);
  ts->t = t;
  return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
  }
 
- if ((isdigit(*t) || isnumpunc(*t)) /* possibly a number */
- && (len = std_get_number(t) > 0)) {
+ if ((isdigit(*start) || isnumpunc(*start)) /* possibly a number */
+ && ((len = legacy_std_get_number(start)) > 0)) {
  num_end = start + len;
  if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
  ts->t = num_end;
@@ -909,6 +949,7 @@ static Token *std_next(TokenStream *ts)
  /* check for a known url start */
  token[token_i] = '\0';
  t += 3;
+ token_i += 3;
  while (*t == '/') {
  t++;
  }
@@ -917,17 +958,16 @@ static Token *std_next(TokenStream *ts)
  memcmp(token, "http", 4) == 0 ||
  memcmp(token, "https", 5) == 0 ||
  memcmp(token, "file", 4) == 0)) {
- len = std_get_url(t, token, 0); /* dispose of first part of the URL */
+ ts->t = std_get_url(t, token, 0, &len); /* dispose of first part of the URL */
  }
  else { /* still treat as url but keep the first part */
  token_i = (int)(t - start);
  memcpy(token, start, token_i * sizeof(char));
- len = token_i + std_get_url(t, token, token_i); /* keep start */
+ ts->t = std_get_url(start, token, token_i, &len); /* keep start */
  }
- ts->t = t + len;
- token[len] = 0;
- return tk_set(&(CTS(ts)->token), token, len, (off_t)(start - ts->text),
- (off_t)(ts->t - ts->text), 1);
+ return tk_set(&(CTS(ts)->token), token, len,
+ (off_t)(start - ts->text),
+ (off_t)(ts->t - ts->text), 1);
  }
 
  /* now see how long a url we can find. */
@@ -989,41 +1029,41 @@ static Token *std_next(TokenStream *ts)
  return &(CTS(ts)->token);
  }
 
- static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
+ static TokenStream *legacy_std_ts_clone_i(TokenStream *orig_ts)
  {
- return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
+ return ts_clone_size(orig_ts, sizeof(LegacyStandardTokenizer));
  }
 
- static TokenStream *std_ts_new()
+ static TokenStream *legacy_std_ts_new()
  {
- TokenStream *ts = ts_new(StandardTokenizer);
+ TokenStream *ts = ts_new(LegacyStandardTokenizer);
 
- ts->clone_i = &std_ts_clone_i;
- ts->next = &std_next;
+ ts->clone_i = &legacy_std_ts_clone_i;
+ ts->next = &legacy_std_next;
 
  return ts;
  }
 
- TokenStream *standard_tokenizer_new()
+ TokenStream *legacy_standard_tokenizer_new()
  {
- TokenStream *ts = std_ts_new();
+ TokenStream *ts = legacy_std_ts_new();
 
- STDTS(ts)->advance_to_start = &std_advance_to_start;
- STDTS(ts)->get_alpha = &std_get_alpha;
- STDTS(ts)->is_tok_char = &std_is_tok_char;
- STDTS(ts)->get_apostrophe = &std_get_apostrophe;
+ LSTDTS(ts)->advance_to_start = &legacy_std_advance_to_start;
+ LSTDTS(ts)->get_alpha = &legacy_std_get_alpha;
+ LSTDTS(ts)->is_tok_char = &legacy_std_is_tok_char;
+ LSTDTS(ts)->get_apostrophe = &legacy_std_get_apostrophe;
 
  return ts;
  }
 
- TokenStream *mb_standard_tokenizer_new()
+ TokenStream *mb_legacy_standard_tokenizer_new()
  {
- TokenStream *ts = std_ts_new();
+ TokenStream *ts = legacy_std_ts_new();
 
- STDTS(ts)->advance_to_start = &mb_std_advance_to_start;
- STDTS(ts)->get_alpha = &mb_std_get_alpha;
- STDTS(ts)->is_tok_char = &mb_std_is_tok_char;
- STDTS(ts)->get_apostrophe = &mb_std_get_apostrophe;
+ LSTDTS(ts)->advance_to_start = &mb_legacy_std_advance_to_start;
+ LSTDTS(ts)->get_alpha = &mb_legacy_std_get_alpha;
+ LSTDTS(ts)->is_tok_char = &mb_legacy_std_is_tok_char;
+ LSTDTS(ts)->get_apostrophe = &mb_legacy_std_get_apostrophe;
 
  return ts;
  }
@@ -1060,7 +1100,6 @@ static void filter_destroy_i(TokenStream *ts)
  free(ts);
  }
 
- #define tf_new(type, sub) tf_new_i(sizeof(type), sub)
  TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
  {
  TokenStream *ts = (TokenStream *)ecalloc(size);
@@ -1097,7 +1136,7 @@ static TokenStream *sf_clone_i(TokenStream *orig_ts)
  static Token *sf_next(TokenStream *ts)
  {
  int pos_inc = 0;
- HashTable *words = StopFilt(ts)->words;
+ Hash *words = StopFilt(ts)->words;
  TokenFilter *tf = TkFilt(ts);
  Token *tk = tf->sub_ts->next(tf->sub_ts);
 
@@ -1118,7 +1157,7 @@ TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
  {
  int i;
  char *word;
- HashTable *word_table = h_new_str(&free, (free_ft) NULL);
+ Hash *word_table = h_new_str(&free, (free_ft) NULL);
  TokenStream *ts = tf_new(StopFilter, sub_ts);
 
  for (i = 0; i < len; i++) {
@@ -1136,7 +1175,7 @@ TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
  const char **words)
  {
  char *word;
- HashTable *word_table = h_new_str(&free, (free_ft) NULL);
+ Hash *word_table = h_new_str(&free, (free_ft) NULL);
  TokenStream *ts = tf_new(StopFilter, sub_ts);
 
  while (*words) {
@@ -1234,7 +1273,7 @@ static Token *hf_next(TokenStream *ts)
  HyphenFilter *hf = HyphenFilt(ts);
  TokenFilter *tf = TkFilt(ts);
  Token *tk = hf->tk;
-
+
  if (hf->pos < hf->len) {
  const int pos = hf->pos;
  const int text_len = strlen(hf->text + pos);
@@ -1301,7 +1340,7 @@ TokenStream *hyphen_filter_new(TokenStream *sub_ts)
  ****************************************************************************/
 
 
- Token *mb_lcf_next(TokenStream *ts)
+ static Token *mb_lcf_next(TokenStream *ts)
  {
  wchar_t wbuf[MAX_WORD_SIZE + 1], *wchr;
  Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
@@ -1334,7 +1373,7 @@ TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
  return ts;
  }
 
- Token *lcf_next(TokenStream *ts)
+ static Token *lcf_next(TokenStream *ts)
  {
  int i = 0;
  Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
@@ -1361,7 +1400,7 @@ TokenStream *lowercase_filter_new(TokenStream *sub_ts)
 
  #define StemFilt(filter) ((StemFilter *)(filter))
 
- void stemf_destroy_i(TokenStream *ts)
+ static void stemf_destroy_i(TokenStream *ts)
  {
  sb_stemmer_delete(StemFilt(ts)->stemmer);
  free(StemFilt(ts)->algorithm);
@@ -1369,7 +1408,7 @@ void stemf_destroy_i(TokenStream *ts)
  filter_destroy_i(ts);
  }
 
- Token *stemf_next(TokenStream *ts)
+ static Token *stemf_next(TokenStream *ts)
  {
  int len;
  const sb_symbol *stemmed;
@@ -1391,7 +1430,7 @@ Token *stemf_next(TokenStream *ts)
  return tk;
  }
 
- TokenStream *stemf_clone_i(TokenStream *orig_ts)
+ static TokenStream *stemf_clone_i(TokenStream *orig_ts)
  {
  TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
  StemFilter *stemf = StemFilt(new_ts);
@@ -1409,10 +1448,35 @@ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
  const char *charenc)
  {
  TokenStream *tf = tf_new(StemFilter, ts);
+ char *my_algorithm = NULL;
+ char *my_charenc = NULL;
+ char *s = NULL;
+
+ if (algorithm) {
+ my_algorithm = estrdup(algorithm);
+
+ /* algorithms are lowercase */
+ s = my_algorithm;
+ while (*s) {
+ *s = tolower(*s);
+ s++;
+ }
+ StemFilt(tf)->algorithm = my_algorithm;
+ }
+
+ if (charenc) {
+ my_charenc = estrdup(charenc);
 
- StemFilt(tf)->stemmer = sb_stemmer_new(algorithm, charenc);
- StemFilt(tf)->algorithm = algorithm ? estrdup(algorithm) : NULL;
- StemFilt(tf)->charenc = charenc ? estrdup(charenc) : NULL;
+ /* encodings are uppercase and use '_' instead of '-' */
+ s = my_charenc;
+ while (*s) {
+ *s = (*s == '-') ? '_' : toupper(*s);
+ s++;
+ }
+ StemFilt(tf)->charenc = my_charenc;
+ }
+
+ StemFilt(tf)->stemmer = sb_stemmer_new(my_algorithm, my_charenc);
 
  tf->next = &stemf_next;
  tf->destroy_i = &stemf_destroy_i;
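
Illustrative sketch (not part of the upstream diff): with the normalisation added in this hunk, stem_filter_new maps user-facing spellings onto libstemmer's conventions itself, lowercasing the algorithm name and uppercasing/underscoring the encoding. A hedged example of the equivalence this buys, using only constructors that appear elsewhere in this file; make_english_stem_filter is a hypothetical name:

    /* Both spellings below should now reach libstemmer as "english" / "UTF_8";
     * before this change the caller had to pass those exact forms. */
    static TokenStream *make_english_stem_filter(void)
    {
        TokenStream *ts = mb_lowercase_filter_new(utf8_standard_tokenizer_new());
        return stem_filter_new(ts, "English", "UTF-8");
    }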
@@ -1474,6 +1538,28 @@ Analyzer *mb_standard_analyzer_new_with_words(const char **words,
  return analyzer_new(ts, NULL, NULL);
  }
 
+ Analyzer *utf8_standard_analyzer_new_with_words_len(const char **words,
+ int len, bool lowercase)
+ {
+ TokenStream *ts = utf8_standard_tokenizer_new();
+ if (lowercase) {
+ ts = mb_lowercase_filter_new(ts);
+ }
+ ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+ return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *utf8_standard_analyzer_new_with_words(const char **words,
+ bool lowercase)
+ {
+ TokenStream *ts = utf8_standard_tokenizer_new();
+ if (lowercase) {
+ ts = mb_lowercase_filter_new(ts);
+ }
+ ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+ return analyzer_new(ts, NULL, NULL);
+ }
+
  Analyzer *standard_analyzer_new(bool lowercase)
  {
  return standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
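
Illustrative sketch (not part of the upstream diff): the utf8_standard_analyzer_new_with_words* constructors just added mirror the existing standard/mb variants but drive the new UTF-8 scanner. A hedged usage example, modelled on the TOKENIZE test main() at the bottom of this file; the field argument is passed the way that main() passes it, printf assumes <stdio.h>, and dump_tokens is a hypothetical name:

    static void dump_tokens(char *buf)
    {
        Analyzer *a = utf8_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS, true);
        TokenStream *ts = a_get_ts(a, "content", buf);   /* field as in main() */
        Token *tk;
        while ((tk = ts->next(ts)) != NULL) {
            printf("<%s>\n", tk->text);
        }
        a_deref(a);
    }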
@@ -1486,14 +1572,79 @@ Analyzer *mb_standard_analyzer_new(bool lowercase)
  lowercase);
  }
 
+ Analyzer *utf8_standard_analyzer_new(bool lowercase)
+ {
+ return utf8_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+ lowercase);
+ }
+
+ /****************************************************************************
+ * Legacy
+ ****************************************************************************/
+
+ Analyzer *legacy_standard_analyzer_new_with_words_len(const char **words, int len,
+ bool lowercase)
+ {
+ TokenStream *ts = legacy_standard_tokenizer_new();
+ if (lowercase) {
+ ts = lowercase_filter_new(ts);
+ }
+ ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+ return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *legacy_standard_analyzer_new_with_words(const char **words,
+ bool lowercase)
+ {
+ TokenStream *ts = legacy_standard_tokenizer_new();
+ if (lowercase) {
+ ts = lowercase_filter_new(ts);
+ }
+ ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+ return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_legacy_standard_analyzer_new_with_words_len(const char **words,
+ int len, bool lowercase)
+ {
+ TokenStream *ts = mb_legacy_standard_tokenizer_new();
+ if (lowercase) {
+ ts = mb_lowercase_filter_new(ts);
+ }
+ ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
+ return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *mb_legacy_standard_analyzer_new_with_words(const char **words,
+ bool lowercase)
+ {
+ TokenStream *ts = mb_legacy_standard_tokenizer_new();
+ if (lowercase) {
+ ts = mb_lowercase_filter_new(ts);
+ }
+ ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
+ return analyzer_new(ts, NULL, NULL);
+ }
+
+ Analyzer *legacy_standard_analyzer_new(bool lowercase)
+ {
+ return legacy_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+ lowercase);
+ }
+
+ Analyzer *mb_legacy_standard_analyzer_new(bool lowercase)
+ {
+ return mb_legacy_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
+ lowercase);
+ }
+
  /****************************************************************************
  *
  * PerFieldAnalyzer
  *
  ****************************************************************************/
 
- #define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
- void pfa_destroy_i(Analyzer *self)
+ static void pfa_destroy_i(Analyzer *self)
  {
  h_destroy(PFA(self)->dict);
 
@@ -1501,24 +1652,27 @@ void pfa_destroy_i(Analyzer *self)
  free(self);
  }
 
- TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
+ static TokenStream *pfa_get_ts(Analyzer *self,
+ Symbol field, char *text)
  {
- Analyzer *a = h_get(PFA(self)->dict, field);
+ Analyzer *a = (Analyzer *)h_get(PFA(self)->dict, field);
  if (a == NULL) {
  a = PFA(self)->default_a;
  }
  return a_get_ts(a, field, text);
  }
 
- void pfa_sub_a_destroy_i(void *p)
+ static void pfa_sub_a_destroy_i(void *p)
  {
  Analyzer *a = (Analyzer *) p;
  a_deref(a);
  }
 
- void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
+ void pfa_add_field(Analyzer *self,
+ Symbol field,
+ Analyzer *analyzer)
  {
- h_set(PFA(self)->dict, estrdup(field), analyzer);
+ h_set(PFA(self)->dict, field, analyzer);
  }
 
  Analyzer *per_field_analyzer_new(Analyzer *default_a)
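
Illustrative sketch (not part of the upstream diff): pfa_add_field now keys sub-analyzers by the field Symbol itself rather than a strdup'ed string, which is why per_field_analyzer_new switches the dictionary to h_new_ptr in the next hunk. A hedged registration example; make_pfa is a hypothetical name and title_field stands in for a Symbol obtained from the new symbol.h API:

    /* Whitespace-tokenize the title field, fall back to the standard
     * analyzer for every other field. */
    static Analyzer *make_pfa(Symbol title_field)
    {
        Analyzer *pfa = per_field_analyzer_new(standard_analyzer_new(true));
        pfa_add_field(pfa, title_field, mb_whitespace_analyzer_new(false));
        return pfa;
    }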
@@ -1526,22 +1680,23 @@ Analyzer *per_field_analyzer_new(Analyzer *default_a)
  Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
 
  PFA(a)->default_a = default_a;
- PFA(a)->dict = h_new_str(&free, &pfa_sub_a_destroy_i);
+ PFA(a)->dict = h_new_ptr(&pfa_sub_a_destroy_i);
 
  a->destroy_i = &pfa_destroy_i;
  a->get_ts = pfa_get_ts;
  a->ref_cnt = 1;
-
+
  return a;
  }
 
- #ifdef ALONE
+ #ifdef TOKENIZE
  int main(int argc, char **argv)
  {
  char buf[10000];
  Analyzer *a = standard_analyzer_new(true);
  TokenStream *ts;
  Token *tk;
+ (void)argc; (void)argv;
  while (fgets(buf, 9999, stdin) != NULL) {
  ts = a_get_ts(a, "hello", buf);
  while ((tk = ts->next(ts)) != NULL) {